diff --git a/source/Makefile.Objects b/source/Makefile.Objects index 764cf0ce51..48179be83a 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -334,6 +334,8 @@ OBJS_HAMILT=hamilt_pw.o\ op_exx_pw.o\ ekinetic_pw.o\ ekinetic_op.o\ + exx_pw_ace.o\ + exx_pw_pot.o\ hpsi_norm_op.o\ veff_pw.o\ veff_op.o\ diff --git a/source/source_cell/pseudo.cpp b/source/source_cell/pseudo.cpp index 233632069d..c6dc9de2f7 100644 --- a/source/source_cell/pseudo.cpp +++ b/source/source_cell/pseudo.cpp @@ -1,5 +1,6 @@ #include "pseudo.h" #include "source_base/tool_title.h" +#include pseudo::pseudo() { diff --git a/source/source_esolver/esolver_ks_pw.cpp b/source/source_esolver/esolver_ks_pw.cpp index 1f3c23da5a..66f0ca97d1 100644 --- a/source/source_esolver/esolver_ks_pw.cpp +++ b/source/source_esolver/esolver_ks_pw.cpp @@ -589,10 +589,20 @@ void ESolver_KS_PW::iter_finish(UnitCell& ucell, const int istep, int auto start = std::chrono::high_resolution_clock::now(); exx_helper.set_firstiter(false); exx_helper.op_exx->first_iter = false; + double dexx = 0.0; + if (PARAM.inp.exx_thr_type == "energy") + { + dexx = exx_helper.cal_exx_energy(this->kspw_psi); + } exx_helper.set_psi(this->kspw_psi); + if (PARAM.inp.exx_thr_type == "energy") + { + dexx -= exx_helper.cal_exx_energy(this->kspw_psi); + // std::cout << "dexx = " << dexx << std::endl; + } + bool conv_ene = std::abs(dexx) < PARAM.inp.exx_ene_thr; - conv_esolver = exx_helper.exx_after_converge(iter); - + conv_esolver = exx_helper.exx_after_converge(iter, conv_ene); if (!conv_esolver) { auto duration = std::chrono::high_resolution_clock::now() - start; @@ -602,6 +612,7 @@ void ESolver_KS_PW::iter_finish(UnitCell& ucell, const int istep, int exx_helper.op_exx->first_iter = false; XC_Functional::set_xc_type(ucell.atoms[0].ncpp.xc_func); update_pot(ucell, istep, iter, conv_esolver); + exx_helper.iter_inc(); } } } @@ -614,8 +625,9 @@ void ESolver_KS_PW::iter_finish(UnitCell& ucell, const int istep, int //---------------------------------------------------------- // 3) Print out electronic wavefunctions in pw basis //---------------------------------------------------------- - if (iter % PARAM.inp.out_freq_elec == 0 || iter == PARAM.inp.scf_nmax || conv_esolver) + if (iter % PARAM.inp.out_freq_elec == 0 || iter == PARAM.inp.scf_nmax) { + // conv_esolver == true has already been dealt with in after_scf ModuleIO::write_wfc_pw(GlobalV::KPAR, GlobalV::MY_POOL, GlobalV::MY_RANK, diff --git a/source/source_io/module_parameter/input_parameter.h b/source/source_io/module_parameter/input_parameter.h index db2b5d5a72..520db8d100 100644 --- a/source/source_io/module_parameter/input_parameter.h +++ b/source/source_io/module_parameter/input_parameter.h @@ -654,6 +654,8 @@ struct Input_para // EXX for planewave basis, rhx0820 2025-03-10 bool exxace = true; // exxace, exact exchange for planewave basis, https://doi.org/10.1021/acs.jctc.6b00092 bool exx_gamma_extrapolation = true; // gamma point extrapolation for exx, https://doi.org/10.1103/PhysRevB.79.205114 + std::string exx_thr_type = "density"; // threshold type for exx outer loop, energy or density + double exx_ene_thr = 1e-5; // threshold for exx outer loop when exx_thr_type = energy // ==== #Parameters (23.XC external parameterization) ======== /* diff --git a/source/source_io/read_input_item_other.cpp b/source/source_io/read_input_item_other.cpp index 5694c7b33b..cdd23df1c9 100644 --- a/source/source_io/read_input_item_other.cpp +++ b/source/source_io/read_input_item_other.cpp @@ -545,6 +545,32 @@ void ReadInput::item_others() read_sync_bool(input.exx_gamma_extrapolation); this->add_item(item); } + { + Input_Item item("exx_thr_type"); + item.annotation = "threshold type for exx outer loop, energy or density"; + read_sync_string(input.exx_thr_type); + item.check_value = [](const Input_Item& item, const Parameter& para) { + std::string thr_type = para.input.exx_thr_type; + std::transform(thr_type.begin(), thr_type.end(), thr_type.begin(), ::tolower); + if (thr_type != "energy" && thr_type != "density") + { + ModuleBase::WARNING_QUIT("ReadInput", "exx_thr_type should be energy or density"); + } + }; + this->add_item(item); + } + { + Input_Item item("exx_ene_thr"); + item.annotation = "threshold for exx outer loop when exx_thr_type = energy"; + read_sync_double(input.exx_ene_thr); + item.check_value = [](const Input_Item& item, const Parameter& para) { + if (para.input.exx_ene_thr <= 0) + { + ModuleBase::WARNING_QUIT("ReadInput", "exx_ene_thr must > 0"); + } + }; + this->add_item(item); + } } } // namespace ModuleIO \ No newline at end of file diff --git a/source/source_pw/module_pwdft/CMakeLists.txt b/source/source_pw/module_pwdft/CMakeLists.txt index 03e808f6e6..81ad6ddc2c 100644 --- a/source/source_pw/module_pwdft/CMakeLists.txt +++ b/source/source_pw/module_pwdft/CMakeLists.txt @@ -10,6 +10,8 @@ list(APPEND objects operator_pw/operator_pw.cpp operator_pw/onsite_proj_pw.cpp operator_pw/op_exx_pw.cpp + operator_pw/exx_pw_ace.cpp + operator_pw/exx_pw_pot.cpp forces_nl.cpp forces_cc.cpp forces_scc.cpp diff --git a/source/source_pw/module_pwdft/module_exx_helper/exx_helper.cpp b/source/source_pw/module_pwdft/module_exx_helper/exx_helper.cpp index ffa671c42d..1b41163967 100644 --- a/source/source_pw/module_pwdft/module_exx_helper/exx_helper.cpp +++ b/source/source_pw/module_pwdft/module_exx_helper/exx_helper.cpp @@ -8,7 +8,7 @@ double Exx_Helper::cal_exx_energy(psi::Psi *psi_) } template -bool Exx_Helper::exx_after_converge(int &iter) +bool Exx_Helper::exx_after_converge(int &iter, bool ene_conv) { if (op_exx->first_iter) { @@ -18,10 +18,20 @@ bool Exx_Helper::exx_after_converge(int &iter) { return true; } - else if (iter == 1) + else if (PARAM.inp.exx_thr_type == "energy" && ene_conv) { return true; } + else if (PARAM.inp.exx_thr_type == "density" && iter == 1) + { + return true; + } + else if (iter >= PARAM.inp.exx_hybrid_step) + { + GlobalV::ofs_running << " !!EXX IS NOT CONVERGED!!" << std::endl; + std::cout << " !!EXX IS NOT CONVERGED!!" << std::endl; + return true; + } GlobalV::ofs_running << "Updating EXX and rerun SCF" << std::endl; iter = 0; return false; diff --git a/source/source_pw/module_pwdft/module_exx_helper/exx_helper.h b/source/source_pw/module_pwdft/module_exx_helper/exx_helper.h index a67afbc80e..77c6c84295 100644 --- a/source/source_pw/module_pwdft/module_exx_helper/exx_helper.h +++ b/source/source_pw/module_pwdft/module_exx_helper/exx_helper.h @@ -21,6 +21,7 @@ struct Exx_Helper void set_firstiter(bool flag = true) { first_iter = flag; } void set_wg(const ModuleBase::matrix *wg_) { wg = wg_; } void set_psi(psi::Psi *psi_); + void iter_inc() { exx_iter++; } void set_op() { @@ -29,7 +30,7 @@ struct Exx_Helper op_exx->set_wg(wg); } - bool exx_after_converge(int &iter); + bool exx_after_converge(int &iter, bool ene_conv); double cal_exx_energy(psi::Psi *psi_); @@ -37,6 +38,7 @@ struct Exx_Helper bool first_iter = false; psi::Psi *psi = nullptr; const ModuleBase::matrix *wg = nullptr; + int exx_iter = 0; }; #endif // EXX_HELPER_H diff --git a/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt b/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt index e4f2ef4f10..0a0e923ee0 100644 --- a/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt +++ b/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt @@ -6,6 +6,9 @@ list(APPEND operator_ks_pw_srcs meta_pw.cpp velocity_pw.cpp onsite_proj_pw.cpp + op_exx_pw.cpp + exx_pw_ace.cpp + exx_pw_pot.cpp ) # this library is included in module_pwdft now diff --git a/source/source_pw/module_pwdft/operator_pw/exx_pw_ace.cpp b/source/source_pw/module_pwdft/operator_pw/exx_pw_ace.cpp new file mode 100644 index 0000000000..80c4a0dc6a --- /dev/null +++ b/source/source_pw/module_pwdft/operator_pw/exx_pw_ace.cpp @@ -0,0 +1,247 @@ +#include "op_exx_pw.h" + +namespace hamilt +{ +template +void OperatorEXXPW::act_op_ace(const int nbands, + const int nbasis, + const int npol, + const T *tmpsi_in, + T *tmhpsi, + const int ngk_ik, + const bool is_first_node) const +{ + ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace"); + // std::cout << "act_op_ace" << std::endl; + // hpsi += -Xi^\dagger * Xi * psi + T* Xi_ace = Xi_ace_k[this->ik]; + int nbands_tot = psi.get_nbands(); + int nbasis_max = psi.get_nbasis(); + // T* hpsi = nullptr; + // resmem_complex_op()(hpsi, nbands_tot * nbasis); + // setmem_complex_op()(hpsi, 0, nbands_tot * nbasis); + T* Xi_psi = nullptr; + resmem_complex_op()(Xi_psi, nbands_tot * nbands); + setmem_complex_op()(Xi_psi, 0, nbands_tot * nbands); + + char trans_N = 'N', trans_T = 'T', trans_C = 'C'; + T intermediate_one = 1.0, intermediate_zero = 0.0, intermediate_minus_one = -1.0; + // Xi * psi + gemm_complex_op()(trans_N, + trans_N, + nbands_tot, + nbands, + nbasis, + &intermediate_one, + Xi_ace, + nbands_tot, + tmpsi_in, + nbasis, + &intermediate_zero, + Xi_psi, + nbands_tot + ); + + Parallel_Reduce::reduce_pool(Xi_psi, nbands_tot * nbands); + + // Xi^\dagger * (Xi * psi) + gemm_complex_op()(trans_C, + trans_N, + nbasis, + nbands, + nbands_tot, + &intermediate_minus_one, + Xi_ace, + nbands_tot, + Xi_psi, + nbands_tot, + &intermediate_one, + tmhpsi, + nbasis + ); + + + // // negative sign, add to hpsi + // vec_add_vec_complex_op()(this->ctx, nbands * nbasis, tmhpsi, hpsi, -1, tmhpsi, 1); + // delmem_complex_op()(hpsi); + delmem_complex_op()(Xi_psi); + ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace"); + +} + +template +void OperatorEXXPW::construct_ace() const +{ + // int nkb = p_exx_helper->psi.get_nbands() * p_exx_helper->psi.get_nk(); + int nbands = psi.get_nbands(); + int nbasis = psi.get_nbasis(); + int nk = psi.get_nk(); + + int ik_save = this->ik; + int * ik_ = const_cast(&this->ik); + + T intermediate_one = 1.0, intermediate_zero = 0.0; + + if (h_psi_ace == nullptr) + { + resmem_complex_op()(h_psi_ace, nbands * nbasis); + setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); + } + + if (Xi_ace_k.size() != nk) + { + Xi_ace_k.resize(nk); + for (int i = 0; i < nk; i++) + { + resmem_complex_op()(Xi_ace_k[i], nbands * nbasis); + } + } + + for (int i = 0; i < nk; i++) + { + setmem_complex_op()(Xi_ace_k[i], 0, nbands * nbasis); + } + + if (L_ace == nullptr) + { + resmem_complex_op()(L_ace, nbands * nbands); + setmem_complex_op()(L_ace, 0, nbands * nbands); + } + + if (psi_h_psi_ace == nullptr) + { + resmem_complex_op()(psi_h_psi_ace, nbands * nbands); + } + + if (first_iter) return; + ModuleBase::timer::tick("OperatorEXXPW", "construct_ace"); + + for (int ik = 0; ik < nk; ik++) + { + int npwk = wfcpw->npwk[ik]; + + T* Xi_ace = Xi_ace_k[ik]; + psi.fix_kb(ik, 0); + T* p_psi = psi.get_pointer(); + + setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); + + *ik_ = ik; + + act_op( + nbands, + nbasis, + 1, + p_psi, + h_psi_ace, + nbasis, + false + ); + + // psi_h_psi_ace = psi^\dagger * h_psi_ace + // p_exx_helper->psi.fix_kb(0, 0); + gemm_complex_op()('C', + 'N', + nbands, + nbands, + npwk, + &intermediate_one, + p_psi, + nbasis, + h_psi_ace, + nbasis, + &intermediate_zero, + psi_h_psi_ace, + nbands); + + // reduction of psi_h_psi_ace, due to distributed memory + Parallel_Reduce::reduce_pool(psi_h_psi_ace, nbands * nbands); + + T intermediate_minus_one = -1.0; + axpy_complex_op()(nbands * nbands, + &intermediate_minus_one, + psi_h_psi_ace, + 1, + L_ace, + 1); + + + int info = 0; + char up = 'U', lo = 'L'; + + lapack_potrf()(lo, nbands, L_ace, nbands); + + // expand for-loop + for (int i = 0; i < nbands; ++i) { + setmem_complex_op()(L_ace + i * nbands, 0, i); + } + + // L_ace inv in place + char non = 'N'; + lapack_trtri()(lo, non, nbands, L_ace, nbands); + + // Xi_ace = L_ace^-1 * h_psi_ace^dagger + gemm_complex_op()('N', + 'C', + nbands, + npwk, + nbands, + &intermediate_one, + L_ace, + nbands, + h_psi_ace, + nbasis, + &intermediate_zero, + Xi_ace, + nbands); + + // clear mem + setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); + setmem_complex_op()(psi_h_psi_ace, 0, nbands * nbands); + setmem_complex_op()(L_ace, 0, nbands * nbands); + + } + + *ik_ = ik_save; + ModuleBase::timer::tick("OperatorEXXPW", "construct_ace"); + +} + +template +double OperatorEXXPW::cal_exx_energy_ace(psi::Psi* ppsi_) const +{ + double Eexx = 0; + + psi::Psi psi_ = *ppsi_; + int* ik_ = const_cast(&this->ik); + int ik_save = this->ik; + for (int i = 0; i < wfcpw->nks; i++) + { + setmem_complex_op()(h_psi_ace, 0, psi_.get_nbands() * psi_.get_nbasis()); + *ik_ = i; + psi_.fix_kb(i, 0); + T* psi_i = psi_.get_pointer(); + act_op_ace(psi_.get_nbands(), psi_.get_nbasis(), 1, psi_i, h_psi_ace, 0, true); + + for (int nband = 0; nband < psi_.get_nbands(); nband++) + { + psi_.fix_kb(i, nband); + T* psi_i_n = psi_.get_pointer(); + T* hpsi_i_n = h_psi_ace + nband * psi_.get_nbasis(); + double wg_i_n = (*wg)(i, nband); + // Eexx += dot(psi_i_n, h_psi_i_n) + Eexx += dot_op()(psi_.get_nbasis(), psi_i_n, hpsi_i_n, false) * wg_i_n * 2; + } + } + + Parallel_Reduce::reduce_pool(Eexx); + *ik_ = ik_save; + return Eexx; +} +template class OperatorEXXPW, base_device::DEVICE_CPU>; +template class OperatorEXXPW, base_device::DEVICE_CPU>; +#if ((defined __CUDA) || (defined __ROCM)) +template class OperatorEXXPW, base_device::DEVICE_GPU>; +template class OperatorEXXPW, base_device::DEVICE_GPU>; +#endif +} \ No newline at end of file diff --git a/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp b/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp new file mode 100644 index 0000000000..6019ac110c --- /dev/null +++ b/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp @@ -0,0 +1,552 @@ +#include "op_exx_pw.h" +#include "source_io/module_parameter/parameter.h" +#include "source_pw/module_pwdft/global.h" + +namespace hamilt +{ +template +void get_exx_potential(const K_Vectors* kv, + const ModulePW::PW_Basis_K* wfcpw, + ModulePW::PW_Basis* rhopw_dev, + Real* pot, + double tpiba, + bool gamma_extrapolation, + double ucell_omega) +{ + using setmem_real_cpu_op = base_device::memory::set_memory_op; + using syncmem_real_c2d_op = base_device::memory::synchronize_memory_op; + + Real nqs_half1 = 0.5 * kv->nmp[0]; + Real nqs_half2 = 0.5 * kv->nmp[1]; + Real nqs_half3 = 0.5 * kv->nmp[2]; + + Real* pot_cpu = nullptr; + int nks = wfcpw->nks, npw = rhopw_dev->npw; + double tpiba2 = tpiba * tpiba; + pot_cpu = new Real[npw * wfcpw->nks * wfcpw->nks]; + // fill zero + setmem_real_cpu_op()(pot_cpu, 0, npw * nks * nks); + + // calculate Fock pot + auto param_fock = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Fock]; + for (auto param: param_fock) + { + double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock, + 0.0, + kv, + wfcpw, + rhopw_dev, + tpiba, + gamma_extrapolation, + ucell_omega); + double alpha = std::stod(param["alpha"]); + for (int ik = 0; ik < nks; ik++) + { + for (int iq = 0; iq < nks; iq++) + { + const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; + const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; + const ModuleBase::Vector3 q_c = wfcpw->kvec_c[iq]; + const ModuleBase::Vector3 q_d = wfcpw->kvec_d[iq]; + +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (int ig = 0; ig < rhopw_dev->npw; ig++) + { + const ModuleBase::Vector3 g_d = rhopw_dev->gdirect[ig]; + const ModuleBase::Vector3 kqg_d = k_d - q_d + g_d; + // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) + // 7/8 of the points in the grid are "activated" and 1/8 are disabled. + // grid_factor is designed for the 7/8 of the grid to function like all of the points + Real grid_factor = 1; + double extrapolate_grid = 8.0 / 7.0; + if (gamma_extrapolation) + { + // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3) + auto isint = [](double x) { + double epsilon = 1e-6; // this follows the isint judgement in q-e + return std::abs(x - std::round(x)) < epsilon; + }; + if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)) + { + grid_factor = 0; + } + else + { + grid_factor = extrapolate_grid; + } + } + + const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; + const int nk = nks / nk_fac; + const int ig_kq = ik * nks * npw + iq * npw + ig; + + Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2; + // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2 + if (gg >= 1e-8) + { + Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg; + pot_cpu[ig_kq] += fac * grid_factor * alpha; + } + // } + else + { + pot_cpu[ig_kq] += exx_div * alpha; + } + // assert(is_finite(density_recip[ig])); + } + } + } + } + + // calculate erfc pot + auto param_erfc = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Erfc]; + for (auto param: param_erfc) + { + double erfc_omega = std::stod(param["omega"]); + double erfc_omega2 = erfc_omega * erfc_omega; + double alpha = std::stod(param["alpha"]); + double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc, + erfc_omega, + kv, + wfcpw, + rhopw_dev, + tpiba, + gamma_extrapolation, + ucell_omega); + for (int ik = 0; ik < nks; ik++) + { + for (int iq = 0; iq < nks; iq++) + { + const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; + const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; + const ModuleBase::Vector3 q_c = wfcpw->kvec_c[iq]; + const ModuleBase::Vector3 q_d = wfcpw->kvec_d[iq]; + +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (int ig = 0; ig < rhopw_dev->npw; ig++) + { + const ModuleBase::Vector3 g_d = rhopw_dev->gdirect[ig]; + const ModuleBase::Vector3 kqg_d = k_d - q_d + g_d; + // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) + // 7/8 of the points in the grid are "activated" and 1/8 are disabled. + // grid_factor is designed for the 7/8 of the grid to function like all of the points + Real grid_factor = 1; + double extrapolate_grid = 8.0 / 7.0; + if (gamma_extrapolation) + { + // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3) + auto isint = [](double x) { + double epsilon = 1e-6; // this follows the isint judgement in q-e + return std::abs(x - std::round(x)) < epsilon; + }; + if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)) + { + grid_factor = 0; + } + else + { + grid_factor = extrapolate_grid; + } + } + + const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; + const int nk = nks / nk_fac; + const int ig_kq = ik * nks * npw + iq * npw + ig; + + Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2; + // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2 + if (gg >= 1e-8) + { + Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg; + pot_cpu[ig_kq] += fac * (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) * grid_factor * alpha; + } + // } + else + { + // if (PARAM.inp.dft_functional == "hse") + if (!gamma_extrapolation) + { + pot_cpu[ig_kq] += (exx_div - ModuleBase::PI * ModuleBase::e2 / erfc_omega2) * alpha; + } + else + { + pot_cpu[ig_kq] += exx_div * alpha; + } + } + // assert(is_finite(density_recip[ig])); + } + } + } + } + + // copy the potential to the device memory + syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw * wfcpw->nks * wfcpw->nks); + + delete pot_cpu; +} + +template +void get_exx_stress_potential(const K_Vectors* kv, + const ModulePW::PW_Basis_K* wfcpw, + ModulePW::PW_Basis* rhopw_dev, + Real* pot, + double tpiba, + bool gamma_extrapolation, + double ucell_omega) +{ + using setmem_real_cpu_op = base_device::memory::set_memory_op; + using syncmem_real_c2d_op = base_device::memory::synchronize_memory_op; + + Real nqs_half1 = 0.5 * kv->nmp[0]; + Real nqs_half2 = 0.5 * kv->nmp[1]; + Real nqs_half3 = 0.5 * kv->nmp[2]; + + Real* pot_cpu = nullptr; + int nks = wfcpw->nks, npw = rhopw_dev->npw; + double tpiba2 = tpiba * tpiba; + pot_cpu = new Real[npw * wfcpw->nks * wfcpw->nks]; + // fill zero + setmem_real_cpu_op()(pot_cpu, 0, npw * nks * nks); + + // calculate Fock pot + auto param_fock = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Fock]; + for (auto param: param_fock) + { + double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock, + 0.0, + kv, + wfcpw, + rhopw_dev, + tpiba, + gamma_extrapolation, + ucell_omega); + double alpha = std::stod(param["alpha"]); + for (int ik = 0; ik < nks; ik++) + { + for (int iq = 0; iq < nks; iq++) + { + const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; + const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; + const ModuleBase::Vector3 q_c = wfcpw->kvec_c[iq]; + const ModuleBase::Vector3 q_d = wfcpw->kvec_d[iq]; + +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (int ig = 0; ig < rhopw_dev->npw; ig++) + { + const ModuleBase::Vector3 g_d = rhopw_dev->gdirect[ig]; + const ModuleBase::Vector3 kqg_d = k_d - q_d + g_d; + // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) + // 7/8 of the points in the grid are "activated" and 1/8 are disabled. + // grid_factor is designed for the 7/8 of the grid to function like all of the points + Real grid_factor = 1; + double extrapolate_grid = 8.0 / 7.0; + if (gamma_extrapolation) + { + // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3) + auto isint = [](double x) { + double epsilon = 1e-6; // this follows the isint judgement in q-e + return std::abs(x - std::round(x)) < epsilon; + }; + if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)) + { + grid_factor = 0; + } + else + { + grid_factor = extrapolate_grid; + } + } + + const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; + const int nk = nks / nk_fac; + const int ig_kq = ik * nks * npw + iq * npw + ig; + + Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2; + // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2 + if (gg >= 1e-8) + { + Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg; + pot_cpu[ig_kq] += 1.0 / gg * grid_factor * alpha; + } + } + } + } + } + + // calculate erfc pot + auto param_erfc = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Erfc]; + for (auto param: param_erfc) + { + double erfc_omega = std::stod(param["omega"]); + double erfc_omega2 = erfc_omega * erfc_omega; + double alpha = std::stod(param["alpha"]); + double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc, + erfc_omega, + kv, + wfcpw, + rhopw_dev, + tpiba, + gamma_extrapolation, + ucell_omega); + for (int ik = 0; ik < nks; ik++) + { + for (int iq = 0; iq < nks; iq++) + { + const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; + const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; + const ModuleBase::Vector3 q_c = wfcpw->kvec_c[iq]; + const ModuleBase::Vector3 q_d = wfcpw->kvec_d[iq]; + +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (int ig = 0; ig < rhopw_dev->npw; ig++) + { + const ModuleBase::Vector3 g_d = rhopw_dev->gdirect[ig]; + const ModuleBase::Vector3 kqg_d = k_d - q_d + g_d; + // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) + // 7/8 of the points in the grid are "activated" and 1/8 are disabled. + // grid_factor is designed for the 7/8 of the grid to function like all of the points + Real grid_factor = 1; + double extrapolate_grid = 8.0 / 7.0; + if (gamma_extrapolation) + { + // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3) + auto isint = [](double x) { + double epsilon = 1e-6; // this follows the isint judgement in q-e + return std::abs(x - std::round(x)) < epsilon; + }; + if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)) + { + grid_factor = 0; + } + else + { + grid_factor = extrapolate_grid; + } + } + + const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; + const int nk = nks / nk_fac; + const int ig_kq = ik * nks * npw + iq * npw + ig; + + Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2; + // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2 + if (gg >= 1e-8) + { + Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg; + pot_cpu[ig_kq] += (1.0 - (1.0 + gg / 4.0 / erfc_omega2) * std::exp(-gg / 4.0 / erfc_omega2)) / (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) / gg * grid_factor * alpha; + } + // } + else + { + // if (PARAM.inp.dft_functional == "hse") + if (!gamma_extrapolation) + { + pot_cpu[ig_kq] += 1.0 / 4.0 / erfc_omega2 * alpha; + } + } + // assert(is_finite(density_recip[ig])); + } + } + } + } + + // copy the potential to the device memory + syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw * wfcpw->nks * wfcpw->nks); + + delete pot_cpu; +} + +double exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type coulomb_type, + double erfc_omega, + const K_Vectors* kv, + const ModulePW::PW_Basis_K* wfcpw, + ModulePW::PW_Basis* rhopw_dev, + double tpiba, + bool gamma_extrapolation, + double ucell_omega) +{ + double exx_div = 0; + + double nqs_half1 = 0.5 * kv->nmp[0]; + double nqs_half2 = 0.5 * kv->nmp[1]; + double nqs_half3 = 0.5 * kv->nmp[2]; + + int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; + + // here we follow the exx_divergence subroutine in q-e (PW/src/exx_base.f90) + double alpha = 10.0 / wfcpw->gk_ecut; + double tpiba2 = tpiba * tpiba; + double div = 0; + + // this is the \sum_q F(q) part + // temporarily for all k points, should be replaced to q points later + for (int ik = 0; ik < wfcpw->nks; ik++) + { + const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; + const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : div) +#endif + for (int ig = 0; ig < rhopw_dev->npw; ig++) + { + const ModuleBase::Vector3 q_c = k_c + rhopw_dev->gcar[ig]; + const ModuleBase::Vector3 q_d = k_d + rhopw_dev->gdirect[ig]; + double qq = q_c.norm2(); + // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) + // 7/8 of the points in the grid are "activated" and 1/8 are disabled. + // grid_factor is designed for the 7/8 of the grid to function like all of the points + double grid_factor = 1; + double extrapolate_grid = 8.0 / 7.0; + if (gamma_extrapolation) + { + auto isint = [](double x) { + double epsilon = 1e-6; // this follows the isint judgement in q-e + return std::abs(x - std::round(x)) < epsilon; + }; + if (isint(q_d[0] * nqs_half1) && isint(q_d[1] * nqs_half2) && isint(q_d[2] * nqs_half3)) + { + grid_factor = 0; + } + else + { + grid_factor = extrapolate_grid; + } + } + + if (qq <= 1e-8) + continue; + // else if (PARAM.inp.dft_functional == "hse") + else if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) + { + double omega = erfc_omega; + double omega2 = omega * omega; + div += std::exp(-alpha * qq) / qq * (1.0 - std::exp(-qq * tpiba2 / 4.0 / omega2)) * grid_factor; + } + else + { + div += std::exp(-alpha * qq) / qq * grid_factor; + } + } + } + + Parallel_Reduce::reduce_pool(div); + // std::cout << "EXX div: " << div << std::endl; + + // if (PARAM.inp.dft_functional == "hse") + if (!gamma_extrapolation) + { + if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) + { + double omega = erfc_omega; + div += tpiba2 / 4.0 / omega / omega; // compensate for the finite value when qq = 0 + } + else + { + div -= alpha; + } + } + + div *= ModuleBase::e2 * ModuleBase::FOUR_PI / tpiba2 / wfcpw->nks; + // std::cout << "div: " << div << std::endl; + + // numerically value the mean value of F(q) in the reciprocal space + // This means we need to calculate the average of F(q) in the first brillouin zone + alpha /= tpiba2; + int nqq = 100000; + double dq = 5.0 / std::sqrt(alpha) / nqq; + double aa = 0.0; + // if (PARAM.inp.dft_functional == "hse") + if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) + { + double omega = erfc_omega; + double omega2 = omega * omega; +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : aa) +#endif + for (int i = 0; i < nqq; i++) + { + double q = dq * (i + 0.5); + aa -= exp(-alpha * q * q) * exp(-q * q / 4.0 / omega2) * dq; + } + } + aa *= 8 / ModuleBase::FOUR_PI; + aa += 1.0 / std::sqrt(alpha * ModuleBase::PI); + + div -= ModuleBase::e2 * ucell_omega * aa; + exx_div = div * wfcpw->nks / nk_fac; + // exx_div = 0; + // std::cout << "EXX divergence: " << exx_div << std::endl; + + return exx_div; +} +template class OperatorEXXPW, base_device::DEVICE_CPU>; +template class OperatorEXXPW, base_device::DEVICE_CPU>; +template void get_exx_potential(const K_Vectors*, + const ModulePW::PW_Basis_K*, + ModulePW::PW_Basis*, + float*, + double, + bool, + double); +template void get_exx_potential(const K_Vectors*, + const ModulePW::PW_Basis_K*, + ModulePW::PW_Basis*, + double*, + double, + bool, + double); +template void get_exx_stress_potential(const K_Vectors*, + const ModulePW::PW_Basis_K*, + ModulePW::PW_Basis*, + float*, + double, + bool, + double); +template void get_exx_stress_potential(const K_Vectors*, + const ModulePW::PW_Basis_K*, + ModulePW::PW_Basis*, + double*, + double, + bool, + double); +#if ((defined __CUDA) || (defined __ROCM)) +template class OperatorEXXPW, base_device::DEVICE_GPU>; +template class OperatorEXXPW, base_device::DEVICE_GPU>; +template void get_exx_potential(const K_Vectors*, + const ModulePW::PW_Basis_K*, + ModulePW::PW_Basis*, + float*, + double, + bool, + double); +template void get_exx_potential(const K_Vectors*, + const ModulePW::PW_Basis_K*, + ModulePW::PW_Basis*, + double*, + double, + bool, + double); +template void get_exx_stress_potential(const K_Vectors*, + const ModulePW::PW_Basis_K*, + ModulePW::PW_Basis*, + float*, + double, + bool, + double); +template void get_exx_stress_potential(const K_Vectors*, + const ModulePW::PW_Basis_K*, + ModulePW::PW_Basis*, + double*, + double, + bool, + double); +#endif +} // namespace hamilt \ No newline at end of file diff --git a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp index 68bfca3158..83879d634f 100644 --- a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp +++ b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp @@ -170,7 +170,7 @@ void OperatorEXXPW::act_op(const int nbands, // << std::endl; if (!potential_got) { - get_potential(); + get_exx_potential(kv, wfcpw, rhopw_dev, pot, tpiba, gamma_extrapolation, ucell->omega); potential_got = true; } @@ -263,211 +263,6 @@ void OperatorEXXPW::act_op(const int nbands, } -template -void OperatorEXXPW::act_op_ace(const int nbands, - const int nbasis, - const int npol, - const T *tmpsi_in, - T *tmhpsi, - const int ngk_ik, - const bool is_first_node) const -{ - ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace"); -// std::cout << "act_op_ace" << std::endl; - // hpsi += -Xi^\dagger * Xi * psi - T* Xi_ace = Xi_ace_k[this->ik]; - int nbands_tot = psi.get_nbands(); - int nbasis_max = psi.get_nbasis(); -// T* hpsi = nullptr; -// resmem_complex_op()(hpsi, nbands_tot * nbasis); -// setmem_complex_op()(hpsi, 0, nbands_tot * nbasis); - T* Xi_psi = nullptr; - resmem_complex_op()(Xi_psi, nbands_tot * nbands); - setmem_complex_op()(Xi_psi, 0, nbands_tot * nbands); - - char trans_N = 'N', trans_T = 'T', trans_C = 'C'; - T intermediate_one = 1.0, intermediate_zero = 0.0, intermediate_minus_one = -1.0; - // Xi * psi - gemm_complex_op()(trans_N, - trans_N, - nbands_tot, - nbands, - nbasis, - &intermediate_one, - Xi_ace, - nbands_tot, - tmpsi_in, - nbasis, - &intermediate_zero, - Xi_psi, - nbands_tot - ); - - Parallel_Reduce::reduce_pool(Xi_psi, nbands_tot * nbands); - - // Xi^\dagger * (Xi * psi) - gemm_complex_op()(trans_C, - trans_N, - nbasis, - nbands, - nbands_tot, - &intermediate_minus_one, - Xi_ace, - nbands_tot, - Xi_psi, - nbands_tot, - &intermediate_one, - tmhpsi, - nbasis - ); - - -// // negative sign, add to hpsi -// vec_add_vec_complex_op()(this->ctx, nbands * nbasis, tmhpsi, hpsi, -1, tmhpsi, 1); -// delmem_complex_op()(hpsi); - delmem_complex_op()(Xi_psi); - ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace"); - -} - -template -void OperatorEXXPW::construct_ace() const -{ - ModuleBase::timer::tick("OperatorEXXPW", "construct_ace"); -// int nkb = p_exx_helper->psi.get_nbands() * p_exx_helper->psi.get_nk(); - int nbands = psi.get_nbands(); - int nbasis = psi.get_nbasis(); - int nk = psi.get_nk(); - - int ik_save = this->ik; - int * ik_ = const_cast(&this->ik); - - T intermediate_one = 1.0, intermediate_zero = 0.0; - - if (h_psi_ace == nullptr) - { - resmem_complex_op()(h_psi_ace, nbands * nbasis); - setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); - } - - if (Xi_ace_k.size() != nk) - { - Xi_ace_k.resize(nk); - for (int i = 0; i < nk; i++) - { - resmem_complex_op()(Xi_ace_k[i], nbands * nbasis); - } - } - - for (int i = 0; i < nk; i++) - { - setmem_complex_op()(Xi_ace_k[i], 0, nbands * nbasis); - } - - if (L_ace == nullptr) - { - resmem_complex_op()(L_ace, nbands * nbands); - setmem_complex_op()(L_ace, 0, nbands * nbands); - } - - if (psi_h_psi_ace == nullptr) - { - resmem_complex_op()(psi_h_psi_ace, nbands * nbands); - } - - if (first_iter) return; - - for (int ik = 0; ik < nk; ik++) - { - int npwk = wfcpw->npwk[ik]; - - T* Xi_ace = Xi_ace_k[ik]; - psi.fix_kb(ik, 0); - T* p_psi = psi.get_pointer(); - - setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); - - *ik_ = ik; - - act_op( - nbands, - nbasis, - 1, - p_psi, - h_psi_ace, - nbasis, - false - ); - - // psi_h_psi_ace = psi^\dagger * h_psi_ace - // p_exx_helper->psi.fix_kb(0, 0); - gemm_complex_op()('C', - 'N', - nbands, - nbands, - npwk, - &intermediate_one, - p_psi, - nbasis, - h_psi_ace, - nbasis, - &intermediate_zero, - psi_h_psi_ace, - nbands); - - // reduction of psi_h_psi_ace, due to distributed memory - Parallel_Reduce::reduce_pool(psi_h_psi_ace, nbands * nbands); - - T intermediate_minus_one = -1.0; - axpy_complex_op()(nbands * nbands, - &intermediate_minus_one, - psi_h_psi_ace, - 1, - L_ace, - 1); - - - int info = 0; - char up = 'U', lo = 'L'; - - lapack_potrf()(lo, nbands, L_ace, nbands); - - // expand for-loop - for (int i = 0; i < nbands; ++i) { - setmem_complex_op()(L_ace + i * nbands, 0, i); - } - - // L_ace inv in place - char non = 'N'; - lapack_trtri()(lo, non, nbands, L_ace, nbands); - - // Xi_ace = L_ace^-1 * h_psi_ace^dagger - gemm_complex_op()('N', - 'C', - nbands, - npwk, - nbands, - &intermediate_one, - L_ace, - nbands, - h_psi_ace, - nbasis, - &intermediate_zero, - Xi_ace, - nbands); - - // clear mem - setmem_complex_op()(h_psi_ace, 0, nbands * nbasis); - setmem_complex_op()(psi_h_psi_ace, 0, nbands * nbands); - setmem_complex_op()(L_ace, 0, nbands * nbands); - - } - - *ik_ = ik_save; - ModuleBase::timer::tick("OperatorEXXPW", "construct_ace"); - -} - template std::vector OperatorEXXPW::get_q_points(const int ik) const { @@ -560,291 +355,6 @@ OperatorEXXPW::OperatorEXXPW(const OperatorEXXPW *op } -template -void OperatorEXXPW::get_potential() const -{ - Real nqs_half1 = 0.5 * kv->nmp[0]; - Real nqs_half2 = 0.5 * kv->nmp[1]; - Real nqs_half3 = 0.5 * kv->nmp[2]; - - Real* pot_cpu = nullptr; - int nks = wfcpw->nks, npw = rhopw_dev->npw; - double tpiba2 = tpiba * tpiba; - pot_cpu = new Real[npw * wfcpw->nks * wfcpw->nks]; - // fill zero - setmem_real_cpu_op()(pot_cpu, 0, npw * nks * nks); - - // calculate Fock pot - auto param_fock = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Fock]; - for (auto param : param_fock) - { - double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock); - double alpha = std::stod(param["alpha"]); - for (int ik = 0; ik < nks; ik++) - { - for (int iq = 0; iq < nks; iq++) - { - const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; - const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; - const ModuleBase::Vector3 q_c = wfcpw->kvec_c[iq]; - const ModuleBase::Vector3 q_d = wfcpw->kvec_d[iq]; - -#ifdef _OPENMP -#pragma omp parallel for schedule(static) -#endif - for (int ig = 0; ig < rhopw_dev->npw; ig++) - { - const ModuleBase::Vector3 g_d = rhopw_dev->gdirect[ig]; - const ModuleBase::Vector3 kqg_d = k_d - q_d + g_d; - // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) - // 7/8 of the points in the grid are "activated" and 1/8 are disabled. - // grid_factor is designed for the 7/8 of the grid to function like all of the points - Real grid_factor = 1; - double extrapolate_grid = 8.0 / 7.0; - if (gamma_extrapolation) - { - // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3) - auto isint = [](double x) { - double epsilon = 1e-6; // this follows the isint judgement in q-e - return std::abs(x - std::round(x)) < epsilon; - }; - if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)) - { - grid_factor = 0; - } - else - { - grid_factor = extrapolate_grid; - } - } - - const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; - const int nk = nks / nk_fac; - const int ig_kq = ik * nks * npw + iq * npw + ig; - - Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2; - // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2 - if (gg >= 1e-8) - { - Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg; - pot_cpu[ig_kq] += fac * grid_factor * alpha; - } - // } - else - { - pot_cpu[ig_kq] += exx_div * alpha; - } - // assert(is_finite(density_recip[ig])); - } - } - } - } - - // calculate erfc pot - auto param_erfc = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Erfc]; - for (auto param : param_erfc) - { - double erfc_omega = std::stod(param["omega"]); - double erfc_omega2 = erfc_omega * erfc_omega; - double alpha = std::stod(param["alpha"]); - double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc, erfc_omega); - for (int ik = 0; ik < nks; ik++) - { - for (int iq = 0; iq < nks; iq++) - { - const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; - const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; - const ModuleBase::Vector3 q_c = wfcpw->kvec_c[iq]; - const ModuleBase::Vector3 q_d = wfcpw->kvec_d[iq]; - -#ifdef _OPENMP -#pragma omp parallel for schedule(static) -#endif - for (int ig = 0; ig < rhopw_dev->npw; ig++) - { - const ModuleBase::Vector3 g_d = rhopw_dev->gdirect[ig]; - const ModuleBase::Vector3 kqg_d = k_d - q_d + g_d; - // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) - // 7/8 of the points in the grid are "activated" and 1/8 are disabled. - // grid_factor is designed for the 7/8 of the grid to function like all of the points - Real grid_factor = 1; - double extrapolate_grid = 8.0 / 7.0; - if (gamma_extrapolation) - { - // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3) - auto isint = [](double x) { - double epsilon = 1e-6; // this follows the isint judgement in q-e - return std::abs(x - std::round(x)) < epsilon; - }; - if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)) - { - grid_factor = 0; - } - else - { - grid_factor = extrapolate_grid; - } - } - - const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; - const int nk = nks / nk_fac; - const int ig_kq = ik * nks * npw + iq * npw + ig; - - Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2; - // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2 - if (gg >= 1e-8) - { - Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg; - pot_cpu[ig_kq] += fac * (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) * grid_factor * alpha; - } - // } - else - { - // if (PARAM.inp.dft_functional == "hse") - if (!gamma_extrapolation) - { - pot_cpu[ig_kq] += (exx_div - ModuleBase::PI * ModuleBase::e2 / erfc_omega2) * alpha; - } - else - { - pot_cpu[ig_kq] += exx_div * alpha; - } - } - // assert(is_finite(density_recip[ig])); - } - } - } - } - - // copy the potential to the device memory - syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw * wfcpw->nks * wfcpw->nks); - - delete pot_cpu; -} - -template -double OperatorEXXPW::exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type coulomb_type, double erfc_omega) const -{ - double exx_div = 0; - - Real nqs_half1 = 0.5 * kv->nmp[0]; - Real nqs_half2 = 0.5 * kv->nmp[1]; - Real nqs_half3 = 0.5 * kv->nmp[2]; - - int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1; - - // here we follow the exx_divergence subroutine in q-e (PW/src/exx_base.f90) - double alpha = 10.0 / wfcpw->gk_ecut; - double tpiba2 = tpiba * tpiba; - double div = 0; - - // this is the \sum_q F(q) part - // temporarily for all k points, should be replaced to q points later - for (int ik = 0; ik < wfcpw->nks; ik++) - { - const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; - const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; -#ifdef _OPENMP -#pragma omp parallel for reduction(+:div) -#endif - for (int ig = 0; ig < rhopw_dev->npw; ig++) - { - const ModuleBase::Vector3 q_c = k_c + rhopw_dev->gcar[ig]; - const ModuleBase::Vector3 q_d = k_d + rhopw_dev->gdirect[ig]; - double qq = q_c.norm2(); - // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) - // 7/8 of the points in the grid are "activated" and 1/8 are disabled. - // grid_factor is designed for the 7/8 of the grid to function like all of the points - Real grid_factor = 1; - double extrapolate_grid = 8.0/7.0; - if (gamma_extrapolation) - { - auto isint = [](double x) - { - double epsilon = 1e-6; // this follows the isint judgement in q-e - return std::abs(x - std::round(x)) < epsilon; - }; - if (isint(q_d[0] * nqs_half1) && - isint(q_d[1] * nqs_half2) && - isint(q_d[2] * nqs_half3)) - { - grid_factor = 0; - } - else - { - grid_factor = extrapolate_grid; - } - } - - if (qq <= 1e-8) continue; - // else if (PARAM.inp.dft_functional == "hse") - else if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) - { - double omega = erfc_omega; - double omega2 = omega * omega; - div += std::exp(-alpha * qq) / qq * (1.0 - std::exp(-qq*tpiba2 / 4.0 / omega2)) * grid_factor; - } - else - { - div += std::exp(-alpha * qq) / qq * grid_factor; - } - } - } - - Parallel_Reduce::reduce_pool(div); - // std::cout << "EXX div: " << div << std::endl; - - // if (PARAM.inp.dft_functional == "hse") - if (!gamma_extrapolation) - { - if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) - { - double omega = erfc_omega; - div += tpiba2 / 4.0 / omega / omega; // compensate for the finite value when qq = 0 - } - else - { - div -= alpha; - } - - } - - div *= ModuleBase::e2 * ModuleBase::FOUR_PI / tpiba2 / wfcpw->nks; -// std::cout << "div: " << div << std::endl; - - // numerically value the mean value of F(q) in the reciprocal space - // This means we need to calculate the average of F(q) in the first brillouin zone - alpha /= tpiba2; - int nqq = 100000; - double dq = 5.0 / std::sqrt(alpha) / nqq; - double aa = 0.0; - // if (PARAM.inp.dft_functional == "hse") - if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc) - { - double omega = erfc_omega; - double omega2 = omega * omega; -#ifdef _OPENMP -#pragma omp parallel for reduction(+:aa) -#endif - for (int i = 0; i < nqq; i++) - { - double q = dq * (i+0.5); - aa -= exp(-alpha * q * q) * exp(-q*q / 4.0 / omega2) * dq; - } - } - aa *= 8 / ModuleBase::FOUR_PI; - aa += 1.0 / std::sqrt(alpha * ModuleBase::PI); - - // printf("ucell: %p\n", ucell); - double omega = ucell->omega; - div -= ModuleBase::e2 * omega * aa; - exx_div = div * wfcpw->nks / nk_fac; -// exx_div = 0; -// std::cout << "EXX divergence: " << exx_div << std::endl; - - return exx_div; -} - template double OperatorEXXPW::cal_exx_energy(psi::Psi *psi_) const { @@ -858,41 +368,6 @@ double OperatorEXXPW::cal_exx_energy(psi::Psi *psi_) const } } -template -double OperatorEXXPW::cal_exx_energy_ace(psi::Psi *ppsi_) const -{ - double Eexx = 0; - - psi::Psi psi_ = *ppsi_; - int *ik_ = const_cast(&this->ik); - int ik_save = this->ik; - for (int i = 0; i < wfcpw->nks; i++) - { - setmem_complex_op()(h_psi_ace, 0, psi_.get_nbands() * psi_.get_nbasis()); - *ik_ = i; - psi_.fix_kb(i, 0); - T* psi_i = psi_.get_pointer(); - act_op_ace(psi_.get_nbands(), psi_.get_nbasis(), 1, psi_i, h_psi_ace, 0, true); - - for (int nband = 0; nband < psi_.get_nbands(); nband++) - { - psi_.fix_kb(i, nband); - T* psi_i_n = psi_.get_pointer(); - T* hpsi_i_n = h_psi_ace + nband * psi_.get_nbasis(); - double wg_i_n = (*wg)(i, nband); - // Eexx += dot(psi_i_n, h_psi_i_n) - Eexx += dot_op()(psi_.get_nbasis(), psi_i_n, hpsi_i_n, false) * wg_i_n * 2; - - } - - - } - - Parallel_Reduce::reduce_pool(Eexx); - *ik_ = ik_save; - return Eexx; -} - template double OperatorEXXPW::cal_exx_energy_op(psi::Psi *ppsi_) const { diff --git a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h index 51b69b13a9..eebd9ff607 100644 --- a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h +++ b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h @@ -69,10 +69,6 @@ class OperatorEXXPW : public OperatorPW void multiply_potential(T *density_recip, int ik, int iq) const; - double exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type coulomb_type, double erfc_omega = 0) const; - - void get_potential() const; - void act_op(const int nbands, const int nbasis, const int npol, @@ -164,6 +160,33 @@ class OperatorEXXPW : public OperatorPW }; +template +void get_exx_potential(const K_Vectors* kv, + const ModulePW::PW_Basis_K* wfcpw, + ModulePW::PW_Basis* rhopw_dev, + Real* pot, + double tpiba, + bool gamma_extrapolation, + double ucell_omega); + +template +void get_exx_stress_potential(const K_Vectors* kv, + const ModulePW::PW_Basis_K* wfcpw, + ModulePW::PW_Basis* rhopw_dev, + Real* pot, + double tpiba, + bool gamma_extrapolation, + double ucell_omega); + +double exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type coulomb_type, + double erfc_omega, + const K_Vectors* kv, + const ModulePW::PW_Basis_K* wfcpw, + ModulePW::PW_Basis* rhopw_dev, + double tpiba, + bool gamma_extrapolation, + double ucell_omega); + } // namespace hamilt #endif // OPEXXPW_H \ No newline at end of file diff --git a/source/source_pw/module_pwdft/stress_func_exx.cpp b/source/source_pw/module_pwdft/stress_func_exx.cpp index 1885bb16ee..4aef3c872f 100644 --- a/source/source_pw/module_pwdft/stress_func_exx.cpp +++ b/source/source_pw/module_pwdft/stress_func_exx.cpp @@ -1,5 +1,6 @@ -#include "stress_pw.h" #include "global.h" +#include "operator_pw/op_exx_pw.h" +#include "stress_pw.h" template void Stress_PW::stress_exx(ModuleBase::matrix& sigma, @@ -56,198 +57,8 @@ void Stress_PW::stress_exx(ModuleBase::matrix& sigma, resmem_real_op()(pot, rhopw->npw * nks * nks); resmem_real_op()(pot_stress, rhopw->npw * nks * nks); - // prepare the coefficients - double exx_div = 0; - - // pasted from op_exx_pw.cpp - { - if (GlobalC::exx_info.info_lip.lambda == 0.0) - { - return; - } - - // here we follow the exx_divergence subroutine in q-e (PW/src/exx_base.f90) - double alpha = 10.0 / wfcpw->gk_ecut; - double div = 0; - - // this is the \sum_q F(q) part - // temporarily for all k points, should be replaced to q points later - for (int ik = 0; ik < wfcpw->nks; ik++) - { - const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; - const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; -#ifdef _OPENMP -#pragma omp parallel for reduction(+:div) -#endif - for (int ig = 0; ig < rhopw->npw; ig++) - { - const ModuleBase::Vector3 q_c = k_c + rhopw->gcar[ig]; - const ModuleBase::Vector3 q_d = k_d + rhopw->gdirect[ig]; - double qq = q_c.norm2(); - // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) - // 7/8 of the points in the grid are "activated" and 1/8 are disabled. - // grid_factor is designed for the 7/8 of the grid to function like all of the points - double grid_factor = 1; - double extrapolate_grid = 8.0/7.0; - if (gamma_extrapolation) - { - if (isint(q_d[0] * nqs_half1) && - isint(q_d[1] * nqs_half2) && - isint(q_d[2] * nqs_half3)) - { - grid_factor = 0; - } - else - { - grid_factor = extrapolate_grid; - } - } - - - if (qq <= 1e-8) continue; - else if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erfc) - { - double hse_omega = GlobalC::exx_info.info_global.hse_omega; - double omega2 = hse_omega * hse_omega; - div += std::exp(-alpha * qq) / qq * (1.0 - std::exp(-qq*tpiba2 / 4.0 / omega2)) * grid_factor; - } - else - { - div += std::exp(-alpha * qq) / qq * grid_factor; - } - } - } - - Parallel_Reduce::reduce_pool(div); - // std::cout << "EXX div: " << div << std::endl; - - // if (PARAM.inp.dft_functional == "hse") - if (!gamma_extrapolation) - { - if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erfc) - { - double omega = GlobalC::exx_info.info_global.hse_omega; - div += tpiba2 / 4.0 / omega / omega; // compensate for the finite value when qq = 0 - } - else - { - div -= alpha; - } - - } - - div *= ModuleBase::e2 * ModuleBase::FOUR_PI / tpiba2 / wfcpw->nks; - - // numerically value the mean value of F(q) in the reciprocal space - // This means we need to calculate the average of F(q) in the first brillouin zone - alpha /= tpiba2; - int nqq = 100000; - double dq = 5.0 / std::sqrt(alpha) / nqq; - double aa = 0.0; - // if (PARAM.inp.dft_functional == "hse") - if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erfc) - { - double hse_omega = GlobalC::exx_info.info_global.hse_omega; - double omega2 = hse_omega * hse_omega; - #ifdef _OPENMP - #pragma omp parallel for reduction(+:aa) - #endif - for (int i = 0; i < nqq; i++) - { - double q = dq * (i+0.5); - aa -= exp(-alpha * q * q) * exp(-q*q / 4.0 / omega2) * dq; - } - } - aa *= 8 / ModuleBase::FOUR_PI; - aa += 1.0 / std::sqrt(alpha * ModuleBase::PI); - div -= ModuleBase::e2 * omega * aa; - exx_div = div * wfcpw->nks; - // std::cout << "EXX divergence: " << exx_div << std::endl; - - } - - // prepare for the potential - for (int ik = 0; ik < nks; ik++) - { - for (int iq = 0; iq < nks; iq++) - { - const ModuleBase::Vector3 k_c = wfcpw->kvec_c[ik]; - const ModuleBase::Vector3 k_d = wfcpw->kvec_d[ik]; - const ModuleBase::Vector3 q_c = wfcpw->kvec_c[iq]; - const ModuleBase::Vector3 q_d = wfcpw->kvec_d[iq]; - - #ifdef _OPENMP - #pragma omp parallel for schedule(static) - #endif - for (int ig = 0; ig < rhopw->npw; ig++) - { - const ModuleBase::Vector3 g_d = rhopw->gdirect[ig]; - const ModuleBase::Vector3 kqg_d = k_d - q_d + g_d; - - // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114) - // 7/8 of the points in the grid are "activated" and 1/8 are disabled. - // grid_factor is designed for the 7/8 of the grid to function like all of the points - Real grid_factor = 1; - if (gamma_extrapolation) - { - double extrapolate_grid = 8.0/7.0; - if (isint(kqg_d[0] * nqs_half1) && - isint(kqg_d[1] * nqs_half2) && - isint(kqg_d[2] * nqs_half3)) - { - grid_factor = 0; - } - else - { - grid_factor = extrapolate_grid; - } - } - - const int ig_kq = ik * nks * rhopw->npw + iq * rhopw->npw + ig; - - Real gg = (k_c - q_c + rhopw->gcar[ig]).norm2() * tpiba2; - Real hse_omega2 = GlobalC::exx_info.info_global.hse_omega * GlobalC::exx_info.info_global.hse_omega; - // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2 - if (gg >= 1e-8) - { - Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg; - // if (PARAM.inp.dft_functional == "hse") - if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erfc) - { - pot[ig_kq] = fac * (1.0 - std::exp(-gg / 4.0 / hse_omega2)) * grid_factor; - pot_stress[ig_kq] = (1.0 - (1.0 + gg / 4.0 / hse_omega2) * std::exp(-gg / 4.0 / hse_omega2)) / (1.0 - std::exp(-gg / 4.0 / hse_omega2)) / gg; - } - else if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erf) - { - ModuleBase::WARNING("Stress_PW", "Stress for Erf is not implemented yet"); - pot[ig_kq] = fac * grid_factor; - pot_stress[ig_kq] = 1.0 / gg; - } - else if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Hf) - { - pot[ig_kq] = fac * grid_factor; - pot_stress[ig_kq] = 1.0 / gg; - } - } - // } - else - { - // if (PARAM.inp.dft_functional == "hse") - if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erfc && !gamma_extrapolation) - { - pot[ig_kq] = - ModuleBase::PI * ModuleBase::e2 / hse_omega2; // maybe we should add a exx_div here, but q-e does not do that - pot_stress[ig_kq] = 1 / 4.0 / hse_omega2; - } - else - { - pot[ig_kq] = exx_div; - pot_stress[ig_kq] = 0; - } - } - // assert(is_finite(density_recip[ig])); - } - } - } + hamilt::get_exx_potential(p_kv, wfcpw, rhopw, pot, tpiba, gamma_extrapolation, omega); + hamilt::get_exx_stress_potential(p_kv, wfcpw, rhopw, pot_stress, tpiba, gamma_extrapolation, omega); // calculate the stress