From 44b21367652663ba1496a76a180b9af66a2f49ba Mon Sep 17 00:00:00 2001 From: dyzheng Date: Wed, 13 Aug 2025 18:30:48 +0800 Subject: [PATCH 1/9] Fixed the bug in memory statistics --- source/module_hamilt_lcao/module_gint/gint.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp index 6ee4c238d1..1222bb82ed 100644 --- a/source/module_hamilt_lcao/module_gint/gint.cpp +++ b/source/module_hamilt_lcao/module_gint/gint.cpp @@ -199,6 +199,8 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons } else { this->hRGintCd->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol); this->hRGintCd->allocate(nullptr, true); + ModuleBase::Memory::record("Gint::hRGintCd", + this->hRGintCd->get_memory_size()); for(int is = 0; is < nspin; is++) { this->hRGint_tmp[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in); this->DMRGint[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in); @@ -208,8 +210,7 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons ModuleBase::Memory::record("Gint::hRGint_tmp", this->hRGint_tmp[0]->get_memory_size()*nspin); ModuleBase::Memory::record("Gint::DMRGint", - this->DMRGint[0]->get_memory_size() - * this->DMRGint.size()*nspin); + this->DMRGint[0]->get_memory_size()*nspin); #ifdef __MPI this->DMRGint_full->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol); this->DMRGint_full->allocate(nullptr, true); From 07a9a23530437959c6a2e78a3d85492905cd3f8d Mon Sep 17 00:00:00 2001 From: hn <3022939753@qq.com> Date: Thu, 21 Aug 2025 20:19:32 +0800 Subject: [PATCH 2/9] delete tem Hcontainer to reduce memory usage --- .../module_hamilt_lcao/module_gint/gint.cpp | 105 +++++++++++------- source/module_hamilt_lcao/module_gint/gint.h | 2 +- source/module_lr/utils/gint_move.hpp | 4 +- 3 files changed, 67 insertions(+), 44 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp index 1222bb82ed..6690446bed 100644 --- a/source/module_hamilt_lcao/module_gint/gint.cpp +++ b/source/module_hamilt_lcao/module_gint/gint.cpp @@ -33,7 +33,7 @@ Gint::~Gint() { delete this->hRGint_tmp[is]; } #ifdef __MPI - delete this->DMRGint_full; + delete this->DM2D_tmp; #endif } @@ -171,10 +171,9 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons this->hRGint_tmp[is] = new hamilt::HContainer(ucell_in.nat); } #ifdef __MPI - if (this->DMRGint_full != nullptr) { - delete this->DMRGint_full; + if (this->DM2D_tmp != nullptr) { + delete this->DM2D_tmp; } - this->DMRGint_full = new hamilt::HContainer(ucell_in.nat); #endif } @@ -211,12 +210,6 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons this->hRGint_tmp[0]->get_memory_size()*nspin); ModuleBase::Memory::record("Gint::DMRGint", this->DMRGint[0]->get_memory_size()*nspin); -#ifdef __MPI - this->DMRGint_full->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol); - this->DMRGint_full->allocate(nullptr, true); - ModuleBase::Memory::record("Gint::DMRGint_full", - this->DMRGint_full->get_memory_size()); -#endif } } @@ -232,10 +225,8 @@ void Gint::reset_DMRGint(const int& nspin) { for (auto& d : this->DMRGint) { d->allocate(nullptr, false); } #ifdef __MPI - delete this->DMRGint_full; - this->DMRGint_full = new hamilt::HContainer(*this->hRGint); - this->DMRGint_full->allocate(nullptr, false); -#endif + delete this->DM2D_tmp; +#endif } } } @@ -263,37 +254,69 @@ void Gint::transfer_DM2DtoGrid(std::vector*> DM2D) { } else // NSPIN=4 case { #ifdef __MPI - hamilt::transferParallels2Serials(*DM2D[0], this->DMRGint_full); -#else - this->DMRGint_full = DM2D[0]; -#endif - std::vector tmp_pointer(4, nullptr); - for (int iap = 0; iap < this->DMRGint_full->size_atom_pairs(); ++iap) { - auto& ap = this->DMRGint_full->get_atom_pair(iap); - int iat1 = ap.get_atom_i(); - int iat2 = ap.get_atom_j(); - for (int ir = 0; ir < ap.get_R_size(); ++ir) { - const ModuleBase::Vector3 r_index = ap.get_R_index(ir); - for (int is = 0; is < 4; is++) { - tmp_pointer[is] = this->DMRGint[is] - ->find_matrix(iat1, iat2, r_index) - ->get_pointer(); - } - double* data_full = ap.get_pointer(ir); - for (int irow = 0; irow < ap.get_row_size(); irow += 2) { - for (int icol = 0; icol < ap.get_col_size(); icol += 2) { - *(tmp_pointer[0])++ = data_full[icol]; - *(tmp_pointer[1])++ = data_full[icol + 1]; - } - data_full += ap.get_col_size(); - for (int icol = 0; icol < ap.get_col_size(); icol += 2) { - *(tmp_pointer[2])++ = data_full[icol]; - *(tmp_pointer[3])++ = data_full[icol + 1]; + int mg = DM2D[0]->get_paraV()->get_global_row_size()/2; + int ng = DM2D[0]->get_paraV()->get_global_col_size()/2; + int nb = DM2D[0]->get_paraV()->get_block_size()/2; + int blacs_ctxt = DM2D[0]->get_paraV()->blacs_ctxt; + int *iat2iwt = new int[ucell->nat]; + for (int iat = 0; iat < ucell->nat; iat++) { + iat2iwt[iat] = ucell->get_iat2iwt()[iat]/2; + } + Parallel_Orbitals *pv = new Parallel_Orbitals(); + pv->set(mg, ng, nb, blacs_ctxt); + pv->set_atomic_trace(iat2iwt, ucell->nat, mg); + auto ijr_info = DM2D[0]->get_ijr_info(); + this-> DM2D_tmp = new hamilt::HContainer(pv, nullptr, &ijr_info); + ModuleBase::Memory::record("Gint::DM2D_tmp", this->DM2D_tmp->get_memory_size()); + for (int is = 0; is < 4; is++){ + for (int iap = 0; iap < DM2D[0]->size_atom_pairs(); ++iap) { + auto& ap = DM2D[0]->get_atom_pair(iap); + int iat1 = ap.get_atom_i(); + int iat2 = ap.get_atom_j(); + for (int ir = 0; ir < ap.get_R_size(); ++ir) { + const ModuleBase::Vector3 r_index = ap.get_R_index(ir); + double* tmp_pointer = this -> DM2D_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer(); + double* data_full = ap.get_pointer(ir); + for (int irow = 0; irow < ap.get_row_size(); irow += 2) { + switch (is) {//todo: It can be written more compactly + case 0: + for (int icol = 0; icol < ap.get_col_size(); icol += 2) { + *(tmp_pointer)++ = data_full[icol]; + } + data_full += ap.get_col_size() * 2; + break; + case 1: + for (int icol = 0; icol < ap.get_col_size(); icol += 2) { + *(tmp_pointer)++ = data_full[icol + 1]; + } + data_full += ap.get_col_size() * 2; + break; + case 2: + data_full += ap.get_col_size(); + for (int icol = 0; icol < ap.get_col_size(); icol += 2) { + *(tmp_pointer)++ = data_full[icol]; + } + data_full += ap.get_col_size(); + break; + case 3: + data_full += ap.get_col_size(); + for (int icol = 0; icol < ap.get_col_size(); icol += 2) { + *(tmp_pointer)++ = data_full[icol + 1]; + } + data_full += ap.get_col_size(); + break; + } } - data_full += ap.get_col_size(); } } + hamilt::transferParallels2Serials( *(this->DM2D_tmp), this->DMRGint[is]); } + // delete iat2iwt; + // delete pv; + // delete this-> DM2D_tmp; +#else + //this->DMRGint_full = DM2D[0]; +#endif } ModuleBase::timer::tick("Gint", "transfer_DMR"); } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/gint.h b/source/module_hamilt_lcao/module_gint/gint.h index 22d75e650b..ad1c7b72fd 100644 --- a/source/module_hamilt_lcao/module_gint/gint.h +++ b/source/module_hamilt_lcao/module_gint/gint.h @@ -264,7 +264,7 @@ class Gint { std::vector*> DMRGint; //! tmp tools used in transfer_DM2DtoGrid - hamilt::HContainer* DMRGint_full = nullptr; + hamilt::HContainer* DM2D_tmp = nullptr; std::vector> pvdpRx_reduced; std::vector> pvdpRy_reduced; diff --git a/source/module_lr/utils/gint_move.hpp b/source/module_lr/utils/gint_move.hpp index 6b532073a2..2d610cc77e 100644 --- a/source/module_lr/utils/gint_move.hpp +++ b/source/module_lr/utils/gint_move.hpp @@ -60,8 +60,8 @@ Gint& Gint::operator=(Gint&& rhs) this->pvdpRz_reduced = std::move(rhs.pvdpRz_reduced); this->DMRGint = std::move(rhs.DMRGint); this->hRGint_tmp = std::move(rhs.hRGint_tmp); - this->DMRGint_full = rhs.DMRGint_full; - rhs.DMRGint_full = nullptr; + this->DM2D_tmp = rhs.DM2D_tmp; + rhs.DM2D_tmp = nullptr; return *this; } From 0a799ba06acc8be690aebaf99cf98cadef8270e9 Mon Sep 17 00:00:00 2001 From: hn <3022939753@qq.com> Date: Tue, 26 Aug 2025 11:49:57 +0800 Subject: [PATCH 3/9] delete tem hRGintCd to reduce memory usage --- .../module_hamilt_lcao/module_gint/gint.cpp | 20 +- source/module_hamilt_lcao/module_gint/gint.h | 2 +- .../module_gint/gint_k_pvpr.cpp | 206 +++++++++++++----- source/module_lr/utils/gint_move.hpp | 4 +- 4 files changed, 161 insertions(+), 71 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp index 6690446bed..40d7ef4f57 100644 --- a/source/module_hamilt_lcao/module_gint/gint.cpp +++ b/source/module_hamilt_lcao/module_gint/gint.cpp @@ -23,7 +23,7 @@ Gint::~Gint() { delete this->hRGint; - delete this->hRGintCd; + //delete this->hRGintCd; // in gamma_only case, DMRGint.size()=0, // in multi-k case, DMRGint.size()=nspin for (int is = 0; is < this->DMRGint.size(); is++) { @@ -155,11 +155,11 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons this->hRGint = new hamilt::HContainer(ucell_in.nat); } else { npol = 2; - if (this->hRGintCd != nullptr) { - delete this->hRGintCd; - } - this->hRGintCd - = new hamilt::HContainer>(ucell_in.nat); + // if (this->hRGintCd != nullptr) { + // delete this->hRGintCd; + // } + // this->hRGintCd + // = new hamilt::HContainer>(ucell_in.nat); for (int is = 0; is < nspin; is++) { if (this->DMRGint[is] != nullptr) { delete this->DMRGint[is]; @@ -196,10 +196,10 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons this->DMRGint[0]->get_memory_size() * this->DMRGint.size()); } else { - this->hRGintCd->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol); - this->hRGintCd->allocate(nullptr, true); - ModuleBase::Memory::record("Gint::hRGintCd", - this->hRGintCd->get_memory_size()); + // this->hRGintCd->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol); + // this->hRGintCd->allocate(nullptr, true); + // ModuleBase::Memory::record("Gint::hRGintCd", + // this->hRGintCd->get_memory_size()); for(int is = 0; is < nspin; is++) { this->hRGint_tmp[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in); this->DMRGint[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in); diff --git a/source/module_hamilt_lcao/module_gint/gint.h b/source/module_hamilt_lcao/module_gint/gint.h index ad1c7b72fd..1803a4241c 100644 --- a/source/module_hamilt_lcao/module_gint/gint.h +++ b/source/module_hamilt_lcao/module_gint/gint.h @@ -258,7 +258,7 @@ class Gint { std::vector*> hRGint_tmp; //! stores Hamiltonian in sparse format - hamilt::HContainer>* hRGintCd = nullptr; + //hamilt::HContainer>* hRGintCd = nullptr; //! stores DMR in sparse format std::vector*> DMRGint; diff --git a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp index 9a6cca47f7..443f940626 100644 --- a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp +++ b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp @@ -78,86 +78,176 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, ModuleBase::TITLE("Gint_k", "transfer_pvpR"); ModuleBase::timer::tick("Gint_k", "transfer_pvpR"); - this->hRGintCd->set_zero(); + //this->hRGintCd->set_zero(); - for (int iap = 0; iap < this->hRGintCd->size_atom_pairs(); iap++) - { - auto* ap = &this->hRGintCd->get_atom_pair(iap); - const int iat1 = ap->get_atom_i(); - const int iat2 = ap->get_atom_j(); - if (iat1 <= iat2) - { - hamilt::AtomPair>* upper_ap = ap; - hamilt::AtomPair>* lower_ap = this->hRGintCd->find_pair(iat2, iat1); - const hamilt::AtomPair* ap_nspin_0 = this->hRGint_tmp[0]->find_pair(iat1, iat2); - const hamilt::AtomPair* ap_nspin_3 = this->hRGint_tmp[3]->find_pair(iat1, iat2); - for (int ir = 0; ir < upper_ap->get_R_size(); ir++) - { - const auto R_index = upper_ap->get_R_index(ir); - auto upper_mat = upper_ap->find_matrix(R_index); - auto mat_nspin_0 = ap_nspin_0->find_matrix(R_index); - auto mat_nspin_3 = ap_nspin_3->find_matrix(R_index); +#ifdef __MPI + int mg = hR->get_paraV()->get_global_row_size()/2; + int ng = hR->get_paraV()->get_global_col_size()/2; + int nb = hR->get_paraV()->get_block_size()/2; + int blacs_ctxt = hR->get_paraV()->blacs_ctxt; + int *iat2iwt = new int[ucell_in->nat]; + for (int iat = 0; iat < ucell_in->nat; iat++) { + iat2iwt[iat] = ucell_in->get_iat2iwt()[iat]/2; + } + Parallel_Orbitals *pv = new Parallel_Orbitals(); + pv->set(mg, ng, nb, blacs_ctxt); + pv->set_atomic_trace(iat2iwt, ucell_in->nat, mg); + auto ijr_info = hR->get_ijr_info(); + + + hamilt::HContainer* hR_tmp = new hamilt::HContainer(pv, nullptr, &ijr_info); + ModuleBase::Memory::record("Gint::hRGintCd", hR_tmp->get_memory_size()); + // std::cout<<"test1"<size_atom_pairs()<size_atom_pairs()<size_atom_pairs(); i++){ + // std::cout<<"hRGint_tmp"<get_atom_pair(i).get_row_size()<<' '<get_atom_pair(i).get_row_size()<get_atom_pair(i).get_row_size()<<' '<get_atom_pair(i).get_row_size()<get_indexes_col(0).size()<< ' '<get_indexes_row(0).size()<get_indexes_col().size()<< ' '<get_indexes_row().size()<get_row_size(); ++irow) + + for (int is = 0; is < 4; is++){ + hR_tmp->set_zero(); + //std::cout<<"is: "<hRGint_tmp[is]), hR_tmp); + for (int iap = 0; iap < hR->size_atom_pairs(); iap++) + { + //std::cout<<"iap: "<get_atom_pair(iap); + const int iat1 = ap->get_atom_i(); + const int iat2 = ap->get_atom_j(); + const hamilt::AtomPair* ap_nspin = nullptr; + if (iat1 <= iat2) + { + hamilt::AtomPair>* upper_ap = ap; + hamilt::AtomPair>* lower_ap = hR->find_pair(iat2, iat1); + switch (is) { - for (int icol = 0; icol < mat_nspin_0->get_col_size(); ++icol) - { - upper_mat->get_value(2*irow, 2*icol) = mat_nspin_0->get_value(irow, icol) + mat_nspin_3->get_value(irow, icol); - upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin_0->get_value(irow, icol) - mat_nspin_3->get_value(irow, icol); - } + case 0: + ap_nspin = hR_tmp->find_pair(iat1, iat2); + break; + case 3: + ap_nspin = hR_tmp->find_pair(iat1, iat2); + break; } + if(ap_nspin == nullptr) break; + //const hamilt::AtomPair* ap_nspin_0 = this->hR_tmp[0]->find_pair(iat1, iat2); + //const hamilt::AtomPair* ap_nspin_3 = this->hRGint_tmp[3]->find_pair(iat1, iat2); + for (int ir = 0; ir < upper_ap->get_R_size(); ir++) + { + const auto R_index = upper_ap->get_R_index(ir); + auto upper_mat = upper_ap->find_matrix(R_index); + //auto mat_nspin_0 = ap_nspin_0->find_matrix(R_index); + //auto mat_nspin_3 = ap_nspin_3->find_matrix(R_index); + auto mat_nspin = ap_nspin->find_matrix(R_index); - if (PARAM.globalv.domag) - { - const hamilt::AtomPair* ap_nspin_1 = this->hRGint_tmp[1]->find_pair(iat1, iat2); - const hamilt::AtomPair* ap_nspin_2 = this->hRGint_tmp[2]->find_pair(iat1, iat2); - const auto mat_nspin_1 = ap_nspin_1->find_matrix(R_index); - const auto mat_nspin_2 = ap_nspin_2->find_matrix(R_index); - for (int irow = 0; irow < mat_nspin_1->get_row_size(); ++irow) + // The row size and the col size of upper_matrix is double that of matrix_nspin_0 + for (int irow = 0; irow < mat_nspin->get_row_size(); ++irow) { - for (int icol = 0; icol < mat_nspin_1->get_col_size(); ++icol) + for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol) { - upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin_1->get_value(irow, icol) + std::complex(0.0, 1.0) * mat_nspin_2->get_value(irow, icol); - upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin_1->get_value(irow, icol) - std::complex(0.0, 1.0) * mat_nspin_2->get_value(irow, icol); + //upper_mat->get_value(2*irow, 2*icol) = mat_nspin_0->get_value(irow, icol) + mat_nspin_3->get_value(irow, icol); + //upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin_0->get_value(irow, icol) - mat_nspin_3->get_value(irow, icol); + switch (is) + { + case 0: + upper_mat->get_value(2*irow, 2*icol) = mat_nspin->get_value(irow, icol); + upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin->get_value(irow, icol); + break; + case 3: + upper_mat->get_value(2*irow, 2*icol) += mat_nspin->get_value(irow, icol); + upper_mat->get_value(2*irow+1, 2*icol+1) -= mat_nspin->get_value(irow, icol); + break; + } } } - } - // fill the lower triangle matrix - if (iat1 < iat2) - { - auto lower_mat = lower_ap->find_matrix(-R_index); - for (int irow = 0; irow < upper_mat->get_row_size(); ++irow) + if (PARAM.globalv.domag) { - for (int icol = 0; icol < upper_mat->get_col_size(); ++icol) + const hamilt::AtomPair* ap_nspin = nullptr; + switch (is) + { + case 1: + ap_nspin = hR_tmp->find_pair(iat1, iat2); + break; + case 2: + ap_nspin = hR_tmp->find_pair(iat1, iat2); + break; + } + //const hamilt::AtomPair* ap_nspin_1 = this->hRGint_tmp[1]->find_pair(iat1, iat2); + //const hamilt::AtomPair* ap_nspin_2 = this->hRGint_tmp[2]->find_pair(iat1, iat2); + //const auto mat_nspin_1 = ap_nspin_1->find_matrix(R_index); + //const auto mat_nspin_2 = ap_nspin_2->find_matrix(R_index); + const auto mat_nspin = ap_nspin->find_matrix(R_index); + for (int irow = 0; irow < mat_nspin->get_row_size(); ++irow) { - lower_mat->get_value(icol, irow) = conj(upper_mat->get_value(irow, icol)); + for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol) + { + switch(is) + { + case 1: + upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin->get_value(irow, icol); + upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin->get_value(irow, icol); + break; + case 2: + upper_mat->get_value(2*irow, 2*icol+1) += std::complex(0.0, 1.0) * mat_nspin->get_value(irow, icol); + upper_mat->get_value(2*irow+1, 2*icol) -= std::complex(0.0, 1.0) * mat_nspin->get_value(irow, icol); + break; + } + //upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin->get_value(irow, icol) + std::complex(0.0, 1.0) * mat_nspin_2->get_value(irow, icol); + //upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin->get_value(irow, icol) - std::complex(0.0, 1.0) * mat_nspin_2->get_value(irow, icol); + } } } + + // fill the lower triangle matrix + if(is == 3){ + if (iat1 < iat2) + { + auto lower_mat = lower_ap->find_matrix(-R_index); + for (int irow = 0; irow < upper_mat->get_row_size(); ++irow) + { + for (int icol = 0; icol < upper_mat->get_col_size(); ++icol) + { + lower_mat->get_value(icol, irow) = conj(upper_mat->get_value(irow, icol)); + } + } + } + } + // std::cout<<"end"<, std::complex>> // =================================== -#ifdef __MPI - int size; - MPI_Comm_size(MPI_COMM_WORLD, &size); - if (size == 1) - { - hR->add(*this->hRGintCd); - } - else - { - hamilt::transferSerials2Parallels>(*this->hRGintCd, hR); - } -#else - hR->add(*this->hRGintCd); -#endif - +// #ifdef __MPI +// int size; +// MPI_Comm_size(MPI_COMM_WORLD, &size); +// if (size == 1) +// { +// hR->add(*this->hRGintCd); +// } +// else +// { +// hamilt::transferSerials2Parallels>(*this->hRGintCd, hR); +// } +// #else +// hR->add(*this->hRGintCd); +// #endif ModuleBase::timer::tick("Gint_k", "transfer_pvpR"); return; } diff --git a/source/module_lr/utils/gint_move.hpp b/source/module_lr/utils/gint_move.hpp index 2d610cc77e..c3ee35f566 100644 --- a/source/module_lr/utils/gint_move.hpp +++ b/source/module_lr/utils/gint_move.hpp @@ -45,8 +45,8 @@ Gint& Gint::operator=(Gint&& rhs) // move hR after refactor this->hRGint = rhs.hRGint; rhs.hRGint = nullptr; - this->hRGintCd = rhs.hRGintCd; - rhs.hRGintCd = nullptr; + // this->hRGintCd = rhs.hRGintCd; + // rhs.hRGintCd = nullptr; for (int i = 0; i < this->DMRGint.size(); i++) { delete this->DMRGint[i]; From 4640fbfba3973407ceef302a123d76e8e8262a47 Mon Sep 17 00:00:00 2001 From: hn <3022939753@qq.com> Date: Sat, 30 Aug 2025 12:28:30 +0800 Subject: [PATCH 4/9] fix parallel bug --- .../module_hamilt_lcao/module_gint/gint.cpp | 10 +- source/module_hamilt_lcao/module_gint/gint.h | 2 +- .../module_gint/gint_k_pvpr.cpp | 170 ++++++++---------- source/module_lr/utils/gint_move.hpp | 4 +- 4 files changed, 82 insertions(+), 104 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp index 40d7ef4f57..f1a8805f97 100644 --- a/source/module_hamilt_lcao/module_gint/gint.cpp +++ b/source/module_hamilt_lcao/module_gint/gint.cpp @@ -23,7 +23,7 @@ Gint::~Gint() { delete this->hRGint; - //delete this->hRGintCd; + delete this->hR_tmp; // in gamma_only case, DMRGint.size()=0, // in multi-k case, DMRGint.size()=nspin for (int is = 0; is < this->DMRGint.size(); is++) { @@ -155,11 +155,9 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons this->hRGint = new hamilt::HContainer(ucell_in.nat); } else { npol = 2; - // if (this->hRGintCd != nullptr) { - // delete this->hRGintCd; - // } - // this->hRGintCd - // = new hamilt::HContainer>(ucell_in.nat); + if (this->hR_tmp != nullptr) { + delete this->hR_tmp; + } for (int is = 0; is < nspin; is++) { if (this->DMRGint[is] != nullptr) { delete this->DMRGint[is]; diff --git a/source/module_hamilt_lcao/module_gint/gint.h b/source/module_hamilt_lcao/module_gint/gint.h index 1803a4241c..d8090df191 100644 --- a/source/module_hamilt_lcao/module_gint/gint.h +++ b/source/module_hamilt_lcao/module_gint/gint.h @@ -258,7 +258,7 @@ class Gint { std::vector*> hRGint_tmp; //! stores Hamiltonian in sparse format - //hamilt::HContainer>* hRGintCd = nullptr; + hamilt::HContainer>* hR_tmp = nullptr; //! stores DMR in sparse format std::vector*> DMRGint; diff --git a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp index 443f940626..f0a3bc6518 100644 --- a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp +++ b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp @@ -78,12 +78,10 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, ModuleBase::TITLE("Gint_k", "transfer_pvpR"); ModuleBase::timer::tick("Gint_k", "transfer_pvpR"); - //this->hRGintCd->set_zero(); - -#ifdef __MPI int mg = hR->get_paraV()->get_global_row_size()/2; int ng = hR->get_paraV()->get_global_col_size()/2; int nb = hR->get_paraV()->get_block_size()/2; +#ifdef __MPI int blacs_ctxt = hR->get_paraV()->blacs_ctxt; int *iat2iwt = new int[ucell_in->nat]; for (int iat = 0; iat < ucell_in->nat; iat++) { @@ -93,119 +91,64 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, pv->set(mg, ng, nb, blacs_ctxt); pv->set_atomic_trace(iat2iwt, ucell_in->nat, mg); auto ijr_info = hR->get_ijr_info(); - - - hamilt::HContainer* hR_tmp = new hamilt::HContainer(pv, nullptr, &ijr_info); - ModuleBase::Memory::record("Gint::hRGintCd", hR_tmp->get_memory_size()); - // std::cout<<"test1"<size_atom_pairs()<size_atom_pairs()<size_atom_pairs(); i++){ - // std::cout<<"hRGint_tmp"<get_atom_pair(i).get_row_size()<<' '<get_atom_pair(i).get_row_size()<get_atom_pair(i).get_row_size()<<' '<get_atom_pair(i).get_row_size()<get_indexes_col(0).size()<< ' '<get_indexes_row(0).size()<get_indexes_col().size()<< ' '<get_indexes_row().size()<hR_tmp = new hamilt::HContainer>(pv, nullptr, &ijr_info); + ModuleBase::Memory::record("Gint::hRGintCd", this->hR_tmp->get_memory_size()); + //0,3;1,2;1,2;0,3 + std::vector first = {0, 1, 1, 0}; + std::vector second= {3, 2, 2, 3}; for (int is = 0; is < 4; is++){ - hR_tmp->set_zero(); - //std::cout<<"is: "<hRGint_tmp[is]), hR_tmp); - for (int iap = 0; iap < hR->size_atom_pairs(); iap++) + this->hR_tmp->set_zero(); + hamilt::HContainer>* hRGint_tmpCd = new hamilt::HContainer>(this->ucell->nat); + hRGint_tmpCd->insert_ijrs(this->gridt->get_ijr_info(), *(this->ucell)); + hRGint_tmpCd->allocate(nullptr, true); + hRGint_tmpCd->set_zero(); + for (int iap = 0; iap < hRGint_tmpCd->size_atom_pairs(); iap++) { //std::cout<<"iap: "<get_atom_pair(iap); + auto* ap = &hRGint_tmpCd->get_atom_pair(iap); const int iat1 = ap->get_atom_i(); const int iat2 = ap->get_atom_j(); - const hamilt::AtomPair* ap_nspin = nullptr; if (iat1 <= iat2) { hamilt::AtomPair>* upper_ap = ap; - hamilt::AtomPair>* lower_ap = hR->find_pair(iat2, iat1); - switch (is) - { - case 0: - ap_nspin = hR_tmp->find_pair(iat1, iat2); - break; - case 3: - ap_nspin = hR_tmp->find_pair(iat1, iat2); - break; - } - if(ap_nspin == nullptr) break; - //const hamilt::AtomPair* ap_nspin_0 = this->hR_tmp[0]->find_pair(iat1, iat2); - //const hamilt::AtomPair* ap_nspin_3 = this->hRGint_tmp[3]->find_pair(iat1, iat2); + hamilt::AtomPair>* lower_ap = hRGint_tmpCd->find_pair(iat2, iat1); + const hamilt::AtomPair* ap_nspin1 = this->hRGint_tmp[first[is]] ->find_pair(iat1, iat2); + const hamilt::AtomPair* ap_nspin2 = this->hRGint_tmp[second[is]] ->find_pair(iat1, iat2); for (int ir = 0; ir < upper_ap->get_R_size(); ir++) { + //std::cout<<"ir"<get_R_index(ir); auto upper_mat = upper_ap->find_matrix(R_index); - //auto mat_nspin_0 = ap_nspin_0->find_matrix(R_index); - //auto mat_nspin_3 = ap_nspin_3->find_matrix(R_index); - auto mat_nspin = ap_nspin->find_matrix(R_index); - + auto mat_nspin1 = ap_nspin1->find_matrix(R_index); + auto mat_nspin2 = ap_nspin2->find_matrix(R_index); // The row size and the col size of upper_matrix is double that of matrix_nspin_0 - for (int irow = 0; irow < mat_nspin->get_row_size(); ++irow) + for (int irow = 0; irow < mat_nspin1->get_row_size(); ++irow) { - for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol) + for (int icol = 0; icol < mat_nspin1->get_col_size(); ++icol) { - //upper_mat->get_value(2*irow, 2*icol) = mat_nspin_0->get_value(irow, icol) + mat_nspin_3->get_value(irow, icol); - //upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin_0->get_value(irow, icol) - mat_nspin_3->get_value(irow, icol); switch (is) { case 0: - upper_mat->get_value(2*irow, 2*icol) = mat_nspin->get_value(irow, icol); - upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin->get_value(irow, icol); + upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) + mat_nspin2->get_value(irow, icol); + break; + case 1: + upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) + + std::complex(0.0, 1.0) * mat_nspin2->get_value(irow, icol); + break; + case 2: + upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) - + std::complex(0.0, 1.0) * mat_nspin2->get_value(irow, icol); break; case 3: - upper_mat->get_value(2*irow, 2*icol) += mat_nspin->get_value(irow, icol); - upper_mat->get_value(2*irow+1, 2*icol+1) -= mat_nspin->get_value(irow, icol); + upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) - mat_nspin2->get_value(irow, icol); break; } } } - - if (PARAM.globalv.domag) - { - const hamilt::AtomPair* ap_nspin = nullptr; - switch (is) - { - case 1: - ap_nspin = hR_tmp->find_pair(iat1, iat2); - break; - case 2: - ap_nspin = hR_tmp->find_pair(iat1, iat2); - break; - } - //const hamilt::AtomPair* ap_nspin_1 = this->hRGint_tmp[1]->find_pair(iat1, iat2); - //const hamilt::AtomPair* ap_nspin_2 = this->hRGint_tmp[2]->find_pair(iat1, iat2); - //const auto mat_nspin_1 = ap_nspin_1->find_matrix(R_index); - //const auto mat_nspin_2 = ap_nspin_2->find_matrix(R_index); - const auto mat_nspin = ap_nspin->find_matrix(R_index); - for (int irow = 0; irow < mat_nspin->get_row_size(); ++irow) - { - for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol) - { - switch(is) - { - case 1: - upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin->get_value(irow, icol); - upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin->get_value(irow, icol); - break; - case 2: - upper_mat->get_value(2*irow, 2*icol+1) += std::complex(0.0, 1.0) * mat_nspin->get_value(irow, icol); - upper_mat->get_value(2*irow+1, 2*icol) -= std::complex(0.0, 1.0) * mat_nspin->get_value(irow, icol); - break; - } - //upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin->get_value(irow, icol) + std::complex(0.0, 1.0) * mat_nspin_2->get_value(irow, icol); - //upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin->get_value(irow, icol) - std::complex(0.0, 1.0) * mat_nspin_2->get_value(irow, icol); - } - } - } - - // fill the lower triangle matrix - if(is == 3){ + //fill the lower triangle matrix + if (PARAM.globalv.domag){ if (iat1 < iat2) { auto lower_mat = lower_ap->find_matrix(-R_index); @@ -218,15 +161,52 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, } } } - // std::cout<<"end"<hR_tmp); + for (int iap = 0; iap < hR->size_atom_pairs(); iap++) + { + //std::cout<<"iap: "<get_atom_pair(iap); + const int iat1 = ap->get_atom_i(); + const int iat2 = ap->get_atom_j(); + auto* ap_nspin = this->hR_tmp ->find_pair(iat1, iat2); + for (int ir = 0; ir < ap->get_R_size(); ir++) + { + const auto R_index = ap->get_R_index(ir); + auto upper_mat = ap->find_matrix(R_index); + auto mat_nspin = ap_nspin->find_matrix(R_index); + + // The row size and the col size of upper_matrix is double that of matrix_nspin_0 + for (int irow = 0; irow < mat_nspin->get_row_size(); ++irow) + { + for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol) + { + switch (is) + { + case 0: + upper_mat->get_value(2*irow, 2*icol) += mat_nspin->get_value(irow, icol); + break; + case 1: + upper_mat->get_value(2*irow, 2*icol+1) += mat_nspin->get_value(irow, icol); + break; + case 2: + upper_mat->get_value(2*irow+1, 2*icol) += mat_nspin->get_value(irow, icol); + break; + case 3: + upper_mat->get_value(2*irow+1, 2*icol+1) += mat_nspin->get_value(irow, icol); + break; + } + } + } + } + } } delete[] iat2iwt; - delete pv; - delete hR_tmp; #else #endif @@ -246,7 +226,7 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, // hamilt::transferSerials2Parallels>(*this->hRGintCd, hR); // } // #else -// hR->add(*this->hRGintCd); + // hR->add(*this->hRGintCd); // #endif ModuleBase::timer::tick("Gint_k", "transfer_pvpR"); return; diff --git a/source/module_lr/utils/gint_move.hpp b/source/module_lr/utils/gint_move.hpp index c3ee35f566..8062ff895d 100644 --- a/source/module_lr/utils/gint_move.hpp +++ b/source/module_lr/utils/gint_move.hpp @@ -45,8 +45,8 @@ Gint& Gint::operator=(Gint&& rhs) // move hR after refactor this->hRGint = rhs.hRGint; rhs.hRGint = nullptr; - // this->hRGintCd = rhs.hRGintCd; - // rhs.hRGintCd = nullptr; + this->hR_tmp = rhs.hR_tmp; + rhs.hR_tmp = nullptr; for (int i = 0; i < this->DMRGint.size(); i++) { delete this->DMRGint[i]; From e8afb27681284eda291dd444f9d9c7c5a82ff870 Mon Sep 17 00:00:00 2001 From: hn <3022939753@qq.com> Date: Sun, 31 Aug 2025 15:29:22 +0800 Subject: [PATCH 5/9] Simplify the computational code --- .../module_hamilt_lcao/module_gint/gint.cpp | 42 ++++--------- .../module_gint/gint_k_pvpr.cpp | 62 ++++--------------- 2 files changed, 23 insertions(+), 81 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp index f1a8805f97..dee289c12d 100644 --- a/source/module_hamilt_lcao/module_gint/gint.cpp +++ b/source/module_hamilt_lcao/module_gint/gint.cpp @@ -252,6 +252,10 @@ void Gint::transfer_DM2DtoGrid(std::vector*> DM2D) { } else // NSPIN=4 case { #ifdef __MPI + // is=0:↑↑, 1:↑↓, 2:↓↑, 3:↓↓ + const int row_set[4] = {0, 0, 1, 1}; + const int col_set[4] = {0, 1, 0, 1}; + int mg = DM2D[0]->get_paraV()->get_global_row_size()/2; int ng = DM2D[0]->get_paraV()->get_global_col_size()/2; int nb = DM2D[0]->get_paraV()->get_block_size()/2; @@ -267,42 +271,20 @@ void Gint::transfer_DM2DtoGrid(std::vector*> DM2D) { this-> DM2D_tmp = new hamilt::HContainer(pv, nullptr, &ijr_info); ModuleBase::Memory::record("Gint::DM2D_tmp", this->DM2D_tmp->get_memory_size()); for (int is = 0; is < 4; is++){ + this->DM2D_tmp->set_zero(); for (int iap = 0; iap < DM2D[0]->size_atom_pairs(); ++iap) { auto& ap = DM2D[0]->get_atom_pair(iap); int iat1 = ap.get_atom_i(); int iat2 = ap.get_atom_j(); for (int ir = 0; ir < ap.get_R_size(); ++ir) { const ModuleBase::Vector3 r_index = ap.get_R_index(ir); - double* tmp_pointer = this -> DM2D_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer(); - double* data_full = ap.get_pointer(ir); - for (int irow = 0; irow < ap.get_row_size(); irow += 2) { - switch (is) {//todo: It can be written more compactly - case 0: - for (int icol = 0; icol < ap.get_col_size(); icol += 2) { - *(tmp_pointer)++ = data_full[icol]; - } - data_full += ap.get_col_size() * 2; - break; - case 1: - for (int icol = 0; icol < ap.get_col_size(); icol += 2) { - *(tmp_pointer)++ = data_full[icol + 1]; - } - data_full += ap.get_col_size() * 2; - break; - case 2: - data_full += ap.get_col_size(); - for (int icol = 0; icol < ap.get_col_size(); icol += 2) { - *(tmp_pointer)++ = data_full[icol]; - } - data_full += ap.get_col_size(); - break; - case 3: - data_full += ap.get_col_size(); - for (int icol = 0; icol < ap.get_col_size(); icol += 2) { - *(tmp_pointer)++ = data_full[icol + 1]; - } - data_full += ap.get_col_size(); - break; + double* matrix_out = this -> DM2D_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer(); + double* matrix_in = ap.get_pointer(ir); + for (int irow = 0; irow < ap.get_row_size()/2; irow ++) { + for (int icol = 0; icol < ap.get_col_size()/2; icol++){ + int index_i = irow* ap.get_col_size()/2 + icol; + int index_j = (irow*2+row_set[is]) * ap.get_col_size() + icol*2+col_set[is]; + matrix_out[index_i] = matrix_in[index_j]; } } } diff --git a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp index f0a3bc6518..9e80ea22c1 100644 --- a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp +++ b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp @@ -81,6 +81,7 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, int mg = hR->get_paraV()->get_global_row_size()/2; int ng = hR->get_paraV()->get_global_col_size()/2; int nb = hR->get_paraV()->get_block_size()/2; + hR->set_zero(); #ifdef __MPI int blacs_ctxt = hR->get_paraV()->blacs_ctxt; int *iat2iwt = new int[ucell_in->nat]; @@ -98,8 +99,11 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, //0,3;1,2;1,2;0,3 std::vector first = {0, 1, 1, 0}; std::vector second= {3, 2, 2, 3}; + std::vector row_set = {0, 0, 1, 1}; + std::vector col_set = {0, 1, 0, 1}; + std::vector clx_i = {1, 0, 0, -1}; + std::vector clx_j = {0, 1, -1, 0}; for (int is = 0; is < 4; is++){ - this->hR_tmp->set_zero(); hamilt::HContainer>* hRGint_tmpCd = new hamilt::HContainer>(this->ucell->nat); hRGint_tmpCd->insert_ijrs(this->gridt->get_ijr_info(), *(this->ucell)); hRGint_tmpCd->allocate(nullptr, true); @@ -128,23 +132,8 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, { for (int icol = 0; icol < mat_nspin1->get_col_size(); ++icol) { - switch (is) - { - case 0: - upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) + mat_nspin2->get_value(irow, icol); - break; - case 1: - upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) + - std::complex(0.0, 1.0) * mat_nspin2->get_value(irow, icol); - break; - case 2: - upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) - - std::complex(0.0, 1.0) * mat_nspin2->get_value(irow, icol); - break; - case 3: - upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) - mat_nspin2->get_value(irow, icol); - break; - } + upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) + + std::complex(clx_i[is], clx_j[is]) * mat_nspin2->get_value(irow, icol); } } //fill the lower triangle matrix @@ -166,7 +155,7 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, } //std::cout<<"success"<hR_tmp->set_zero(); hamilt::transferSerials2Parallels( *hRGint_tmpCd, this->hR_tmp); for (int iap = 0; iap < hR->size_atom_pairs(); iap++) { @@ -186,48 +175,19 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, { for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol) { - switch (is) - { - case 0: - upper_mat->get_value(2*irow, 2*icol) += mat_nspin->get_value(irow, icol); - break; - case 1: - upper_mat->get_value(2*irow, 2*icol+1) += mat_nspin->get_value(irow, icol); - break; - case 2: - upper_mat->get_value(2*irow+1, 2*icol) += mat_nspin->get_value(irow, icol); - break; - case 3: - upper_mat->get_value(2*irow+1, 2*icol+1) += mat_nspin->get_value(irow, icol); - break; - } + upper_mat->get_value(2*irow+row_set[is], 2*icol+col_set[is]) = + mat_nspin->get_value(irow, icol); } } } } + delete hRGint_tmpCd; } delete[] iat2iwt; #else #endif - // =================================== - // transfer HR from Gint to Veff, std::complex>> - // =================================== -// #ifdef __MPI -// int size; -// MPI_Comm_size(MPI_COMM_WORLD, &size); -// if (size == 1) -// { -// hR->add(*this->hRGintCd); -// } -// else -// { -// hamilt::transferSerials2Parallels>(*this->hRGintCd, hR); -// } -// #else - // hR->add(*this->hRGintCd); -// #endif ModuleBase::timer::tick("Gint_k", "transfer_pvpR"); return; } From 67d93e789eb4626b30aa06f5b1bd6b97289ca56a Mon Sep 17 00:00:00 2001 From: hn <3022939753@qq.com> Date: Tue, 16 Sep 2025 11:40:57 +0800 Subject: [PATCH 6/9] improve the serial code --- source/module_hamilt_lcao/module_gint/gint.cpp | 12 ++++++++---- .../module_gint/gint_k_pvpr.cpp | 16 +++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp index 490228a2f1..4279ab5c30 100644 --- a/source/module_hamilt_lcao/module_gint/gint.cpp +++ b/source/module_hamilt_lcao/module_gint/gint.cpp @@ -247,15 +247,15 @@ void Gint::transfer_DM2DtoGrid(std::vector*> DM2D) { } } else // NSPIN=4 case { -#ifdef __MPI + // is=0:↑↑, 1:↑↓, 2:↓↑, 3:↓↓ const int row_set[4] = {0, 0, 1, 1}; const int col_set[4] = {0, 1, 0, 1}; int mg = DM2D[0]->get_paraV()->get_global_row_size()/2; int ng = DM2D[0]->get_paraV()->get_global_col_size()/2; int nb = DM2D[0]->get_paraV()->get_block_size()/2; +#ifdef __MPI int blacs_ctxt = DM2D[0]->get_paraV()->blacs_ctxt; - std::vector iat2iwt(ucell->nat); for (int iat = 0; iat < ucell->nat; iat++) { iat2iwt[iat] = ucell->get_iat2iwt()[iat]/2; @@ -266,6 +266,7 @@ void Gint::transfer_DM2DtoGrid(std::vector*> DM2D) { auto ijr_info = DM2D[0]->get_ijr_info(); this-> DM2D_tmp = new hamilt::HContainer(pv, nullptr, &ijr_info); this-> DM2D_tmp->set_zero(); +#endif ModuleBase::Memory::record("Gint::DM2D_tmp", this->DM2D_tmp->get_memory_size()); for (int is = 0; is < 4; is++){ for (int iap = 0; iap < DM2D[0]->size_atom_pairs(); ++iap) { @@ -285,11 +286,14 @@ void Gint::transfer_DM2DtoGrid(std::vector*> DM2D) { } } } +#ifdef __MPI hamilt::transferParallels2Serials( *(this->DM2D_tmp), this->DMRGint[is]); - } #else - //this->DMRGint_full = DM2D[0]; + this->DMRGint[is]->set_zero(); + this->DMRGint[is]->add(*(this->DM2D_tmp)); #endif + } + } ModuleBase::timer::tick("Gint", "transfer_DMR"); } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp index 9a70c8e3f1..c93effbf25 100644 --- a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp +++ b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp @@ -78,10 +78,12 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, ModuleBase::TITLE("Gint_k", "transfer_pvpR"); ModuleBase::timer::tick("Gint_k", "transfer_pvpR"); + auto ijr_info = hR->get_ijr_info(); + +#ifdef __MPI int mg = hR->get_paraV()->get_global_row_size()/2; int ng = hR->get_paraV()->get_global_col_size()/2; int nb = hR->get_paraV()->get_block_size()/2; -#ifdef __MPI int blacs_ctxt = hR->get_paraV()->blacs_ctxt; std::vector iat2iwt(ucell_in->nat); for (int iat = 0; iat < ucell_in->nat; iat++) { @@ -90,9 +92,9 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, Parallel_Orbitals *pv = new Parallel_Orbitals(); pv->set(mg, ng, nb, blacs_ctxt); pv->set_atomic_trace(iat2iwt.data(), ucell_in->nat, mg); - auto ijr_info = hR->get_ijr_info(); - this->hR_tmp = new hamilt::HContainer>(pv, nullptr, &ijr_info); +#endif + ModuleBase::Memory::record("Gint::hRGintCd", this->hR_tmp->get_memory_size()); //select hRGint_tmp @@ -155,8 +157,12 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, } } } +#ifdef __MPI // transfer hRGint_tmpCd to parallel hR_tmp hamilt::transferSerials2Parallels( *hRGint_tmpCd, this->hR_tmp); +#else + this->hR_tmp = hRGint_tmpCd; +#endif // merge hR_tmp to hR for (int iap = 0; iap < hR->size_atom_pairs(); iap++) { @@ -182,10 +188,6 @@ void Gint_k::transfer_pvpR(hamilt::HContainer>* hR, } delete hRGint_tmpCd; } - delete[] iat2iwt; -#else - -#endif ModuleBase::timer::tick("Gint_k", "transfer_pvpR"); return; } From e04dce324b3e24229c8747bc53aadb69ab0a0b46 Mon Sep 17 00:00:00 2001 From: hn <3022939753@qq.com> Date: Tue, 16 Sep 2025 12:14:04 +0800 Subject: [PATCH 7/9] fix bug --- source/module_hamilt_lcao/module_gint/gint.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp index 4279ab5c30..e59839dbf6 100644 --- a/source/module_hamilt_lcao/module_gint/gint.cpp +++ b/source/module_hamilt_lcao/module_gint/gint.cpp @@ -254,6 +254,7 @@ void Gint::transfer_DM2DtoGrid(std::vector*> DM2D) { int mg = DM2D[0]->get_paraV()->get_global_row_size()/2; int ng = DM2D[0]->get_paraV()->get_global_col_size()/2; int nb = DM2D[0]->get_paraV()->get_block_size()/2; + auto ijr_info = DM2D[0]->get_ijr_info(); #ifdef __MPI int blacs_ctxt = DM2D[0]->get_paraV()->blacs_ctxt; std::vector iat2iwt(ucell->nat); @@ -263,9 +264,15 @@ void Gint::transfer_DM2DtoGrid(std::vector*> DM2D) { Parallel_Orbitals *pv = new Parallel_Orbitals(); pv->set(mg, ng, nb, blacs_ctxt); pv->set_atomic_trace(iat2iwt.data(), ucell->nat, mg); - auto ijr_info = DM2D[0]->get_ijr_info(); this-> DM2D_tmp = new hamilt::HContainer(pv, nullptr, &ijr_info); this-> DM2D_tmp->set_zero(); +#else + if (this->DM2D_tmp != nullptr) { + delete this->DM2D_tmp; + } + this-> DM2D_tmp = new hamilt::HContainer(*this->hRGint); + this-> DM2D_tmp -> insert_ijrs(this->gridt->get_ijr_info(), *(this->ucell)); + this-> DM2D_tmp -> allocate(nullptr, true); #endif ModuleBase::Memory::record("Gint::DM2D_tmp", this->DM2D_tmp->get_memory_size()); for (int is = 0; is < 4; is++){ @@ -275,7 +282,7 @@ void Gint::transfer_DM2DtoGrid(std::vector*> DM2D) { int iat2 = ap.get_atom_j(); for (int ir = 0; ir < ap.get_R_size(); ++ir) { const ModuleBase::Vector3 r_index = ap.get_R_index(ir); - double* matrix_out = DM2D_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer(); + double* matrix_out = this-> DM2D_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer(); double* matrix_in = ap.get_pointer(ir); for (int irow = 0; irow < ap.get_row_size()/2; irow ++) { for (int icol = 0; icol < ap.get_col_size()/2; icol ++) { From 4350e3d5682b81fafedfd2922cd8e4f221cd3990 Mon Sep 17 00:00:00 2001 From: dzzz2001 <153698752+dzzz2001@users.noreply.github.com> Date: Sat, 19 Jul 2025 15:17:42 +0800 Subject: [PATCH 8/9] Perf: support GPU version of cal_force_cc with LCAO basis (#6392) * support GPU version of cal_force_cc with LCAO basis * fix a bug --- .../hamilt_lcaodft/FORCE_STRESS.cpp | 51 ++++++++++++------- .../hamilt_lcaodft/FORCE_STRESS.h | 1 - 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp index 532bf7e561..3454a65963 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp @@ -22,7 +22,7 @@ #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h" template -Force_Stress_LCAO::Force_Stress_LCAO(Record_adj& ra, const int nat_in) : RA(&ra), f_pw(nat_in), nat(nat_in) +Force_Stress_LCAO::Force_Stress_LCAO(Record_adj& ra, const int nat_in) : RA(&ra), nat(nat_in) { } template @@ -861,24 +861,39 @@ void Force_Stress_LCAO::calForcePwPart(UnitCell& ucell, const Structure_Factor& sf) { ModuleBase::TITLE("Force_Stress_LCAO", "calForcePwPart"); - //-------------------------------------------------------- - // local pseudopotential force: - // use charge density; plane wave; local pseudopotential; - //-------------------------------------------------------- - f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr); - //-------------------------------------------------------- - // ewald force: use plane wave only. - //-------------------------------------------------------- - f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Forces f_pw(nat); + + //-------------------------------------------------------- + // local pseudopotential force: + // use charge density; plane wave; local pseudopotential; + //-------------------------------------------------------- + f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr); + //-------------------------------------------------------- + // ewald force: use plane wave only. + //-------------------------------------------------------- + f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem + + //-------------------------------------------------------- + // force due to core correlation. + //-------------------------------------------------------- + f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell); + //-------------------------------------------------------- + // force due to self-consistent charge. + //-------------------------------------------------------- + f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell); + } else +#endif + { + Forces f_pw(nat); + f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr); + f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem + f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell); + f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell); + } - //-------------------------------------------------------- - // force due to core correlation. - //-------------------------------------------------------- - f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell); - //-------------------------------------------------------- - // force due to self-consistent charge. - //-------------------------------------------------------- - f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell); return; } diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h index a11add920e..b55caa7935 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h +++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h @@ -60,7 +60,6 @@ class Force_Stress_LCAO Record_adj* RA; Force_LCAO flk; Stress_Func sc_pw; - Forces f_pw; void forceSymmetry(const UnitCell& ucell, ModuleBase::matrix& fcs, From 639f19d9095d4a1bcd87fa019418c61fa71f7bd3 Mon Sep 17 00:00:00 2001 From: hn <3022939753@qq.com> Date: Fri, 10 Oct 2025 21:11:28 +0800 Subject: [PATCH 9/9] Apply the changes related to force calculation from the develop branch, fixing the memory error bug that occurs during direct cherry-pick. --- .../module_hamilt_pw/hamilt_pwdft/forces.cpp | 689 ++++++------------ source/module_hamilt_pw/hamilt_pwdft/forces.h | 16 +- .../hamilt_pwdft/forces_cc.cpp | 4 +- .../hamilt_pwdft/forces_scc.cpp | 4 +- .../hamilt_pwdft/forces_us.cpp | 2 +- .../hamilt_pwdft/kernels/cuda/force_op.cu | 427 ++++++----- .../hamilt_pwdft/kernels/force_op.cpp | 108 +-- .../hamilt_pwdft/kernels/force_op.h | 141 ++-- 8 files changed, 534 insertions(+), 857 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index a4d779366b..ecdf746019 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -20,15 +20,13 @@ #ifdef _OPENMP #include #endif -#ifdef USE_PAW -#include "module_cell/module_paw/paw_cell.h" -#endif + template void Forces::cal_force(UnitCell& ucell, ModuleBase::matrix& force, const elecstate::ElecState& elec, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* const rho_basis, ModuleSymmetry::Symmetry* p_symm, Structure_Factor* p_sf, surchem& solvent, @@ -55,15 +53,7 @@ void Forces::cal_force(UnitCell& ucell, ModuleBase::matrix forceonsite(nat, 3); // Force due to local ionic potential - // For PAW, calculated together in paw_cell.calculate_force - if (!PARAM.inp.use_paw) - { - this->cal_force_loc(ucell,forcelc, rho_basis, locpp->vloc, chr); - } - else - { - forcelc.zero_out(); - } + this->cal_force_loc(ucell,forcelc, rho_basis, locpp->vloc, chr); // Ewald this->cal_force_ew(ucell,forceion, rho_basis, p_sf); @@ -71,94 +61,15 @@ void Forces::cal_force(UnitCell& ucell, // Force due to nonlocal part of pseudopotential if (wfc_basis != nullptr) { - if (!PARAM.inp.use_paw) - { - this->npwx = wfc_basis->npwk_max; - Forces::cal_force_nl(forcenl, wg, ekb, pkv, wfc_basis, p_sf, *p_nlpp, ucell, psi_in); - if (PARAM.globalv.use_uspp) - { - this->cal_force_us(forcenl, rho_basis, *p_nlpp, elec, ucell); - } - } - else - { -#ifdef USE_PAW - for (int ik = 0; ik < wfc_basis->nks; ik++) - { - const int npw = wfc_basis->npwk[ik]; - ModuleBase::Vector3* _gk = new ModuleBase::Vector3[npw]; - for (int ig = 0; ig < npw; ig++) - { - _gk[ig] = wfc_basis->getgpluskcar(ik, ig); - } + this->npwx = wfc_basis->npwk_max; + Forces::cal_force_nl(forcenl, wg, ekb, pkv, wfc_basis, p_sf, *p_nlpp, ucell, psi_in); - double* kpt; - kpt = new double[3]; - kpt[0] = wfc_basis->kvec_c[ik].x; - kpt[1] = wfc_basis->kvec_c[ik].y; - kpt[2] = wfc_basis->kvec_c[ik].z; - - double** kpg; - double** gcar; - kpg = new double*[npw]; - gcar = new double*[npw]; - for (int ipw = 0; ipw < npw; ipw++) - { - kpg[ipw] = new double[3]; - kpg[ipw][0] = _gk[ipw].x; - kpg[ipw][1] = _gk[ipw].y; - kpg[ipw][2] = _gk[ipw].z; - - gcar[ipw] = new double[3]; - gcar[ipw][0] = wfc_basis->getgcar(ik, ipw).x; - gcar[ipw][1] = wfc_basis->getgcar(ik, ipw).y; - gcar[ipw][2] = wfc_basis->getgcar(ik, ipw).z; - } - - GlobalC::paw_cell.set_paw_k(npw, - wfc_basis->npwk_max, - kpt, - wfc_basis->get_ig2ix(ik).data(), - wfc_basis->get_ig2iy(ik).data(), - wfc_basis->get_ig2iz(ik).data(), - (const double**)kpg, - ucell.tpiba, - (const double**)gcar); - - delete[] kpt; - for (int ipw = 0; ipw < npw; ipw++) - { - delete[] kpg[ipw]; - delete[] gcar[ipw]; - } - delete[] kpg; - delete[] gcar; - - GlobalC::paw_cell.get_vkb(); - - GlobalC::paw_cell.set_currentk(ik); - - psi_in[0].fix_k(ik); - double *weight, *epsilon; - weight = new double[PARAM.inp.nbands]; - epsilon = new double[PARAM.inp.nbands]; - for (int ib = 0; ib < PARAM.inp.nbands; ib++) - { - weight[ib] = wg(ik, ib); - epsilon[ib] = ekb(ik, ib); - } - GlobalC::paw_cell.paw_nl_force(reinterpret_cast*>(psi_in[0].get_pointer()), - epsilon, - weight, - PARAM.inp.nbands, - forcenl.c); - - delete[] weight; - delete[] epsilon; - } -#endif + if (PARAM.globalv.use_uspp) + { + this->cal_force_us(forcenl, rho_basis, *p_nlpp, elec, ucell); } + // DFT+U and DeltaSpin if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch) { @@ -167,26 +78,10 @@ void Forces::cal_force(UnitCell& ucell, } // non-linear core correction - // not relevant for PAW - if (!PARAM.inp.use_paw) - { - Forces::cal_force_cc(forcecc, rho_basis, chr, locpp->numeric, ucell); - } - else - { - forcecc.zero_out(); - } + Forces::cal_force_cc(forcecc, rho_basis, chr, locpp->numeric, ucell); // force due to core charge - // For PAW, calculated together in paw_cell.calculate_force - if (!PARAM.inp.use_paw) - { - this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, locpp->numeric, ucell); - } - else - { - forcescc.zero_out(); - } + this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, locpp->numeric, ucell); ModuleBase::matrix stress_vdw_pw; //.create(3,3); ModuleBase::matrix force_vdw; @@ -240,52 +135,6 @@ void Forces::cal_force(UnitCell& ucell, } } -#ifdef USE_PAW - if (PARAM.inp.use_paw) - { - double* force_paw; - double* rhor; - rhor = new double[rho_basis->nrxx]; - for (int ir = 0; ir < rho_basis->nrxx; ir++) - { - rhor[ir] = 0.0; - } - for (int is = 0; is < PARAM.inp.nspin; is++) - { - for (int ir = 0; ir < rho_basis->nrxx; ir++) - { - rhor[ir] += chr->rho[is][ir] + chr->nhat[is][ir]; - } - } - - force_paw = new double[3 * this->nat]; - ModuleBase::matrix v_xc, v_effective; - v_effective.create(PARAM.inp.nspin, rho_basis->nrxx); - v_effective.zero_out(); - elec.pot->update_from_charge(elec.charge, &ucell); - v_effective = elec.pot->get_effective_v(); - - v_xc.create(PARAM.inp.nspin, rho_basis->nrxx); - v_xc.zero_out(); - const std::tuple etxc_vtxc_v - = XC_Functional::v_xc(rho_basis->nrxx, elec.charge, &ucell); - v_xc = std::get<2>(etxc_vtxc_v); - - GlobalC::paw_cell.calculate_force(v_effective.c, v_xc.c, rhor, force_paw); - - for (int iat = 0; iat < this->nat; iat++) - { - // Ha to Ry - forcepaw(iat, 0) = force_paw[3 * iat] * 2.0; - forcepaw(iat, 1) = force_paw[3 * iat + 1] * 2.0; - forcepaw(iat, 2) = force_paw[3 * iat + 2] * 2.0; - } - - delete[] force_paw; - delete[] rhor; - } -#endif - // impose total force = 0 int iat = 0; for (int ipol = 0; ipol < 3; ipol++) @@ -300,11 +149,6 @@ void Forces::cal_force(UnitCell& ucell, force(iat, ipol) = forcelc(iat, ipol) + forceion(iat, ipol) + forcenl(iat, ipol) + forcecc(iat, ipol) + forcescc(iat, ipol); - if (PARAM.inp.use_paw) - { - force(iat, ipol) += forcepaw(iat, ipol); - } - if (vdw_solver != nullptr) // linpz and jiyy added vdw force, modified by zhengdy { force(iat, ipol) += force_vdw(iat, ipol); @@ -442,14 +286,7 @@ void Forces::cal_force(UnitCell& ucell, ModuleIO::print_force(GlobalV::ofs_running, ucell, "NLCC FORCE (eV/Angstrom)", forcecc, false); ModuleIO::print_force(GlobalV::ofs_running, ucell, "ION FORCE (eV/Angstrom)", forceion, false); ModuleIO::print_force(GlobalV::ofs_running, ucell, "SCC FORCE (eV/Angstrom)", forcescc, false); - if (PARAM.inp.use_paw) - { - ModuleIO::print_force(GlobalV::ofs_running, - ucell, - "PAW FORCE (eV/Angstrom)", - forcepaw, - false); - } + if (PARAM.inp.efield_flag) { ModuleIO::print_force(GlobalV::ofs_running, ucell, "EFIELD FORCE (eV/Angstrom)", force_e, false); @@ -487,13 +324,13 @@ void Forces::cal_force(UnitCell& ucell, template void Forces::cal_force_loc(const UnitCell& ucell, ModuleBase::matrix& forcelc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* const rho_basis, const ModuleBase::matrix& vloc, const Charge* const chr) { ModuleBase::TITLE("Forces", "cal_force_loc"); ModuleBase::timer::tick("Forces", "cal_force_loc"); - + this->device = base_device::get_device_type(this->ctx); std::complex* aux = new std::complex[rho_basis->nmaxgr]; // now, in all pools , the charge are the same, // so, the force calculated by each pool is equal. @@ -532,110 +369,100 @@ void Forces::cal_force_loc(const UnitCell& ucell, // to G space. maybe need fftw with OpenMP rho_basis->real2recip(aux, aux); - // sincos op for G space - - - // data preparation - std::vector tau_flat(this->nat * 3); - std::vector gcar_flat(rho_basis->npw * 3); - - - for (int iat = 0; iat < this->nat; iat++) { - int it = ucell.iat2it[iat]; - int ia = ucell.iat2ia[iat]; - - tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); - tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); - tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); - } - - for (int ig = 0; ig < rho_basis->npw; ig++) { - gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); - gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); - gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); - } - - // calculate vloc_factors for all atom types - std::vector vloc_per_type_host(this->nat * rho_basis->npw); - for (int iat = 0; iat < this->nat; iat++) { - int it = ucell.iat2it[iat]; - for (int ig = 0; ig < rho_basis->npw; ig++) { - vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); - } - } - - std::vector> aux_fptype(rho_basis->npw); - for (int ig = 0; ig < rho_basis->npw; ig++) { - aux_fptype[ig] = static_cast>(aux[ig]); - } - - FPTYPE* d_gcar = gcar_flat.data(); - FPTYPE* d_tau = tau_flat.data(); - FPTYPE* d_vloc_per_type = vloc_per_type_host.data(); - std::complex* d_aux = aux_fptype.data(); - FPTYPE* d_force = nullptr; - std::vector force_host(this->nat * 3); - - if (this->device == base_device::GpuDevice) - { - d_gcar = nullptr; - d_tau = nullptr; - d_vloc_per_type = nullptr; - d_aux = nullptr; - - resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); - resmem_var_op()(this->ctx, d_tau, this->nat * 3); - resmem_var_op()(this->ctx, d_vloc_per_type, this->nat * rho_basis->npw); - resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); - resmem_var_op()(this->ctx, d_force, this->nat * 3); - - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), this->nat * rho_basis->npw); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); - - base_device::memory::set_memory_op()(this->ctx, d_force, 0.0, this->nat * 3); - } - else + if(this->device == base_device::GpuDevice) { - d_force = force_host.data(); - std::fill(force_host.begin(), force_host.end(), static_cast(0.0)); - } - - const FPTYPE scale_factor = static_cast(ucell.tpiba * ucell.omega); - - // call op for sincos calculation - hamilt::cal_force_loc_sincos_op()( - this->ctx, - this->nat, - rho_basis->npw, - this->nat, - d_gcar, - d_tau, - d_vloc_per_type, - d_aux, - scale_factor, - d_force - ); - - if (this->device == base_device::GpuDevice) - { - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3); - - delmem_var_op()(this->ctx, d_gcar); - delmem_var_op()(this->ctx, d_tau); - delmem_var_op()(this->ctx, d_vloc_per_type); - delmem_complex_op()(this->ctx, d_aux); - delmem_var_op()(this->ctx, d_force); - } - - for (int iat = 0; iat < this->nat; iat++) { - forcelc(iat, 0) = static_cast(force_host[iat * 3 + 0]); - forcelc(iat, 1) = static_cast(force_host[iat * 3 + 1]); - forcelc(iat, 2) = static_cast(force_host[iat * 3 + 2]); - } + std::vector tau_h; + std::vector gcar_h; + tau_h.resize(this->nat * 3); + for(int iat = 0; iat < this->nat; ++iat) + { + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; + tau_h[iat * 3] = ucell.atoms[it].tau[ia].x; + tau_h[iat * 3 + 1] = ucell.atoms[it].tau[ia].y; + tau_h[iat * 3 + 2] = ucell.atoms[it].tau[ia].z; + } - // this->print(GlobalV: :ofs_running, "local forces", forcelc); + gcar_h.resize(rho_basis->npw * 3); + for(int ig = 0; ig < rho_basis->npw; ++ig) + { + gcar_h[ig * 3] = rho_basis->gcar[ig].x; + gcar_h[ig * 3 + 1] = rho_basis->gcar[ig].y; + gcar_h[ig * 3 + 2] = rho_basis->gcar[ig].z; + } + + int* iat2it_d = nullptr; + int* ig2gg_d = nullptr; + double* gcar_d = nullptr; + double* tau_d = nullptr; + std::complex* aux_d = nullptr; + double* forcelc_d = nullptr; + double* vloc_d = nullptr; + + resmem_int_op()(this->ctx,iat2it_d, this->nat); + resmem_int_op()(this->ctx,ig2gg_d, rho_basis->npw); + resmem_var_op()(this->ctx,gcar_d, rho_basis->npw * 3); + resmem_var_op()(this->ctx,tau_d, this->nat * 3); + resmem_complex_op()(this->ctx,aux_d, rho_basis->npw); + resmem_var_op()(this->ctx,forcelc_d, this->nat * 3); + resmem_var_op()(this->ctx,vloc_d, vloc.nr * vloc.nc); + + syncmem_int_h2d_op()( this->ctx, this->cpu_ctx,iat2it_d, ucell.iat2it, this->nat); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, ig2gg_d, rho_basis->ig2igg, rho_basis->npw); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar_d, gcar_h.data(), rho_basis->npw * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, tau_d, tau_h.data(), this->nat * 3); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, aux_d, aux, rho_basis->npw); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, forcelc_d, forcelc.c, this->nat * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, vloc_d, vloc.c, vloc.nr * vloc.nc); + + hamilt::cal_force_loc_op()( + this->nat, + rho_basis->npw, + ucell.tpiba * ucell.omega, + iat2it_d, + ig2gg_d, + gcar_d, + tau_d, + aux_d, + vloc_d, + vloc.nc, + forcelc_d); + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcelc.c, forcelc_d, this->nat * 3); + + delmem_int_op()(this->ctx,iat2it_d); + delmem_int_op()(this->ctx,ig2gg_d); + delmem_var_op()(this->ctx,gcar_d); + delmem_var_op()(this->ctx,tau_d); + delmem_complex_op()(this->ctx,aux_d); + delmem_var_op()(this->ctx,forcelc_d); + delmem_var_op()(this->ctx,vloc_d); + } + else{ // calculate forces on CPU + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int iat = 0; iat < this->nat; ++iat) + { + // read `it` `ia` from the table + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; + for (int ig = 0; ig < rho_basis->npw; ig++) + { + const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]); + double sinp, cosp; + ModuleBase::libm::sincos(phase, &sinp, &cosp); + const double factor + = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor; + forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor; + forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor; + } + forcelc(iat, 0) *= (ucell.tpiba * ucell.omega); + forcelc(iat, 1) *= (ucell.tpiba * ucell.omega); + forcelc(iat, 2) *= (ucell.tpiba * ucell.omega); + } + } + // this->print(GlobalV::ofs_running, "local forces", forcelc); Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc); delete[] aux; ModuleBase::timer::tick("Forces", "cal_force_loc"); @@ -645,14 +472,14 @@ void Forces::cal_force_loc(const UnitCell& ucell, template void Forces::cal_force_ew(const UnitCell& ucell, ModuleBase::matrix& forceion, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* const rho_basis, const Structure_Factor* p_sf) { ModuleBase::TITLE("Forces", "cal_force_ew"); ModuleBase::timer::tick("Forces", "cal_force_ew"); - + this->device = base_device::get_device_type(this->ctx); double fact = 2.0; - std::complex* aux = new std::complex[rho_basis->npw]; + std::vector> aux(rho_basis->npw); /* blocking rho_basis->nrxnpwx for data locality. @@ -662,9 +489,7 @@ void Forces::cal_force_ew(const UnitCell& ucell, performance will be better when number of atom is quite huge */ const int block_ig = 1024; -#ifdef _OPENMP #pragma omp parallel for -#endif for (int igb = 0; igb < rho_basis->npw; igb += block_ig) { // calculate the actual task length of this block @@ -679,13 +504,6 @@ void Forces::cal_force_ew(const UnitCell& ucell, if (ucell.atoms[it].na != 0) { double dzv; - if (PARAM.inp.use_paw) - { - #ifdef USE_PAW - dzv = GlobalC::paw_cell.get_val(it); - #endif - } - else { dzv = ucell.atoms[it].ncpp.zv; } @@ -702,13 +520,6 @@ void Forces::cal_force_ew(const UnitCell& ucell, double charge = 0.0; for (int it = 0; it < ucell.ntype; it++) { - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - charge += ucell.atoms[it].na * GlobalC::paw_cell.get_val(it); -#endif - } - else { charge += ucell.atoms[it].na * ucell.atoms[it].ncpp.zv; // mohan modify 2007-11-7 } @@ -729,14 +540,16 @@ void Forces::cal_force_ew(const UnitCell& ucell, upperbound = 2.0 * charge * charge * sqrt(2.0 * alpha / ModuleBase::TWO_PI) * erfc(sqrt(ucell.tpiba2 * rho_basis->ggecut / 4.0 / alpha)); } while (upperbound > 1.0e-6); - -#ifdef _OPENMP + const int ig0 = rho_basis->ig_gge0; #pragma omp parallel for -#endif for (int ig = 0; ig < rho_basis->npw; ig++) { + if (ig== ig0) + { + continue; // skip G=0 + } aux[ig] *= ModuleBase::libm::exp(-1.0 * rho_basis->gg[ig] * ucell.tpiba2 / alpha / 4.0) - / (rho_basis->gg[ig] * ucell.tpiba2); + / (rho_basis->gg[ig] * ucell.tpiba2); } // set pos rho_basis->ig_gge0 to zero @@ -744,144 +557,99 @@ void Forces::cal_force_ew(const UnitCell& ucell, { aux[rho_basis->ig_gge0] = std::complex(0.0, 0.0); } - - // sincos op for cal_force_ew - - std::vector it_facts_host(this->nat); - std::vector tau_flat(this->nat * 3); - - // iterate over by lookup table - for (int iat = 0; iat < this->nat; iat++) { - int it = ucell.iat2it[iat]; - int ia = ucell.iat2ia[iat]; - - double zv; - if (PARAM.inp.use_paw) + if(this->device == base_device::GpuDevice) + { + std::vector tau_h(this->nat * 3); + std::vector gcar_h(rho_basis->npw * 3); + for(int iat = 0; iat < this->nat; ++iat) { -#ifdef USE_PAW - zv = GlobalC::paw_cell.get_val(it); -#endif + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; + tau_h[iat * 3] = ucell.atoms[it].tau[ia].x; + tau_h[iat * 3 + 1] = ucell.atoms[it].tau[ia].y; + tau_h[iat * 3 + 2] = ucell.atoms[it].tau[ia].z; } - else + for(int ig = 0; ig < rho_basis->npw; ++ig) { - zv = ucell.atoms[it].ncpp.zv; + gcar_h[ig * 3] = rho_basis->gcar[ig].x; + gcar_h[ig * 3 + 1] = rho_basis->gcar[ig].y; + gcar_h[ig * 3 + 2] = rho_basis->gcar[ig].z; } + std::vector it_fact_h(ucell.ntype); + for(int it = 0; it < ucell.ntype; ++it) + { + it_fact_h[it] = ucell.atoms[it].ncpp.zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact; + } + + int* iat2it_d = nullptr; + double* gcar_d = nullptr; + double* tau_d = nullptr; + double* it_fact_d = nullptr; + std::complex* aux_d = nullptr; + double* forceion_d = nullptr; + resmem_int_op()(this->ctx, iat2it_d, this->nat); + resmem_var_op()(this->ctx, gcar_d, rho_basis->npw * 3); + resmem_var_op()(this->ctx, tau_d, this->nat * 3); + resmem_var_op()(this->ctx, it_fact_d, ucell.ntype); + resmem_complex_op()(this->ctx, aux_d, rho_basis->npw); + resmem_var_op()(this->ctx, forceion_d, this->nat * 3); + + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, iat2it_d, ucell.iat2it, this->nat); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar_d, gcar_h.data(), rho_basis->npw * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, tau_d, tau_h.data(), this->nat * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, it_fact_d, it_fact_h.data(), ucell.ntype); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, aux_d, aux.data(), rho_basis->npw); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, forceion_d, forceion.c, this->nat * 3); + + hamilt::cal_force_ew_op()( + this->nat, + rho_basis->npw, + rho_basis->ig_gge0, + iat2it_d, + gcar_d, + tau_d, + it_fact_d, + aux_d, + forceion_d); - it_facts_host[iat] = static_cast(zv * ModuleBase::e2 * ucell.tpiba * - ModuleBase::TWO_PI / ucell.omega * fact); - - tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); - tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); - tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); - } - - std::vector gcar_flat(rho_basis->npw * 3); - for (int ig = 0; ig < rho_basis->npw; ig++) { - gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); - gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); - gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); - } - - std::vector> aux_fptype(rho_basis->npw); - for (int ig = 0; ig < rho_basis->npw; ig++) { - aux_fptype[ig] = static_cast>(aux[ig]); - } - - FPTYPE* d_gcar = gcar_flat.data(); - FPTYPE* d_tau = tau_flat.data(); - FPTYPE* d_it_facts = it_facts_host.data(); - std::complex* d_aux = aux_fptype.data(); - FPTYPE* d_force_g = nullptr; - std::vector force_g_host(this->nat * 3); - - if (this->device == base_device::GpuDevice) - { - d_gcar = nullptr; - d_tau = nullptr; - d_it_facts = nullptr; - d_aux = nullptr; - - resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); - resmem_var_op()(this->ctx, d_tau, this->nat * 3); - resmem_var_op()(this->ctx, d_it_facts, this->nat); - resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); - resmem_var_op()(this->ctx, d_force_g, this->nat * 3); - - - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); - - - base_device::memory::set_memory_op()(this->ctx, d_force_g, 0.0, this->nat * 3); - } - else - { - d_force_g = force_g_host.data(); - std::fill(force_g_host.begin(), force_g_host.end(), static_cast(0.0)); - } - - // call op for sincos calculation - hamilt::cal_force_ew_sincos_op()( - this->ctx, - this->nat, - rho_basis->npw, - rho_basis->ig_gge0, - d_gcar, - d_tau, - d_it_facts, - d_aux, - d_force_g - ); - - - if (this->device == base_device::GpuDevice) + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forceion.c, forceion_d, this->nat * 3); + delmem_int_op()(this->ctx,iat2it_d); + delmem_var_op()(this->ctx,gcar_d); + delmem_var_op()(this->ctx,tau_d); + delmem_var_op()(this->ctx,it_fact_d); + delmem_complex_op()(this->ctx,aux_d); + delmem_var_op()(this->ctx,forceion_d); + } else // calculate forces on CPU { - - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3); - - - delmem_var_op()(this->ctx, d_gcar); - delmem_var_op()(this->ctx, d_tau); - delmem_var_op()(this->ctx, d_it_facts); - delmem_complex_op()(this->ctx, d_aux); - delmem_var_op()(this->ctx, d_force_g); - } - + #pragma omp parallel for + for(int iat = 0; iat < this->nat; ++iat) + { + const int it = ucell.iat2it[iat]; + const int ia = ucell.iat2ia[iat]; + double it_fact = ucell.atoms[it].ncpp.zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact; - for (int iat = 0; iat < this->nat; iat++) { - forceion(iat, 0) += static_cast(force_g_host[iat * 3 + 0]); - forceion(iat, 1) += static_cast(force_g_host[iat * 3 + 1]); - forceion(iat, 2) += static_cast(force_g_host[iat * 3 + 2]); + for(int ig = 0; ig < rho_basis->npw; ++ig) + { + if(ig != rho_basis->ig_gge0) // skip G=0 + { + const ModuleBase::Vector3 gcar = rho_basis->gcar[ig]; + const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]); + double sinp, cosp; + ModuleBase::libm::sincos(arg, &sinp, &cosp); + double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); + forceion(iat, 0) += gcar[0] * sumnb; + forceion(iat, 1) += gcar[1] * sumnb; + forceion(iat, 2) += gcar[2] * sumnb; + } + } + forceion(iat, 0) *= it_fact; + forceion(iat, 1) *= it_fact; + forceion(iat, 2) *= it_fact; + } } - - -// calculate real space force -#ifdef _OPENMP -#pragma omp parallel + // means that the processor contains G=0 term. + #pragma omp parallel { - int num_threads = omp_get_num_threads(); - int thread_id = omp_get_thread_num(); -#else - int num_threads = 1; - int thread_id = 0; -#endif - - /* Here is task distribution for multi-thread, - 0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`. - To avoid syncing, we must calculate work range of each thread by our self - 1. Calculate the iat range [iat_beg, iat_end) by each thread - a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat) - 2. each thread iterate atoms form `iat_beg` to `iat_end-1` - */ - int iat_beg, iat_end; - int it_beg, ia_beg; - ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end); - iat_end = iat_beg + iat_end; - ucell.iat2iait(iat_beg, &ia_beg, &it_beg); - - if (rho_basis->ig_gge0 >= 0) { double rmax = 5.0 / (sqrt(alpha) * ucell.lat0); @@ -890,84 +658,55 @@ void Forces::cal_force_ew(const UnitCell& ucell, // output of rgen: the number of vectors in the sphere const int mxr = 200; // the maximum number of R vectors included in r - ModuleBase::Vector3* r = new ModuleBase::Vector3[mxr]; - double* r2 = new double[mxr]; - ModuleBase::GlobalFunc::ZEROS(r2, mxr); - int* irr = new int[mxr]; - ModuleBase::GlobalFunc::ZEROS(irr, mxr); + std::vector> r(mxr); + std::vector r2(mxr); + std::vector irr(mxr); // the square modulus of R_j-tau_s-tau_s' - int iat1 = iat_beg; - int T1 = it_beg; - int I1 = ia_beg; const double sqa = sqrt(alpha); const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI); // iterating atoms. - // do not need to sync threads because task range of each thread is isolated - while (iat1 < iat_end) + #pragma omp for + for(int iat1 = 0; iat1 < this->nat; iat1++) { - int iat2 = 0; // mohan fix bug 2011-06-07 - int I2 = 0; - int T2 = 0; - while (iat2 < this->nat) + int T1 = ucell.iat2it[iat1]; + int I1 = ucell.iat2ia[iat1]; + for(int iat2 = 0; iat2 < this->nat; iat2++) { - if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0) + int T2 = ucell.iat2it[iat2]; + int I2 = ucell.iat2ia[iat2]; + if (iat1 != iat2) { ModuleBase::Vector3 d_tau = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2]; - H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm); + H_Ewald_pw::rgen(d_tau, rmax, irr.data(), ucell.latvec, ucell.G, r.data(), r2.data(), nrm); for (int n = 0; n < nrm; n++) { const double rr = sqrt(r2[n]) * ucell.lat0; double factor; - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2 - / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) - * ucell.lat0; -#endif - } - else { factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv - * ModuleBase::e2 / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) - * ucell.lat0; + * ModuleBase::e2 / (rr * rr) + * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) + * ucell.lat0; } - forceion(iat1, 0) -= factor * r[n].x; forceion(iat1, 1) -= factor * r[n].y; forceion(iat1, 2) -= factor * r[n].z; } } - ++iat2; - ucell.step_iait(&I2, &T2); } // atom b - ++iat1; - ucell.step_iait(&I1, &T1); } // atom a - - delete[] r; - delete[] r2; - delete[] irr; } -#ifdef _OPENMP } -#endif - Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc); - // this->print(GlobalV::ofs_running, "ewald forces", forceion); ModuleBase::timer::tick("Forces", "cal_force_ew"); - delete[] aux; - return; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.h b/source/module_hamilt_pw/hamilt_pwdft/forces.h index 695520dceb..370c6a2eb8 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.h +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.h @@ -37,7 +37,7 @@ class Forces void cal_force(UnitCell& ucell, ModuleBase::matrix& force, const elecstate::ElecState& elec, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, ModuleSymmetry::Symmetry* p_symm, Structure_Factor* p_sf, surchem& solvent, @@ -53,15 +53,15 @@ class Forces void cal_force_loc(const UnitCell& ucell, ModuleBase::matrix& forcelc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const ModuleBase::matrix& vloc, const Charge* const chr); void cal_force_ew(const UnitCell& ucell, ModuleBase::matrix& forceion, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const Structure_Factor* p_sf); void cal_force_cc(ModuleBase::matrix& forcecc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const Charge* const chr, const bool* numeric, UnitCell& ucell_in); @@ -96,13 +96,13 @@ class Forces const UnitCell& ucell_in, const psi::Psi, Device>* psi_in = nullptr); void cal_force_scc(ModuleBase::matrix& forcescc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const ModuleBase::matrix& v_current, const bool vnew_exist, const bool* numeric, const UnitCell& ucell_in); void cal_force_us(ModuleBase::matrix& forcenl, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const pseudopot_cell_vnl& ppcell_in, const elecstate::ElecState& elec, const UnitCell& ucell); @@ -113,7 +113,7 @@ class Forces const FPTYPE* rab, const FPTYPE* rhoc, FPTYPE* drhocg, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, int type, const UnitCell& ucell_in); // used in nonlinear core correction stress void deriv_drhoc_scc(const bool& numeric, @@ -122,7 +122,7 @@ class Forces const FPTYPE* rab, const FPTYPE* rhoc, FPTYPE* drhocg, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const UnitCell& ucell_in); // used in nonlinear core correction stress protected: Device* ctx = {}; diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp index 3346724deb..f3bc7469a3 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp @@ -31,7 +31,7 @@ template void Forces::cal_force_cc(ModuleBase::matrix& forcecc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const Charge* const chr, const bool* numeric, UnitCell& ucell_in) @@ -239,7 +239,7 @@ void Forces::deriv_drhoc const FPTYPE *rab, const FPTYPE *rhoc, FPTYPE *drhocg, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, int type, const UnitCell& ucell_in ) diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp index f670ad9b27..4b089d1d26 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp @@ -24,7 +24,7 @@ template void Forces::cal_force_scc(ModuleBase::matrix& forcescc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const ModuleBase::matrix& vnew, const bool vnew_exist, const bool* numeric, @@ -147,7 +147,7 @@ void Forces::deriv_drhoc_scc(const bool& numeric, const FPTYPE* rab, const FPTYPE* rhoc, FPTYPE* drhocg, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const UnitCell& ucell_in) { int igl0 = 0; double gx = 0; diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp index 4323ee6c4f..8879c89c35 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp @@ -15,7 +15,7 @@ // On output: the contribution is added to \(\text{forcenl}\). template void Forces::cal_force_us(ModuleBase::matrix& forcenl, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const pseudopot_cell_vnl& nlpp, const elecstate::ElecState& elec, const UnitCell& ucell) diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index 5ff3ddf2b1..b8bca46302 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -1,6 +1,7 @@ #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" -// #include "module_psi/kernels/device.h" +// #include "source_psi/kernels/device.h" #include "module_base/module_device/types.h" +#include "module_base/constants.h" #include @@ -10,124 +11,18 @@ #include #define THREADS_PER_BLOCK 256 +#define FULL_MASK 0xffffffff +#define WARP_SIZE 32 namespace hamilt { -// CUDA kernels for sincos loops template -__global__ void cal_force_loc_sincos_kernel( - const int nat, - const int npw, - const int ntype, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* vloc_per_type, - const thrust::complex* aux, - const FPTYPE scale_factor, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Calculate phase: 2π * (G · τ) - const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - - // Use CUDA intrinsic for sincos - FPTYPE sinp, cosp; - sincos(phase, &sinp, &cosp); - - // Calculate force factor - const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; - const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * factor; - local_force_y += gcar[ig * 3 + 1] * factor; - local_force_z += gcar[ig * 3 + 2] * factor; - } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); - atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); - atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); -} - -template -__global__ void cal_force_ew_sincos_kernel( - const int nat, - const int npw, - const int ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* it_facts, - const thrust::complex* aux, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Skip G=0 term - if (ig == ig_gge0) continue; - - // Calculate phase: 2π * (G · τ) - const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - - // Use CUDA intrinsic for sincos - FPTYPE sinp, cosp; - sincos(phase, &sinp, &cosp); - - // Calculate Ewald sum contribution (fixed sign error) - const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * factor; - local_force_y += gcar[ig * 3 + 1] * factor; - local_force_z += gcar[ig * 3 + 2] * factor; +__forceinline__ +__device__ +void warp_reduce(FPTYPE & val) { + for (int offset = 16; offset > 0; offset >>= 1) { + val += __shfl_down_sync(FULL_MASK, val, offset); } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x); - atomicAdd(&force[iat * 3 + 1], local_force_y); - atomicAdd(&force[iat * 3 + 2], local_force_z); } template @@ -304,65 +199,6 @@ void cal_force_nl_op::operator()(const base_dev cudaCheckOnDebug(); } -// GPU operators -template -void cal_force_loc_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ntype, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* vloc_per_type, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force) -{ - // Calculate optimal grid configuration for GPU load balancing - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_per_sm = 32; // Configurable per GPU architecture - const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - cal_force_loc_sincos_kernel<<>>( - nat, npw, ntype, gcar, tau, vloc_per_type, - reinterpret_cast*>(aux), - scale_factor, force - ); - - cudaCheckOnDebug(); -} - -template -void cal_force_ew_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force) -{ - // Calculate optimal grid configuration for GPU load balancing - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_per_sm = 32; // Configurable per GPU architecture - const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - cal_force_ew_sincos_kernel<<>>( - nat, npw, ig_gge0, gcar, tau, it_facts, - reinterpret_cast*>(aux), force - ); - - cudaCheckOnDebug(); -} - template __global__ void cal_force_nl( const int ntype, @@ -781,6 +617,242 @@ void revertVkbValues( cudaCheckOnDebug(); } +template +__global__ void force_loc_kernel( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2gg_d, + const FPTYPE* gcar_d, + const FPTYPE* tau_d, + const thrust::complex* aux_d, + const FPTYPE* vloc_d, + const int vloc_nc, + FPTYPE* forcelc_d) +{ + const int iat = blockIdx.x; + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + if (iat >= nat) return; + const int it = iat2it[iat]; // get the type of atom + + // Initialize force components + FPTYPE force_x = 0.0; + FPTYPE force_y = 0.0; + FPTYPE force_z = 0.0; + + const auto tau_x = tau_d[iat * 3 + 0]; + const auto tau_y = tau_d[iat * 3 + 1]; + const auto tau_z = tau_d[iat * 3 + 2]; + + // Process all plane waves in chunks of blockDim.x + for (int ig = tid; ig < npw; ig += blockDim.x) { + const auto gcar_x = gcar_d[ig * 3 + 0]; + const auto gcar_y = gcar_d[ig * 3 + 1]; + const auto gcar_z = gcar_d[ig * 3 + 2]; + + // Calculate phase factor + const FPTYPE phase = ModuleBase::TWO_PI * (gcar_x * tau_x + + gcar_y * tau_y + + gcar_z * tau_z); + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Get vloc value + const FPTYPE vloc_val = vloc_d[it * vloc_nc + ig2gg_d[ig]]; + + // Calculate factor + const auto aux_val = aux_d[ig]; + const FPTYPE factor = vloc_val * (cosp * aux_val.imag() + sinp * aux_val.real()); + + // Multiply by gcar components + force_x += gcar_x * factor; + force_y += gcar_y * factor; + force_z += gcar_z * factor; + } + + // Warp-level reduction + warp_reduce(force_x); + warp_reduce(force_y); + warp_reduce(force_z); + + // First thread in each warp writes to shared memory + __shared__ FPTYPE warp_sums_x[THREADS_PER_BLOCK / WARP_SIZE]; // 256 threads / 32 = 8 warps + __shared__ FPTYPE warp_sums_y[THREADS_PER_BLOCK / WARP_SIZE]; + __shared__ FPTYPE warp_sums_z[THREADS_PER_BLOCK / WARP_SIZE]; + + if (lane_id == 0) { + warp_sums_x[warp_id] = force_x; + warp_sums_y[warp_id] = force_y; + warp_sums_z[warp_id] = force_z; + } + + __syncthreads(); + + // Final reduction by first warp + if (warp_id == 0) { + FPTYPE final_x = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_x[lane_id] : 0.0; + FPTYPE final_y = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_y[lane_id] : 0.0; + FPTYPE final_z = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_z[lane_id] : 0.0; + + warp_reduce(final_x); + warp_reduce(final_y); + warp_reduce(final_z); + + if (lane_id == 0) { + forcelc_d[iat * 3 + 0] = final_x * tpiba_omega; + forcelc_d[iat * 3 + 1] = final_y * tpiba_omega; + forcelc_d[iat * 3 + 2] = final_z * tpiba_omega; + } + } +} + +template +__global__ void force_ew_kernel( + const int nat, + const int npw, + const int ig_gge0, + const int* iat2it, + const FPTYPE* gcar_d, + const FPTYPE* tau_d, + const FPTYPE* it_fact_d, + const thrust::complex* aux_d, + FPTYPE* forceion_d) +{ + const int iat = blockIdx.x; + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + if( iat >= nat) return; + const int it = iat2it[iat]; // get the type of atom + const FPTYPE it_fact_val = it_fact_d[it]; // Get it_fact value + + // Initialize force components + FPTYPE force_x = 0.0; + FPTYPE force_y = 0.0; + FPTYPE force_z = 0.0; + + const auto tau_x = tau_d[iat * 3 + 0]; + const auto tau_y = tau_d[iat * 3 + 1]; + const auto tau_z = tau_d[iat * 3 + 2]; + + for (int ig = tid; ig < npw; ig += blockDim.x) { + if(ig == ig_gge0) + { continue; } + const auto gcar_x = gcar_d[ig * 3 + 0]; + const auto gcar_y = gcar_d[ig * 3 + 1]; + const auto gcar_z = gcar_d[ig * 3 + 2]; + + // Calculate phase factor + const FPTYPE phase = ModuleBase::TWO_PI * (gcar_x * tau_x + + gcar_y * tau_y + + gcar_z * tau_z); + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate force contribution + const FPTYPE sumnb = -cosp * aux_d[ig].imag() + sinp * aux_d[ig].real(); + + // Multiply by gcar components + force_x += gcar_x * sumnb; + force_y += gcar_y * sumnb; + force_z += gcar_z * sumnb; + } + + // Warp-level reduction + warp_reduce(force_x); + warp_reduce(force_y); + warp_reduce(force_z); + + // First thread in each warp writes to shared memory + __shared__ FPTYPE warp_sums_x[THREADS_PER_BLOCK / WARP_SIZE]; // 256 threads / 32 = 8 warps + __shared__ FPTYPE warp_sums_y[THREADS_PER_BLOCK / WARP_SIZE]; + __shared__ FPTYPE warp_sums_z[THREADS_PER_BLOCK / WARP_SIZE]; + + if (lane_id == 0) { + warp_sums_x[warp_id] = force_x; + warp_sums_y[warp_id] = force_y; + warp_sums_z[warp_id] = force_z; + } + + __syncthreads(); + + // Final reduction by first warp + if (warp_id == 0) { + FPTYPE final_x = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_x[lane_id] : 0.0; + FPTYPE final_y = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_y[lane_id] : 0.0; + FPTYPE final_z = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_z[lane_id] : 0.0; + + warp_reduce(final_x); + warp_reduce(final_y); + warp_reduce(final_z); + + if (lane_id == 0) { + forceion_d[iat * 3 + 0] = final_x * it_fact_val; + forceion_d[iat * 3 + 1] = final_y * it_fact_val; + forceion_d[iat * 3 + 2] = final_z * it_fact_val; + } + } +} + +template +void cal_force_loc_op::operator()( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2igg, + const FPTYPE* gcar, + const FPTYPE* tau, + const std::complex* aux, + const FPTYPE* vloc, + const int vloc_nc, + FPTYPE* forcelc) +{ + force_loc_kernel + <<>>(nat, + npw, + tpiba_omega, + iat2it, + ig2igg, + gcar, + tau, + reinterpret_cast*>(aux), + vloc, + vloc_nc, + forcelc); // array of data + +} + +template +void cal_force_ew_op::operator()( + const int nat, + const int npw, + const int ig_gge0, + const int* iat2it, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_fact, + const std::complex* aux, + FPTYPE* forceion) +{ + force_ew_kernel + <<>>(nat, + npw, + ig_gge0, + iat2it, + gcar, + tau, + it_fact, + reinterpret_cast*>(aux), + forceion); // array of data +} + + // for revertVkbValues functions instantiation template void revertVkbValues(const int *gcar_zero_ptrs, std::complex *vkb_ptr, const std::complex *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx, const std::complex coeff); // for saveVkbValues functions instantiation @@ -788,12 +860,11 @@ template void saveVkbValues(const int *gcar_zero_ptrs, const std::comple template struct cal_vkb1_nl_op; template struct cal_force_nl_op; -template struct cal_force_loc_sincos_op; -template struct cal_force_ew_sincos_op; +template struct cal_force_loc_op; +template struct cal_force_ew_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; -template struct cal_force_loc_sincos_op; -template struct cal_force_ew_sincos_op; - +template struct cal_force_loc_op; +template struct cal_force_ew_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 109321d6c3..367415d76b 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -428,116 +428,10 @@ struct cal_force_nl_op } }; -// CPU implementation of local force sincos operator -template -struct cal_force_loc_sincos_op -{ - void operator()(const base_device::DEVICE_CPU* ctx, - const int& nat, - const int& npw, - const int& ntype, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* vloc_per_type, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force) - { - const FPTYPE TWO_PI = 2.0 * M_PI; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < nat; ++iat) - { - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - - FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - - for (int ig = 0; ig < npw; ig++) - { - const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - FPTYPE sinp, cosp; - ModuleBase::libm::sincos(phase, &sinp, &cosp); - - const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; - const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor; - - local_force[0] += gcar[ig * 3 + 0] * factor; - local_force[1] += gcar[ig * 3 + 1] * factor; - local_force[2] += gcar[ig * 3 + 2] * factor; - } - - force[iat * 3 + 0] = local_force[0]; - force[iat * 3 + 1] = local_force[1]; - force[iat * 3 + 2] = local_force[2]; - } - } -}; - -// CPU implementation of Ewald force sincos operator -template -struct cal_force_ew_sincos_op -{ - void operator()(const base_device::DEVICE_CPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force) - { - const FPTYPE TWO_PI = 2.0 * M_PI; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < nat; ++iat) - { - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; - - FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - - for (int ig = 0; ig < npw; ig++) - { - // Skip G=0 term - if (ig == ig_gge0) continue; - - const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - FPTYPE sinp, cosp; - ModuleBase::libm::sincos(phase, &sinp, &cosp); - - const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); - - local_force[0] += gcar[ig * 3 + 0] * factor; - local_force[1] += gcar[ig * 3 + 1] * factor; - local_force[2] += gcar[ig * 3 + 2] * factor; - } - - force[iat * 3 + 0] = local_force[0]; - force[iat * 3 + 1] = local_force[1]; - force[iat * 3 + 2] = local_force[2]; - } - } -}; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; -template struct cal_force_loc_sincos_op; -template struct cal_force_ew_sincos_op; + template struct cal_vkb1_nl_op; template struct cal_force_nl_op; -template struct cal_force_loc_sincos_op; -template struct cal_force_ew_sincos_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index acf490b278..ca2c6694cf 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -146,67 +146,39 @@ struct cal_force_nl_op const FPTYPE* lambda, const std::complex* becp, const std::complex* dbecp, - FPTYPE* force); + FPTYPE* force); }; template -struct cal_force_loc_sincos_op -{ - /// @brief Calculate local pseudopotential forces (sincos loop only) - /// - /// Input Parameters - /// @param ctx - which device this function runs on - /// @param nat - number of atoms - /// @param npw - number of plane waves - /// @param ntype - number of atom types - /// @param gcar - G-vector Cartesian coordinates [npw * 3] - /// @param tau - atomic positions [nat * 3] - /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw] - /// @param aux - charge density in G-space [npw] - /// @param scale_factor - tpiba * omega - /// - /// Output Parameters - /// @param force - output forces [nat * 3] - void operator()(const Device* ctx, - const int& nat, - const int& npw, - const int& ntype, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* vloc_per_type, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force); +struct cal_force_loc_op{ + void operator()( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2igg, + const FPTYPE* gcar, + const FPTYPE* tau, + const std::complex* aux, + const FPTYPE* vloc, + const int vloc_nr, + FPTYPE* forcelc) {}; }; template -struct cal_force_ew_sincos_op -{ - /// @brief Calculate Ewald forces (sincos loop only) - /// - /// Input Parameters - /// @param ctx - which device this function runs on - /// @param nat - number of atoms - /// @param npw - number of plane waves - /// @param ig_gge0 - index of G=0 vector (-1 if not present) - /// @param gcar - G-vector Cartesian coordinates [npw * 3] - /// @param tau - atomic positions [nat * 3] - /// @param it_facts - precomputed it_fact for each atom [nat] - /// @param aux - structure factor related array [npw] - /// - /// Output Parameters - /// @param force - output forces [nat * 3] - void operator()(const Device* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force); +struct cal_force_ew_op{ + void operator()( + const int nat, + const int npw, + const int ig_gge0, + const int* iat2it, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_fact, + const std::complex* aux, + FPTYPE* forceion + ) {}; }; - #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM template struct cal_vkb1_nl_op @@ -306,35 +278,6 @@ struct cal_force_nl_op FPTYPE* force); }; -template -struct cal_force_loc_sincos_op -{ - void operator()(const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ntype, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* vloc_per_type, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force); -}; - -template -struct cal_force_ew_sincos_op -{ - void operator()(const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force); -}; - /** * @brief revert the vkb values for force_nl calculation */ @@ -362,6 +305,36 @@ void saveVkbValues(const int* gcar_zero_ptrs, int ipol, int npwx); +template +struct cal_force_loc_op{ + void operator()( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2igg, + const FPTYPE* gcar, + const FPTYPE* tau, + const std::complex* aux, + const FPTYPE* vloc, + const int vloc_nr, + FPTYPE* forcelc); +}; + +template +struct cal_force_ew_op{ + void operator()( + const int nat, + const int npw, + const int ig_gge0, + const int* iat2it, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_fact, + const std::complex* aux, + FPTYPE* forceion + ); +}; #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM } // namespace hamilt -#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_PW_HAMILT_PWDFT_KERNELS_FORCE_OP_H \ No newline at end of file +#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H \ No newline at end of file