diff --git a/CMakeLists.txt b/CMakeLists.txt index 0824dd762e..2da2648743 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,8 +252,8 @@ if(ENABLE_LCAO) add_compile_definitions(__PEXSI) set(CMAKE_CXX_STANDARD 14) endif() - if(NEW_GINT) - add_compile_definitions(__NEW_GINT) + if(OLD_GINT) + add_compile_definitions(__OLD_GINT) endif() else() set(ENABLE_MLALGO OFF) diff --git a/source/Makefile.Objects b/source/Makefile.Objects index c94a7f4f9e..85a5c14a94 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -61,6 +61,7 @@ VPATH=./src_global:\ ./module_hamilt_lcao/module_deltaspin:\ ./module_hamilt_lcao/hamilt_lcaodft/operator_lcao:\ ./module_hamilt_lcao/module_gint:\ +./module_hamilt_lcao/module_gint/temp_gint:\ ./module_relax:\ ./source_hamilt/module_vdw:\ ./module_io:\ @@ -273,13 +274,13 @@ OBJS_ESOLVER_LCAO=esolver_ks_lcao.o\ lcao_others.o\ esolver_dm2rho.o\ -OBJS_GINT=gint.o\ +OBJS_GINT=gint_old.o\ gint_gamma_env.o\ gint_gamma_vl.o\ - gint_fvl.o\ - gint_rho.o\ - gint_tau.o\ - gint_vl.o\ + gint_fvl_old.o\ + gint_rho_old.o\ + gint_tau_old.o\ + gint_vl_old.o\ gint_k_env.o\ gint_k_sparse1.o\ gint_k_pvpr.o\ @@ -298,6 +299,30 @@ OBJS_GINT=gint.o\ cal_ddpsir_ylm.o\ mult_psi_dmr.o\ init_orb.o\ + batch_biggrid.o\ + big_grid.o\ + biggrid_info.o\ + divide_info.o\ + gint_atom.o\ + gint_common.o\ + gint_dvlocal.o\ + gint_env_gamma.o\ + gint_env_k.o\ + gint_fvl_meta.o\ + gint_fvl.o\ + gint_info.o\ + gint_interface.o\ + gint_rho.o\ + gint_tau.o\ + gint_vl_metagga_nspin4.o\ + gint_vl_metagga.o\ + gint_vl_nspin4.o\ + gint_vl.o\ + gint.o\ + localcell_info.o\ + phi_operator.o\ + set_ddphi.o\ + unitcell_info.o\ OBJS_HAMILT=hamilt_pw.o\ hamilt_sdft_pw.o\ diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp index b64f046e3a..6253b1dcf4 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp +++ 
b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp @@ -68,7 +68,7 @@ void Veff>::contributeHR() double* vr_eff1 = this->pot->get_effective_v(this->current_spin); double* vofk_eff1 = this->pot->get_effective_vofk(this->current_spin); -#ifndef __NEW_GINT +#ifdef __OLD_GINT if(XC_Functional::get_ked_flag()) { Gint_inout inout(vr_eff1, vofk_eff1, Gint_Tools::job_type::vlocal_meta); @@ -113,7 +113,7 @@ void Veff, double>>::contributeHR() double* vr_eff1 = this->pot->get_effective_v(this->current_spin); double* vofk_eff1 = this->pot->get_effective_vofk(this->current_spin); -#ifndef __NEW_GINT +#ifdef __OLD_GINT // if you change the place of the following code, // rememeber to delete the #include if(XC_Functional::get_ked_flag()) @@ -155,7 +155,7 @@ void Veff, std::complex>>::contributeH ModuleBase::TITLE("Veff", "contributeHR"); ModuleBase::timer::tick("Veff", "contributeHR"); -#ifndef __NEW_GINT +#ifdef __OLD_GINT double* vr_eff1 = nullptr; double* vofk_eff1 = nullptr; for (int is = 0; is < 4; is++) @@ -187,19 +187,15 @@ void Veff, std::complex>>::contributeH if(XC_Functional::get_ked_flag()) { vofk_eff[is] = this->pot->get_effective_vofk(is); - if(is == 3) - { - ModuleGint::cal_gint_vl_metagga(vr_eff, vofk_eff, this->hR); - } - } - else - { - if(is == 3) - { - ModuleGint::cal_gint_vl(vr_eff, this->hR); - } } } + if(XC_Functional::get_ked_flag()) + { + ModuleGint::cal_gint_vl_metagga(vr_eff, vofk_eff, this->hR); + } else + { + ModuleGint::cal_gint_vl(vr_eff, this->hR); + } #endif ModuleBase::timer::tick("Veff", "contributeHR"); diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h index 696f094048..8f456695ce 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h +++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h @@ -50,7 +50,9 @@ class Veff> : public OperatorLCAO this->cal_type = calculation_type::lcao_gint; 
this->initialize_HR(ucell_in, GridD_in); +#ifdef __OLD_GINT GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin); +#endif } /** * @brief Construct a new Veff object for Gamma-only calculation @@ -69,8 +71,9 @@ class Veff> : public OperatorLCAO { this->cal_type = calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); - +#ifdef __OLD_GINT GG_in->initialize_pvpR(*ucell_in, GridD_in, nspin); +#endif } ~Veff>(){}; diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp b/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp index 6d2e3326aa..aa59ad87d4 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp @@ -20,7 +20,7 @@ namespace PulayForceStress { const int nspin = PARAM.inp.nspin; -#ifndef __NEW_GINT +#ifdef __OLD_GINT if (set_dmr_gint) { gint.transfer_DM2DtoGrid(dm.get_DMR_vector()); } // 2d block to grid for (int is = 0; is < nspin; ++is) { diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp index fddf2e584d..381c61ec87 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp @@ -2,6 +2,7 @@ #include "module_parameter/parameter.h" #include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" #include void sparse_format::cal_dS(const UnitCell& ucell, @@ -49,7 +50,6 @@ delete[] fsr_dh.DHloc_fixedR_y; delete[] fsr_dh.DHloc_fixedR_z; return; } - void sparse_format::cal_dH(const UnitCell& ucell, const Parallel_Orbitals& pv, LCAO_HS_Arrays& HS_Arrays, @@ -58,6 +58,7 @@ void sparse_format::cal_dH(const UnitCell& ucell, const LCAO_Orbitals& orb, const int& current_spin, const double& sparse_thr, + const ModuleBase::matrix& v_eff, Gint_k& gint_k) { ModuleBase::TITLE("sparse_format", "cal_dH"); @@ -106,8 +107,38 @@ 
void sparse_format::cal_dH(const UnitCell& ucell, delete[] fsr_dh.DHloc_fixedR_y; delete[] fsr_dh.DHloc_fixedR_z; - gint_k.cal_dvlocal_R_sparseMatrix(current_spin, sparse_thr, HS_Arrays, &pv, ucell, grid); - + if(PARAM.inp.nspin==2) + { +#ifdef __OLD_GINT + gint_k.allocate_pvdpR(); + // note: some MPI process will not have grids when MPI cores are too + // many, v_eff in these processes are empty + const double* vr_eff1 + = v_eff.nc * v_eff.nr > 0 ? &(v_eff(current_spin, 0)) : nullptr; + + if (!PARAM.globalv.gamma_only_local) + { + if (PARAM.inp.vl_in_h) + { + Gint_inout inout(vr_eff1, + current_spin, + Gint_Tools::job_type::dvlocal); + gint_k.cal_gint(&inout); + } + } + gint_k.cal_dvlocal_R_sparseMatrix(current_spin, sparse_thr, HS_Arrays, &pv, ucell, grid); + gint_k.destroy_pvdpR(); +#else + const double* vr_eff1 + = v_eff.nc * v_eff.nr > 0 ? &(v_eff(current_spin, 0)) : nullptr; + if (!PARAM.globalv.gamma_only_local) + { + ModuleGint::cal_dvlocal_R_sparseMatrix( + PARAM.inp.nspin, PARAM.globalv.npol, current_spin, PARAM.globalv.nlocal, + sparse_thr, vr_eff1, pv, ucell, grid, HS_Arrays); + } +#endif + } return; } diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h index 1ec555f4fa..a477a29648 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h +++ b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h @@ -19,6 +19,7 @@ void cal_dH(const UnitCell& ucell, const LCAO_Orbitals& orb, const int& current_spin, const double& sparse_thr, + const ModuleBase::matrix& v_eff, Gint_k& gint_k); // calculated the derivative of the overlap matrix: diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 7b43114adb..0505957b9c 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -2,13 +2,13 @@ if(ENABLE_LCAO) list(APPEND objects - gint.cpp + gint_old.cpp 
gint_gamma_env.cpp gint_gamma_vl.cpp - gint_fvl.cpp - gint_rho.cpp - gint_tau.cpp - gint_vl.cpp + gint_fvl_old.cpp + gint_rho_old.cpp + gint_tau_old.cpp + gint_vl_old.cpp gint_k_env.cpp gint_k_sparse1.cpp gint_k_pvpr.cpp @@ -29,7 +29,7 @@ list(APPEND objects init_orb.cpp ) -if(NEW_GINT) +if(NOT DEFINED OLD_GINT) list(APPEND objects temp_gint/biggrid_info.cpp temp_gint/big_grid.cpp @@ -45,6 +45,9 @@ if(NEW_GINT) temp_gint/gint_tau.cpp temp_gint/gint_fvl.cpp temp_gint/gint_fvl_meta.cpp + temp_gint/gint_env_gamma.cpp + temp_gint/gint_env_k.cpp + temp_gint/gint_dvlocal.cpp temp_gint/localcell_info.cpp temp_gint/phi_operator.cpp temp_gint/set_ddphi.cpp @@ -52,6 +55,24 @@ if(NEW_GINT) temp_gint/gint_common.cpp temp_gint/gint_interface.cpp ) + if(USE_CUDA) + list(APPEND objects + temp_gint/kernel/gint_gpu_vars.cpp + temp_gint/kernel/phi_operator_gpu.cu + temp_gint/kernel/phi_operator_kernel.cu + temp_gint/kernel/set_const_mem.cu + temp_gint/batch_biggrid.cpp + temp_gint/gint_vl_gpu.cpp + temp_gint/gint_rho_gpu.cpp + temp_gint/gint_fvl_gpu.cpp + temp_gint/gint_vl_metagga_gpu.cpp + temp_gint/gint_vl_nspin4_gpu.cpp + temp_gint/gint_vl_metagga_nspin4_gpu.cpp + temp_gint/gint_tau_gpu.cpp + temp_gint/gint_fvl_meta_gpu.cpp + temp_gint/kernel/dgemm_vbatch.cu + ) + endif() endif() if(USE_CUDA) diff --git a/source/module_hamilt_lcao/module_gint/gint_fvl.cpp b/source/module_hamilt_lcao/module_gint/gint_fvl_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_fvl.cpp rename to source/module_hamilt_lcao/module_gint/gint_fvl_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp b/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp index 8cd610bde7..4cd9cd7dbb 100644 --- a/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp +++ b/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp @@ -337,7 +337,6 @@ void Gint_k::cal_dvlocal_R_sparseMatrix(const int& current_spin, std::map, std::map>>> pvdpRz_soc_sparseMatrix; - int 
lgd = 0; double temp_value_double; std::complex temp_value_complex; diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint.cpp rename to source/module_hamilt_lcao/module_gint/gint_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_rho.cpp b/source/module_hamilt_lcao/module_gint/gint_rho_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_rho.cpp rename to source/module_hamilt_lcao/module_gint/gint_rho_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_tau.cpp b/source/module_hamilt_lcao/module_gint/gint_tau_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_tau.cpp rename to source/module_hamilt_lcao/module_gint/gint_tau_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_vl.cpp b/source/module_hamilt_lcao/module_gint/gint_vl_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_vl.cpp rename to source/module_hamilt_lcao/module_gint/gint_vl_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu index 32dfe42b24..c9bf122628 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu @@ -4,27 +4,7 @@ #include "cuda_tools.cuh" -cudaError_t check(cudaError_t result, const char *const func, const char *const file, const int line) -{ - if (result != cudaSuccess) - { - fprintf(stderr, "CUDA Runtime Error at %s:%d code=%s \"%s\" \n", file, line, cudaGetErrorString(result), func); - exit(EXIT_FAILURE); - } - return result; -} -cudaError_t __checkCudaLastError(const char *file, const int line) -{ - cudaError_t result = cudaGetLastError(); - if (result != cudaSuccess) - { - fprintf(stderr, "%s(%i) : getLastCudaError():%s\n", 
file, line, cudaGetErrorString(result)); - assert(result == cudaSuccess); - } - return result; -} - -void dump_cuda_array_to_file(double* cuda_array, +void dump_cuda_array_to_file(const double* cuda_array, int width, int hight, const std::string& filename) diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh index 803e76ff22..dab697df8c 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh @@ -9,21 +9,47 @@ #include #include -#define checkCuda(val) check(val, #val, __FILE__, __LINE__) -#define checkCudaLastError() __checkCudaLastError(__FILE__, __LINE__) +#define checkCuda(val) check((val), #val, __FILE__, __LINE__) +#define checkCudaLastError() __getLastCudaError(__FILE__, __LINE__) -cudaError_t check(cudaError_t result, const char *const func, const char *const file, const int line); -cudaError_t __checkCudaLastError(const char *file, const int line); +inline void check(cudaError_t result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), cudaGetErrorString(result), func); + exit(EXIT_FAILURE); + } +} + +inline void __getLastCudaError(const char *file, + const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " (%d) %s.\n", + file, line, static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} -void dump_cuda_array_to_file(double* cuda_array, +static inline int ceildiv(int x, int y) +{ + return (x + y - 1) / y; +} + +void dump_cuda_array_to_file(const double* cuda_array, int width, int hight, const std::string& filename); -inline int ceil_div(int a, int b) -{ - return (a + b - 1) / b; -} +// inline int ceil_div(int a, int b) +// { 
+// return (a + b - 1) / b; +// } /* * @brief: A simple wrapper for cudaMalloc and cudaFree, sync and async CUDA diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh index b45805ec87..230e5a6f44 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh @@ -320,11 +320,6 @@ static __global__ void vbatched_gemm_kernel(int* M, alpha_tmp); } -static inline int ceildiv(int x, int y) -{ - return (x + y - 1) / y; -} - /** * Performs a batched matrix multiplication using the vbatched_gemm_impl * function. diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp new file mode 100644 index 0000000000..8372506e46 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp @@ -0,0 +1,34 @@ +#include "batch_biggrid.h" + +namespace ModuleGint +{ + +int BatchBigGrid::max_batch_size_ = 0; +int BatchBigGrid::max_atoms_num_ = 0; +int BatchBigGrid::max_phi_len_ = 0; +int BatchBigGrid::max_atom_pairs_num_ = 0; + +BatchBigGrid::BatchBigGrid(std::vector> biggrids) +{ + biggrids_ = biggrids; + max_batch_size_ = std::max(max_batch_size_, (int)biggrids_.size()); + int atom_pairs_num = 0; + for(const auto& biggrid : biggrids_) + { + for(const auto& atom: biggrid->get_atoms()) + { + max_nw_ = std::max(max_nw_, atom->get_nw()); + } + max_atoms_num_per_bgrid_ = std::max(max_atoms_num_per_bgrid_, biggrid->get_atoms_num()); + atoms_num_ += biggrid->get_atoms_num(); + atom_pairs_num += std::pow(biggrid->get_atoms_num(), 2); + phi_len_ += biggrid->get_phi_len() * biggrid->get_mgrids_num(); + } + max_atoms_num_ = std::max(max_atoms_num_, atoms_num_); + max_phi_len_ = std::max(max_phi_len_, phi_len_); + max_atom_pairs_num_ = std::max(max_atom_pairs_num_, atom_pairs_num); +} 
+ + + +} diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h new file mode 100644 index 0000000000..d4de77d1db --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h @@ -0,0 +1,50 @@ +#pragma once +#include +#include +#include "big_grid.h" + +namespace ModuleGint +{ + +class BatchBigGrid +{ + public: + BatchBigGrid(std::vector> biggrids); + + const std::vector>& get_bgrids() { return biggrids_; } + + int get_batch_size() const { return biggrids_.size(); } + int get_atoms_num() const { return atoms_num_; } + int get_phi_len() const { return phi_len_;} + int get_max_atoms_num_per_bgrid() const { return max_atoms_num_per_bgrid_; } + bool empty() {return atoms_num_ == 0; } + static int get_max_batch_size() { return max_batch_size_; } + static int get_max_atoms_num() { return max_atoms_num_; } + static int get_max_phi_len() { return max_phi_len_; } + static int get_max_atom_pairs_num() { return max_atom_pairs_num_; } + static std::shared_ptr get_bgrid_info() { return BigGrid::get_bgrid_info(); } + + private: + std::vector> biggrids_; + + // the max nw of an atom + int max_nw_ = 0; + + int phi_len_ = 0; + // number of atoms in the batch + int atoms_num_ = 0; + + // the max number of atoms of a single biggrid + int max_atoms_num_per_bgrid_ = 0; + + // the max number of biggrids of a biggrids batch + static int max_batch_size_; + // the max number of total atoms of a biggrids batch + static int max_atoms_num_; + // the max number of total wavefunctions of a biggrids batch + static int max_phi_len_; + // the max number of atom pairs of a biggrids batch + static int max_atom_pairs_num_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp index d972cd90bb..e20a0fb50a 100644 --- 
a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp @@ -13,7 +13,7 @@ void BigGrid::add_atom(const GintAtom* atom) atoms_.push_back(atom); } -int BigGrid::get_mgrid_phi_len() const +int BigGrid::get_phi_len() const { int len = 0; for(const auto& atom : atoms_) @@ -73,6 +73,11 @@ void BigGrid::set_mgrids_local_idx(std::vector& mgrids_idx) const } } +void BigGrid::set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const +{ + set_atom_relative_coords(atom->get_bgrid_idx(), atom->get_tau_in_bgrid(), atom_coord); +} + void BigGrid::set_atom_relative_coords(const Vec3i bgrid_idx, const Vec3d tau_in_bgrid, std::vector& atom_coord) const { Vec3i this_bgrid_idx = localcell_info_->get_bgrid_global_idx_3D(idx_); @@ -84,17 +89,18 @@ void BigGrid::set_atom_relative_coords(const Vec3i bgrid_idx, const Vec3d tau_in atom_coord.resize(biggrid_info_->get_mgrids_num()); for(int im = 0; im < biggrid_info_->get_mgrids_num(); ++im) { - const Vec3d& mcell_coord = biggrid_info_->get_mgrid_coord(im); - atom_coord[im] = mcell_coord - bgrid_relative_coord; + const Vec3d& mgrid_coord = biggrid_info_->get_mgrid_coord(im); + atom_coord[im] = mgrid_coord - bgrid_relative_coord; } } - -void BigGrid::set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const +Vec3d BigGrid::get_bgrid_atom_rcoord(const GintAtom* atom) const { - set_atom_relative_coords(atom->get_bgrid_idx(), atom->get_tau_in_bgrid(), atom_coord); + Vec3i this_bgrid_idx = localcell_info_->get_bgrid_global_idx_3D(idx_); + return unitcell_info_->get_relative_coord(atom->get_bgrid_idx(), this_bgrid_idx) + atom->get_tau_in_bgrid(); } + bool BigGrid::is_atom_on_bgrid(const GintAtom* atom) const { std::vector coords; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h index c1d5596e13..55bed7a251 100644 --- 
a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h @@ -17,30 +17,30 @@ class BigGrid // constructor BigGrid(int idx); - static void init_localcell_info(std::shared_ptr localcell_info) { localcell_info_ = localcell_info; }; - static void init_unitcell_info(std::shared_ptr unitcell_info) { unitcell_info_ = unitcell_info; }; - static void init_bgrid_info(std::shared_ptr biggrid_info) { biggrid_info_ = biggrid_info; }; + static void init_localcell_info(std::shared_ptr localcell_info) { localcell_info_ = localcell_info; } + static void init_unitcell_info(std::shared_ptr unitcell_info) { unitcell_info_ = unitcell_info; } + static void init_bgrid_info(std::shared_ptr biggrid_info) { biggrid_info_ = biggrid_info; } // getter functions - int get_idx() const { return idx_; }; - std::shared_ptr get_localcell_info() const { return localcell_info_; }; - std::shared_ptr get_unitcell_info() const {return unitcell_info_; }; - std::shared_ptr get_bgrid_info() const { return biggrid_info_; }; - const std::vector& get_atoms() const { return atoms_; }; - const GintAtom* get_atom(int i) const { return atoms_[i]; }; + int get_idx() const { return idx_; } + static std::shared_ptr get_localcell_info() { return localcell_info_; } + static std::shared_ptr get_unitcell_info() { return unitcell_info_; } + static std::shared_ptr get_bgrid_info() { return biggrid_info_; } + const std::vector& get_atoms() const { return atoms_; } + const GintAtom* get_atom(int i) const { return atoms_[i]; } // get the number of meshgrids in the big grid - int get_mgrids_num() const { return biggrid_info_->get_mgrids_num(); }; + int get_mgrids_num() const { return biggrid_info_->get_mgrids_num(); } // get the number of atoms that can affect the big grid - int get_atoms_num() const { return atoms_.size(); }; + int get_atoms_num() const { return atoms_.size(); } // add an atom to the big grid void add_atom(const GintAtom* atom); // get the 
total number of phi of a meshgrid // return: (\sum_{i=0}^{atoms_->size()} atoms_[i]->nw) - int get_mgrid_phi_len() const; + int get_phi_len() const; // set the start index of the phi of each atom // return: vector[i] = \sum_{j=0}^{i-1} atoms_[j]->nw @@ -55,6 +55,9 @@ class BigGrid // set the 1D index of the meshgrids in the local cell void set_mgrids_local_idx(std::vector& mgrids_idx) const; + // a wrapper function to get the relative coordinates of the atom and the meshgrids + void set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const; + /** * @brief Set the coordinates of the meshgrids of the big grid relative to an atom * @@ -64,8 +67,8 @@ class BigGrid */ void set_atom_relative_coords(const Vec3i bgrid_idx, const Vec3d tau_in_bgrid, std::vector& atom_coord) const; - // a wrapper function to get the relative coordinates of the atom and the meshgrids - void set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const; + // get the relative coords of the atom and the biggrid (used in gpu code) + Vec3d get_bgrid_atom_rcoord(const GintAtom* atom) const; // if the atom affects the big grid, return true, otherwise false // note when we say an atom affects a big grid, it does not mean that the atom affects all the meshgrid on the big grid, diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h index f8bcb79665..c017f87a3d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h @@ -22,36 +22,36 @@ class BigGridInfo Vec3d biggrid_vec3, int nmx, int nmy, int nmz); - Vec3d get_cartesian_coord(const Vec3d& index_3d) const { return index_3d * biggrid_latvec0_; }; - Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * biggrid_latvec0_; }; - const Vec3d get_direct_coord(const Vec3d& cart_coord) const { return cart_coord * biggrid_GT_; }; + 
Vec3d get_cartesian_coord(const Vec3d& index_3d) const { return index_3d * biggrid_latvec0_; } + Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * biggrid_latvec0_; } + Vec3d get_direct_coord(const Vec3d& cart_coord) const { return cart_coord * biggrid_GT_; } // Return the maximum number of big grids that can fit inside a sphere of radius r, // along the three lattice vector directions. Vec3i max_ext_bgrid_num(double r) const; // get number of meshgrids along three lattice directions - int get_nmx() const { return nmx_; }; - int get_nmy() const { return nmy_; }; - int get_nmz() const { return nmz_; }; - int get_mgrids_num() const { return nmxyz_; }; + int get_nmx() const { return nmx_; } + int get_nmy() const { return nmy_; } + int get_nmz() const { return nmz_; } + int get_mgrids_num() const { return nmxyz_; } - const std::vector& get_mgrids_coord() const { return meshgrid_coords_; }; - const Vec3d& get_mgrid_coord(int index_1d) const { return meshgrid_coords_[index_1d]; }; + const std::vector& get_mgrids_coord() const { return meshgrid_coords_; } + const Vec3d& get_mgrid_coord(int index_1d) const { return meshgrid_coords_[index_1d]; } - std::shared_ptr get_mgrid_info() const { return meshgrid_info_; }; + std::shared_ptr get_mgrid_info() const { return meshgrid_info_; } // get the 3D index of a meshgrid in the big grid from the 1D index Vec3i mgrid_idx_1Dto3D(int index_1d) const { return index1Dto3D(index_1d, nmx_, nmy_, nmz_); - }; + } // get the 1D index of a meshgrid in the big grid from the 3D index int mgrid_idx_3Dto1D(const Vec3i index_3d) const { return index3Dto1D(index_3d.x, index_3d.y, index_3d.z, nmx_, nmy_, nmz_); - }; + } private: // basis vectors of the big grid diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp index e766c46d9f..d7de110f24 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp @@ -3,6 +3,6 @@ namespace ModuleGint { -std::shared_ptr Gint::gint_info_ = nullptr; +GintInfo* Gint::gint_info_ = nullptr; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint.h index a14f014a6c..1255bae971 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint.h @@ -12,17 +12,15 @@ class Gint Gint() = default; virtual ~Gint() = default; - virtual void cal_gint() = 0; - // note that gint_info_ is a static member variable // it is shared by all instances of Gint - static void init_gint_info(std::shared_ptr gint_info) + static void set_gint_info(GintInfo* gint_info) { gint_info_ = gint_info; } protected: - static std::shared_ptr gint_info_; + static GintInfo* gint_info_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp index 6ae3735ec6..7121694244 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp @@ -1,10 +1,38 @@ #include "source_base/ylm.h" #include "source_base/array_pool.h" #include "gint_atom.h" +#include "source_cell/unitcell.h" #include "gint_helper.h" namespace ModuleGint { +GintAtom::GintAtom( + const Atom* atom, + int it, int ia, int iat, + Vec3i biggrid_idx, + Vec3i unitcell_idx, + Vec3d tau_in_biggrid, + const Numerical_Orbital* orb, + const UnitCell* ucell) +: atom_(atom), it_(it), ia_(ia), iat_(iat), biggrid_idx_(biggrid_idx), + unitcell_idx_(unitcell_idx), tau_in_biggrid_(tau_in_biggrid), + orb_(orb), ucell_(ucell) +{ + p_psi_uniform_.resize(atom_->nw); + p_dpsi_uniform_.resize(atom_->nw); + p_ddpsi_uniform_.resize(atom_->nw); + for (int iw=0; iw < atom_->nw; ++iw) + { + if ( atom_->iw2_new[iw] ) + { + int l = 
atom_->iw2l[iw]; + int n = atom_->iw2n[iw]; + p_psi_uniform_[iw] = orb_->PhiLN(l, n).psi_uniform.data(); + p_dpsi_uniform_[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); + p_ddpsi_uniform_[iw] = orb_->PhiLN(l, n).ddpsi_uniform.data(); + } + } +} template void GintAtom::set_phi(const std::vector& coords, const int stride, T* phi) const @@ -14,20 +42,6 @@ void GintAtom::set_phi(const std::vector& coords, const int stride, T* ph // orb_ does not have the member variable dr_uniform const double dr_uniform = orb_->PhiLN(0, 0).dr_uniform; - // store the pointer to reduce repeated address fetching - std::vector p_psi_uniform(atom_->nw); - std::vector p_dpsi_uniform(atom_->nw); - for(int iw = 0; iw < atom_->nw; iw++) - { - if(atom_->iw2_new[iw]) - { - int l = atom_->iw2l[iw]; - int n = atom_->iw2n[iw]; - p_psi_uniform[iw] = orb_->PhiLN(l, n).psi_uniform.data(); - p_dpsi_uniform[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); - } - } - // store the spherical harmonics // it's outside the loop to reduce the vector allocation overhead std::vector ylma; @@ -35,7 +49,6 @@ void GintAtom::set_phi(const std::vector& coords, const int stride, T* ph for(int im = 0; im < num_mgrids; im++) { const Vec3d& coord = coords[im]; - // 1e-9 is to avoid division by zero const double dist = coord.norm() < 1e-9 ? 
1e-9 : coord.norm(); if(dist > orb_->getRcut()) @@ -74,8 +87,8 @@ void GintAtom::set_phi(const std::vector& coords, const int stride, T* ph { if(atom_->iw2_new[iw]) { - auto psi_uniform = p_psi_uniform[iw]; - auto dpsi_uniform = p_dpsi_uniform[iw]; + auto psi_uniform = p_psi_uniform_[iw]; + auto dpsi_uniform = p_dpsi_uniform_[iw]; psi = c1 * psi_uniform[ip] + c2 * dpsi_uniform[ip] + c3 * psi_uniform[ip + 1] + c4 * dpsi_uniform[ip + 1]; } @@ -94,22 +107,6 @@ void GintAtom::set_phi_dphi( // orb_ does not have the member variable dr_uniform const double dr_uniform = orb_->PhiLN(0, 0).dr_uniform; - - // store the pointer to reduce repeated address fetching - std::vector p_psi_uniform(atom_->nw); - std::vector p_dpsi_uniform(atom_->nw); - std::vector phi_nr_uniform(atom_->nw); - for (int iw=0; iw< atom_->nw; ++iw) - { - if ( atom_->iw2_new[iw] ) - { - int l = atom_->iw2l[iw]; - int n = atom_->iw2n[iw]; - p_psi_uniform[iw] = orb_->PhiLN(l, n).psi_uniform.data(); - p_dpsi_uniform[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); - phi_nr_uniform[iw] = orb_->PhiLN(l, n).nr_uniform; - } - } std::vector rly(std::pow(atom_->nwl + 1, 2)); // TODO: replace array_pool with std::vector @@ -157,24 +154,16 @@ void GintAtom::set_phi_dphi( // function from interpolation method. 
if(atom_->iw2_new[iw]) { - auto psi_uniform = p_psi_uniform[iw]; - auto dpsi_uniform = p_dpsi_uniform[iw]; - - if(ip >= phi_nr_uniform[iw] - 4) - { - tmp = dtmp = 0.0; - } - else - { - // use Polynomia Interpolation method to get the - // wave functions - - tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) - + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); - - dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) - + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); - } + auto psi_uniform = p_psi_uniform_[iw]; + auto dpsi_uniform = p_dpsi_uniform_[iw]; + // use Polynomia Interpolation method to get the + // wave functions + + tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) + + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); + + dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) + + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); } // new l is used. // get the 'l' of this localized wave function diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h index b1da5d586a..aff8aae5b9 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h @@ -13,28 +13,26 @@ class GintAtom // constructor GintAtom( const Atom* atom, - int ia, - int iat, + int it, int ia, int iat, Vec3i biggrid_idx, Vec3i unitcell_idx, Vec3d tau_in_biggrid, - const Numerical_Orbital* orb) - : atom_(atom), ia_(ia), iat_(iat), biggrid_idx_(biggrid_idx), - unitcell_idx_(unitcell_idx), tau_in_biggrid_(tau_in_biggrid), - orb_(orb) {}; + const Numerical_Orbital* orb, + const UnitCell* ucell); // getter functions - const Atom* get_atom() const { return atom_; }; - int get_ia() const { return ia_; }; - int get_iat() const { return iat_; }; - const Vec3i& get_bgrid_idx() const { return biggrid_idx_; }; - const Vec3i& get_unitcell_idx() const { return 
unitcell_idx_; }; - const Vec3i& get_R() const { return unitcell_idx_; }; - const Vec3d& get_tau_in_bgrid() const { return tau_in_biggrid_; }; - const Numerical_Orbital* get_orb() const { return orb_; }; - - int get_nw() const { return atom_->nw; }; - double get_rcut() const { return orb_->getRcut(); }; + const Atom* get_atom() const { return atom_; } + int get_ia() const { return ia_; } + int get_iat() const { return iat_; } + int get_start_iw() const { return ucell_->itiaiw2iwt(it_, ia_, 0); } // get the start index of global atomic orbitals + const Vec3i& get_bgrid_idx() const { return biggrid_idx_; } + const Vec3i& get_unitcell_idx() const { return unitcell_idx_; } + const Vec3i& get_R() const { return unitcell_idx_; } + const Vec3d& get_tau_in_bgrid() const { return tau_in_biggrid_; } + const Numerical_Orbital* get_orb() const { return orb_; } + + int get_nw() const { return atom_->nw; } + double get_rcut() const { return orb_->getRcut(); } /** * @brief Get the wave function values of the atom at a meshgrid. 
@@ -91,13 +89,16 @@ class GintAtom private: // the atom object const Atom* atom_; - - // the global index of the atom - int iat_; + + // the global index of the atom type + int it_; // the global index of the atom among the same type of atoms int ia_; + // the global index of the atom + int iat_; + // the index of big grid which contains this atom Vec3i biggrid_idx_; @@ -109,10 +110,13 @@ class GintAtom Vec3d tau_in_biggrid_; // the numerical orbitals of this atom - // In fact, I think the Numerical_Orbital class - // should be a member of the Atom class, not the GintAtom class const Numerical_Orbital* orb_; + const UnitCell* ucell_; + + std::vector p_psi_uniform_; + std::vector p_dpsi_uniform_; + std::vector p_ddpsi_uniform_; }; } // namespace ModuleGint diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp index 52edcf8e18..39b63191e7 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp @@ -3,22 +3,29 @@ #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h" #include "module_parameter/parameter.h" +#ifdef __MPI +#include "source_base/blacs_connector.h" +#include +#endif + namespace ModuleGint { -void compose_hr_gint(std::shared_ptr> hr_gint) +void compose_hr_gint(HContainer& hr_gint) { - for (int iap = 0; iap < hr_gint->size_atom_pairs(); iap++) + ModuleBase::TITLE("Gint", "compose_hr_gint"); + ModuleBase::timer::tick("Gint", "compose_hr_gint"); + for (int iap = 0; iap < hr_gint.size_atom_pairs(); iap++) { - auto& ap = hr_gint->get_atom_pair(iap); + auto& ap = hr_gint.get_atom_pair(iap); const int iat1 = ap.get_atom_i(); const int iat2 = ap.get_atom_j(); if (iat1 > iat2) { // fill lower triangle matrix with upper triangle matrix // the upper is - const hamilt::AtomPair* upper_ap = hr_gint->find_pair(iat2, iat1); - const hamilt::AtomPair* lower_ap = 
hr_gint->find_pair(iat1, iat2); + const hamilt::AtomPair* upper_ap = hr_gint.find_pair(iat2, iat1); + const hamilt::AtomPair* lower_ap = hr_gint.find_pair(iat1, iat2); #ifdef __DEBUG assert(upper_ap != nullptr); #endif @@ -37,22 +44,25 @@ void compose_hr_gint(std::shared_ptr> hr_gint) } } } + ModuleBase::timer::tick("Gint", "compose_hr_gint"); } -void compose_hr_gint(std::vector>> hr_gint_part, - std::shared_ptr>> hr_gint_full) +void compose_hr_gint(const std::vector>& hr_gint_part, + HContainer>& hr_gint_full) { - for (int iap = 0; iap < hr_gint_full->size_atom_pairs(); iap++) + ModuleBase::TITLE("Gint", "compose_hr_gint"); + ModuleBase::timer::tick("Gint", "compose_hr_gint"); + for (int iap = 0; iap < hr_gint_full.size_atom_pairs(); iap++) { - auto* ap = &hr_gint_full->get_atom_pair(iap); + auto* ap = &(hr_gint_full.get_atom_pair(iap)); const int iat1 = ap->get_atom_i(); const int iat2 = ap->get_atom_j(); if (iat1 <= iat2) { hamilt::AtomPair>* upper_ap = ap; - hamilt::AtomPair>* lower_ap = hr_gint_full->find_pair(iat2, iat1); - const hamilt::AtomPair* ap_nspin_0 = hr_gint_part[0]->find_pair(iat1, iat2); - const hamilt::AtomPair* ap_nspin_3 = hr_gint_part[3]->find_pair(iat1, iat2); + hamilt::AtomPair>* lower_ap = hr_gint_full.find_pair(iat2, iat1); + const hamilt::AtomPair* ap_nspin_0 = hr_gint_part[0].find_pair(iat1, iat2); + const hamilt::AtomPair* ap_nspin_3 = hr_gint_part[3].find_pair(iat1, iat2); for (int ir = 0; ir < upper_ap->get_R_size(); ir++) { const auto R_index = upper_ap->get_R_index(ir); @@ -72,8 +82,8 @@ void compose_hr_gint(std::vector>> hr_gint_pa if (PARAM.globalv.domag) { - const hamilt::AtomPair* ap_nspin_1 = hr_gint_part[1]->find_pair(iat1, iat2); - const hamilt::AtomPair* ap_nspin_2 = hr_gint_part[2]->find_pair(iat1, iat2); + const hamilt::AtomPair* ap_nspin_1 = hr_gint_part[1].find_pair(iat1, iat2); + const hamilt::AtomPair* ap_nspin_2 = hr_gint_part[2].find_pair(iat1, iat2); const auto mat_nspin_1 = ap_nspin_1->find_matrix(R_index); const 
auto mat_nspin_2 = ap_nspin_2->find_matrix(R_index); for (int irow = 0; irow < mat_nspin_1->get_row_size(); ++irow) @@ -101,65 +111,68 @@ void compose_hr_gint(std::vector>> hr_gint_pa } } } + ModuleBase::timer::tick("Gint", "compose_hr_gint"); } template -void transfer_hr_gint_to_hR(std::shared_ptr> hr_gint, HContainer* hR) +void transfer_hr_gint_to_hR(const HContainer& hr_gint, HContainer& hR) { + ModuleBase::TITLE("Gint", "transfer_hr_gint_to_hR"); + ModuleBase::timer::tick("Gint", "transfer_hr_gint_to_hR"); #ifdef __MPI int size = 0; MPI_Comm_size(MPI_COMM_WORLD, &size); if (size == 1) { - hR->add(*hr_gint); + hR.add(hr_gint); } else { - hamilt::transferSerials2Parallels(*hr_gint, hR); + hamilt::transferSerials2Parallels(hr_gint, &hR); } #else - hR->add(*hr_gint); + hR.add(hr_gint); #endif + ModuleBase::timer::tick("Gint", "transfer_hr_gint_to_hR"); } // gint_info should not have been a parameter, but it was added to initialize dm_gint_full // In the future, we might try to remove the gint_info parameter template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector*> dm, - std::vector>> dm_gint) + std::vector>& dm_gint) { - // To check whether input parameter dm_2d has been initialized -#ifdef __DEBUG - assert(PARAM.inp.nspin == dm.size() - && "The size of dm should be equal to the number of spins!"); -#endif + ModuleBase::TITLE("Gint", "transfer_dm_2d_to_gint"); + ModuleBase::timer::tick("Gint", "transfer_dm_2d_to_gint"); if (PARAM.inp.nspin != 4) { - for (int is = 0; is < PARAM.inp.nspin; is++) + // dm_gint.size() usually equals to PARAM.inp.nspin, + // but there is exception within module_lr + for (int is = 0; is < dm_gint.size(); is++) { #ifdef __MPI - hamilt::transferParallels2Serials(*dm[is], dm_gint[is].get()); + hamilt::transferParallels2Serials(*dm[is], &dm_gint[is]); #else - dm_gint[is]->set_zero(); - dm_gint[is]->add(*dm[is]); + dm_gint[is].set_zero(); + dm_gint[is].add(*dm[is]); #endif } } else // NSPIN=4 
case { #ifdef __MPI const int npol = 2; - std::shared_ptr> dm_full = gint_info->get_hr(npol); - hamilt::transferParallels2Serials(*dm[0], dm_full.get()); + HContainer dm_full = gint_info.get_hr(npol); + hamilt::transferParallels2Serials(*dm[0], &dm_full); #else - HContainer* dm_full = dm[0]; + HContainer& dm_full = *(dm[0]); #endif std::vector tmp_pointer(4, nullptr); - for (int iap = 0; iap < dm_full->size_atom_pairs(); iap++) + for (int iap = 0; iap < dm_full.size_atom_pairs(); iap++) { - auto& ap = dm_full->get_atom_pair(iap); + auto& ap = dm_full.get_atom_pair(iap); const int iat1 = ap.get_atom_i(); const int iat2 = ap.get_atom_j(); for (int ir = 0; ir < ap.get_R_size(); ir++) @@ -168,7 +181,7 @@ void transfer_dm_2d_to_gint( for (int is = 0; is < 4; is++) { tmp_pointer[is] = - dm_gint[is]->find_matrix(iat1, iat2, r_index)->get_pointer(); + dm_gint[is].find_matrix(iat1, iat2, r_index)->get_pointer(); } T* data_full = ap.get_pointer(ir); for (int irow = 0; irow < ap.get_row_size(); irow += 2) @@ -189,21 +202,142 @@ void transfer_dm_2d_to_gint( } } } + ModuleBase::timer::tick("Gint", "transfer_dm_2d_to_gint"); +} + +int globalIndex(int localindex, int nblk, int nprocs, int myproc) +{ + const int iblock = localindex / nblk; + const int gIndex = (iblock * nprocs + myproc) * nblk + localindex % nblk; + return gIndex; +} + +int localIndex(int globalindex, int nblk, int nprocs, int& myproc) +{ + myproc = int((globalindex % (nblk * nprocs)) / nblk); + return int(globalindex / (nblk * nprocs)) * nblk + globalindex % nblk; } +template +void wfc_2d_to_gint(const T* wfc_2d, + int nbands, // needed if MPI is disabled + int nlocal, // needed if MPI is disabled + const Parallel_Orbitals& pv, + T* wfc_gint, + const GintInfo& gint_info) +{ + ModuleBase::TITLE("Gint", "wfc_2d_to_gint"); + ModuleBase::timer::tick("Gint", "wfc_2d_to_gint"); + +#ifdef __MPI + // dimension related + nlocal = pv.desc_wfc[2]; + nbands = pv.desc_wfc[3]; + + const std::vector& trace_lo = 
gint_info.get_trace_lo(); + + // MPI and memory related + const int mem_stride = 1; + int mpi_info = 0; + + // get the rank of the current process + int rank = 0; + MPI_Comm_rank(pv.comm(), &rank); + + // calculate the maximum number of nlocal over all processes in pv.comm() range + long buf_size; + mpi_info = MPI_Reduce(&pv.nloc_wfc, &buf_size, 1, MPI_LONG, MPI_MAX, 0, pv.comm()); + mpi_info = MPI_Bcast(&buf_size, 1, MPI_LONG, 0, pv.comm()); // get and then broadcast + std::vector wfc_block(buf_size); + + // this quantity seems to have the value returned by function numroc_ in ScaLAPACK? + int naroc[2]; + + // for BLACS broadcast + char scope = 'A'; + char top = ' '; + + // loop over all processors + for (int iprow = 0; iprow < pv.dim0; ++iprow) + { + for (int ipcol = 0; ipcol < pv.dim1; ++ipcol) + { + if (iprow == pv.coord[0] && ipcol == pv.coord[1]) + { + BlasConnector::copy(pv.nloc_wfc, wfc_2d, mem_stride, wfc_block.data(), mem_stride); + naroc[0] = pv.nrow; + naroc[1] = pv.ncol_bands; + Cxgebs2d(pv.blacs_ctxt, &scope, &top, 2, 1, naroc, 2); + Cxgebs2d(pv.blacs_ctxt, &scope, &top, buf_size, 1, wfc_block.data(), buf_size); + } + else + { + Cxgebr2d(pv.blacs_ctxt, &scope, &top, 2, 1, naroc, 2, iprow, ipcol); + Cxgebr2d(pv.blacs_ctxt, &scope, &top, buf_size, 1, wfc_block.data(), buf_size, iprow, ipcol); + } + + // then use it to set the wfc_grid. 
+ const int nb = pv.nb; + const int dim0 = pv.dim0; + const int dim1 = pv.dim1; + for (int j = 0; j < naroc[1]; ++j) + { + int igcol = globalIndex(j, nb, dim1, ipcol); + if (igcol >= PARAM.inp.nbands) + { + continue; + } + for (int i = 0; i < naroc[0]; ++i) + { + int igrow = globalIndex(i, nb, dim0, iprow); + int mu_local = trace_lo[igrow]; + if (wfc_gint && mu_local >= 0) + { + wfc_gint[igcol * nlocal + mu_local] = wfc_block[j * naroc[0] + i]; + } + } + } + // this operation will let all processors have the same wfc_grid + } + } +#else + for (int i = 0; i < nbands; ++i) + { + for (int j = 0; j < nlocal; ++j) + { + wfc_gint[i * nlocal + j] = wfc_2d[i * nlocal + j]; + } + } +#endif + ModuleBase::timer::tick("Gint", "wfc_2d_to_gint"); +} template void transfer_hr_gint_to_hR( - std::shared_ptr> hr_gint, - HContainer* hR); + const HContainer& hr_gint, + HContainer& hR); template void transfer_hr_gint_to_hR( - std::shared_ptr>> hr_gint, - HContainer>* hR); + const HContainer>& hr_gint, + HContainer>& hR); template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector*> dm, - std::vector>> dm_gint); + std::vector>& dm_gint); template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector>*> dm, - std::vector>>> dm_gint); + std::vector>>& dm_gint); +template void wfc_2d_to_gint( + const double* wfc_2d, + int nbands, + int nlocal, + const Parallel_Orbitals& pv, + double* wfc_grid, + const GintInfo& gint_info); +template void wfc_2d_to_gint( + const std::complex* wfc_2d, + int nbands, + int nlocal, + const Parallel_Orbitals& pv, + std::complex* wfc_grid, + const GintInfo& gint_info); } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h index 47f0eda35b..485978ccf8 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h @@ -5,18 +5,20 @@ namespace ModuleGint { // fill the lower triangle matrix with the upper triangle matrix - void compose_hr_gint(std::shared_ptr> hr_gint); + void compose_hr_gint(HContainer& hr_gint); // for nspin=4 case - void compose_hr_gint(std::vector>> hr_gint_part, - std::shared_ptr>> hr_gint_full); + void compose_hr_gint(const std::vector>& hr_gint_part, + HContainer>& hr_gint_full); template - void transfer_hr_gint_to_hR(std::shared_ptr> hr_gint, HContainer* hR); + void transfer_hr_gint_to_hR(const HContainer& hr_gint, HContainer& hR); template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector*> dm, - std::vector>> dm_gint); + std::vector>& dm_gint); + template + void wfc_2d_to_gint(const T* wfc_2d, int nbands, int nlocal, const Parallel_Orbitals& pv, T* wfc_grid, const GintInfo& gint_info); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp new file mode 100644 index 0000000000..78a8b91069 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp @@ -0,0 +1,271 @@ +#include +#include "gint_dvlocal.h" +#include "phi_operator.h" +#include "source_base/parallel_reduce.h" + +namespace ModuleGint +{ + +void Gint_dvlocal::cal_dvlocal() +{ + ModuleBase::TITLE("Gint", "cal_gint_dvlocal"); + ModuleBase::timer::tick("Gint", "cal_gint_dvlocal"); + init_hr_gint_(); + cal_hr_gint_(); + ModuleBase::timer::tick("Gint", "cal_gint_dvlocal"); +} + +void Gint_dvlocal::init_hr_gint_() +{ + pvdpRx = gint_info_->get_hr(); + pvdpRy = gint_info_->get_hr(); + pvdpRz = gint_info_->get_hr(); +} + +void Gint_dvlocal::cal_hr_gint_() +{ +#pragma omp parallel + { + PhiOperator phi_op; + std::vector phi; + std::vector phi_vldr3; + std::vector dphi_x; + std::vector dphi_y; + std::vector dphi_z; +#pragma omp for schedule(dynamic) + for(const auto& biggrid: 
gint_info_->get_biggrids()) + { + if(biggrid->get_atoms().empty()) + { + continue; + } + phi_op.set_bgrid(biggrid); + const int phi_len = phi_op.get_rows() * phi_op.get_cols(); + phi.resize(phi_len); + phi_vldr3.resize(phi_len); + dphi_x.resize(phi_len); + dphi_y.resize(phi_len); + dphi_z.resize(phi_len); + phi_op.set_phi_dphi(phi.data(), dphi_x.data(), dphi_y.data(), dphi_z.data()); + phi_op.phi_mul_vldr3(vr_eff_, dr3_, phi.data(), phi_vldr3.data()); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_x.data(), pvdpRx, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_y.data(), pvdpRy, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_z.data(), pvdpRz, PhiOperator::Triangular_Matrix::Upper); + } + } +} + +void Gint_dvlocal::cal_dvlocal_R_sparseMatrix( + const int nspin, + const int cspin, + const int nlocal, + const double sparse_thr, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays) +{ + ModuleBase::TITLE("Gint", "cal_dvlocal_R_sparseMatrix"); + ModuleBase::timer::tick("Gint", "cal_dvlocal_R_sparseMatrix"); + std::map, std::map>> pvdpRx_sparseMatrix; + std::map, std::map>> pvdpRy_sparseMatrix; + std::map, std::map>> pvdpRz_sparseMatrix; + + double temp_value_double; + + Vec3d tau1, dtau; + for (int iap = 0; iap < pvdpRx.size_atom_pairs(); iap++) + { + const auto& ap = pvdpRx.get_atom_pair(iap); + const int iat1 = ap.get_atom_i(); + const int iat2 = ap.get_atom_j(); + const int it1 = ucell.iat2it[iat1]; + const int it2 = ucell.iat2it[iat2]; + const Atom* atom1 = &ucell.atoms[it1]; + const Atom* atom2 = &ucell.atoms[it2]; + const int start1 = ucell.itiaiw2iwt(it1, ucell.iat2ia[iat1], 0); + const int start2 = ucell.itiaiw2iwt(it2, ucell.iat2ia[iat2], 0); + + for (int ir = 0; ir < ap.get_R_size(); ir++) + { + const ModuleBase::Vector3 R = ap.get_R_index(ir); + Abfs::Vector3_Order dR(R.x, R.y, R.z); + double* p_pvdpRx = 
pvdpRx.get_atom_pair(iap).get_pointer(ir); + double* p_pvdpRy = pvdpRy.get_atom_pair(iap).get_pointer(ir); + double* p_pvdpRz = pvdpRz.get_atom_pair(iap).get_pointer(ir); + + for (int iw = 0; iw < atom1->nw * npol_; iw++) + { + for (int iw2 = 0; iw2 < atom2->nw * npol_; iw2++) + { + const int nw = atom2->nw; + const int mug0 = iw / npol_; + const int nug0 = iw2 / npol_; + const int iw_nowg = mug0 * nw + nug0; + + double temp_value = p_pvdpRx[iw_nowg]; + if (std::abs(temp_value) > sparse_thr) + { + pvdpRx_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value; + } + temp_value = p_pvdpRy[iw_nowg]; + if (std::abs(temp_value) > sparse_thr) + { + pvdpRy_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value; + } + temp_value = p_pvdpRz[iw_nowg]; + if (std::abs(temp_value) > sparse_thr) + { + pvdpRz_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value; + } + } + } + } + } + distribute_pvdpR_sparseMatrix(cspin, 0, nlocal, sparse_thr, pvdpRx_sparseMatrix, pv, hs_arrays); + distribute_pvdpR_sparseMatrix(cspin, 1, nlocal, sparse_thr, pvdpRy_sparseMatrix, pv, hs_arrays); + distribute_pvdpR_sparseMatrix(cspin, 2, nlocal, sparse_thr, pvdpRz_sparseMatrix, pv, hs_arrays); + ModuleBase::timer::tick("Gint", "cal_dvlocal_R_sparseMatrix"); +} + + +void Gint_dvlocal::distribute_pvdpR_sparseMatrix( + const int cspin, + const int dim, + const int nlocal, + const double sparse_threshold, + const std::map, + std::map>>& + pvdpR_sparseMatrix, + const Parallel_Orbitals& pv, + LCAO_HS_Arrays& hs_arrays) +{ + int total_R_num = hs_arrays.all_R_coor.size(); + std::vector nonzero_num(total_R_num); + std::vector minus_nonzero_num(total_R_num); + int count = 0; + for (const auto& R_coor: hs_arrays.all_R_coor) + { + auto iter = pvdpR_sparseMatrix.find(R_coor); + if (iter != pvdpR_sparseMatrix.end()) + { + for (auto& row_loop: iter->second) + { + nonzero_num[count] += row_loop.second.size(); + } + } + + auto minus_R_coor = -1 * R_coor; + + iter = pvdpR_sparseMatrix.find(minus_R_coor); + if 
(iter != pvdpR_sparseMatrix.end()) + { + for (auto& row_loop: iter->second) + { + minus_nonzero_num[count] += row_loop.second.size(); + } + } + count++; + } + + Parallel_Reduce::reduce_all(nonzero_num.data(), total_R_num); + Parallel_Reduce::reduce_all(minus_nonzero_num.data(), total_R_num); + + std::vector tmp(nlocal); + count = 0; + + const std::vector& trace_lo = gint_info_->get_trace_lo(); + for (const auto& R_coor: hs_arrays.all_R_coor) + { + if (nonzero_num[count] != 0 || minus_nonzero_num[count] != 0) + { + auto minus_R_coor = -1 * R_coor; + + for (int row = 0; row < nlocal; ++row) + { + tmp.assign(tmp.size(), 0); + + auto iter = pvdpR_sparseMatrix.find(R_coor); + if (iter != pvdpR_sparseMatrix.end()) + { + + if (trace_lo[row] >= 0) + { + auto row_iter = iter->second.find(row); + if (row_iter != iter->second.end()) + { + for (auto& value: row_iter->second) + { + tmp[value.first] = value.second; + } + } + } + } + + auto minus_R_iter = pvdpR_sparseMatrix.find(minus_R_coor); + if (minus_R_iter != pvdpR_sparseMatrix.end()) + { + for (int col = 0; col < row; ++col) + { + if (trace_lo[col] >= 0) + { + auto row_iter = minus_R_iter->second.find(col); + if (row_iter != minus_R_iter->second.end()) + { + auto col_iter = row_iter->second.find(row); + if (col_iter != row_iter->second.end()) + { + tmp[col] = col_iter->second; + } + } + } + } + } + + Parallel_Reduce::reduce_pool(tmp.data(), nlocal); + + if (pv.global2local_row(row) >= 0) + { + for (int col = 0; col < nlocal; ++col) + { + if (pv.global2local_col(col) >= 0) + { + if (std::abs(tmp[col]) > sparse_threshold) + { + if (dim == 0) + { + double& value = hs_arrays.dHRx_sparse[cspin][R_coor][row][col]; + value += tmp[col]; + if (std::abs(value) <= sparse_threshold) + { + hs_arrays.dHRx_sparse[cspin][R_coor][row].erase(col); + } + } + if (dim == 1) + { + double& value = hs_arrays.dHRy_sparse[cspin][R_coor][row][col]; + value += tmp[col]; + if (std::abs(value) <= sparse_threshold) + { + 
hs_arrays.dHRy_sparse[cspin][R_coor][row].erase(col); + } + } + if (dim == 2) + { + double& value = hs_arrays.dHRz_sparse[cspin][R_coor][row][col]; + value += tmp[col]; + if (std::abs(value) <= sparse_threshold) + { + hs_arrays.dHRz_sparse[cspin][R_coor][row].erase(col); + } + } + } + } + } + } + } + } + count++; + } +} +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h new file mode 100644 index 0000000000..77976aad78 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h @@ -0,0 +1,65 @@ +#pragma once +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_HS_arrays.hpp" +#include "source_base/abfs-vector3_order.h" +#include "gint.h" +#include "gint_info.h" + +namespace ModuleGint +{ + +class Gint_dvlocal : public Gint +{ + public: + Gint_dvlocal( + const double* vr_eff, + const int nspin, + const int npol) + : vr_eff_(vr_eff), nspin_(nspin), npol_(npol), dr3_(gint_info_->get_mgrid_volume()) + { + assert(nspin_ == 2); // currently only npin == 2 is supported + } + + void cal_dvlocal(); + + void cal_dvlocal_R_sparseMatrix( + const int nspin, + const int cspin, + const int nlocal, + const double sparse_thr, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays); + + private: + void init_hr_gint_(); + + void cal_hr_gint_(); + + void distribute_pvdpR_sparseMatrix( + const int cspin, + const int dim, + const int nlocal, + const double sparse_threshold, + const std::map, + std::map>>& + pvdpR_sparseMatrix, + const Parallel_Orbitals& pv, + LCAO_HS_Arrays& HS_Arrays); + + // input + const double* vr_eff_; + int nspin_; + int npol_; + + // intermediate variables + double dr3_; + HContainer pvdpRx; + HContainer pvdpRy; + HContainer pvdpRz; +}; + +} \ No newline at end of file diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp new file mode 100644 index 0000000000..71fabbd703 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp @@ -0,0 +1,48 @@ +#include "gint_env_gamma.h" +#include "gint_common.h" +#include "phi_operator.h" + +namespace ModuleGint +{ + +Gint_env_gamma::Gint_env_gamma( + const double* psid, + const Parallel_Orbitals* pv, + const int nbands, + const int nlocal, + double* rho) + :rho_(rho) +{ + wfc_gint_.resize(nbands * gint_info_->get_lgd()); + wfc_2d_to_gint(psid, nbands, nlocal, *pv, wfc_gint_.data(), *gint_info_); +} + +void Gint_env_gamma::cal_env_band(const int iband) +{ + ModuleBase::TITLE("Gint", "cal_gint_env"); + ModuleBase::timer::tick("Gint", "cal_gint_env"); + ModuleBase::GlobalFunc::ZEROS(rho_, gint_info_->get_local_mgrid_num()); + const double* wfc_gint_band = &wfc_gint_[iband * gint_info_->get_lgd()]; +#pragma omp parallel + { + PhiOperator phi_op; + std::vector phi; +#pragma omp for schedule(dynamic) + for(const auto& biggrid: gint_info_->get_biggrids()) + { + if(biggrid->get_atoms().empty()) + { + continue; + } + phi_op.set_bgrid(biggrid); + const int phi_len = phi_op.get_rows() * phi_op.get_cols(); + phi.resize(phi_len); + phi_op.set_phi(phi.data()); + phi_op.cal_env_gamma(phi.data(), wfc_gint_band, gint_info_->get_trace_lo(), rho_); + } + } + ModuleBase::timer::tick("Gint", "cal_gint_env"); +} + + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h new file mode 100644 index 0000000000..6ba3dca4fa --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" + +namespace ModuleGint +{ 
+ +class Gint_env_gamma : public Gint +{ + public: + Gint_env_gamma( + const double* psid, + const Parallel_Orbitals* pv, + const int nbands, + const int nlocal, + double* rho); + + void cal_env_band(const int iband); + + private: + // output + double* rho_; + + // intermediate variable + std::vector wfc_gint_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp new file mode 100644 index 0000000000..b92ed8ddfc --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp @@ -0,0 +1,54 @@ +#include "gint_env_k.h" +#include "gint_common.h" +#include "phi_operator.h" + +namespace ModuleGint +{ + +Gint_env_k::Gint_env_k( + const std::complex* psid, + const Parallel_Orbitals* pv, + const std::vector& kvec_c, + const std::vector& kvec_d, + const int nbands, + const int nlocal, + const int ik, + const int nspin, + const int npol, + double* rho) + :kvec_c_(kvec_c), kvec_d_(kvec_d), ik_(ik), nspin_(nspin), npol_(npol), rho_(rho) +{ + wfc_gint_.resize(nbands * gint_info_->get_lgd()); + wfc_2d_to_gint(psid, nbands, nlocal, *pv, wfc_gint_.data(), *gint_info_); +} + +void Gint_env_k::cal_env_band(const int iband) +{ + ModuleBase::TITLE("Gint", "cal_gint_env"); + ModuleBase::timer::tick("Gint", "cal_gint_env"); + ModuleBase::GlobalFunc::ZEROS(rho_, gint_info_->get_local_mgrid_num()); + const std::complex* wfc_gint_band = &wfc_gint_[iband * gint_info_->get_lgd()]; +#pragma omp parallel + { + PhiOperator phi_op; + std::vector phi; +#pragma omp for schedule(dynamic) + for(const auto& biggrid: gint_info_->get_biggrids()) + { + if(biggrid->get_atoms().empty()) + { + continue; + } + phi_op.set_bgrid(biggrid); + const int phi_len = phi_op.get_rows() * phi_op.get_cols(); + phi.resize(phi_len); + phi_op.set_phi(phi.data()); + phi_op.cal_env_k(phi.data(), wfc_gint_band, gint_info_->get_trace_lo(), ik_, nspin_, + npol_, gint_info_->get_lgd(), 
kvec_c_, kvec_d_, rho_); + } + } + ModuleBase::timer::tick("Gint", "cal_gint_env"); +} + + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h new file mode 100644 index 0000000000..4d1232e591 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" + +namespace ModuleGint +{ + +class Gint_env_k : public Gint +{ + public: + Gint_env_k( + const std::complex* psid, + const Parallel_Orbitals* pv, + const std::vector& kvec_c, + const std::vector& kvec_d, + const int nbands, + const int nlocal, + const int ik, + const int nspin, + const int npol, + double* rho); + + void cal_env_band(const int iband); + + private: + // input + const std::vector& kvec_c_; + const std::vector& kvec_d_; + int ik_; + int nspin_; + int npol_; + + // output + double* rho_; + + // intermediate variable + std::vector> wfc_gint_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp index 01fd6de0ab..3fc9bde005 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void Gint_fvl::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_fvl"); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); } void Gint_fvl::init_dm_gint_() @@ -64,7 +67,7 @@ void Gint_fvl::cal_fvl_svl_() for (int is = 0; is < nspin_; is++) { phi_op.phi_mul_vldr3(vr_eff_[is], dr3_, phi.data(), 
phi_vldr3.data()); - phi_op.phi_mul_dm(phi_vldr3.data(), *dm_gint_vec_[is], false, phi_vldr3_dm.data()); + phi_op.phi_mul_dm(phi_vldr3.data(), dm_gint_vec_[is], false, phi_vldr3_dm.data()); if(isforce_) { phi_op.phi_dot_dphi(phi_vldr3_dm.data(), dphi_x.data(), dphi_y.data(), dphi_z.data(), fvl_thread); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h index 013c7b2e0a..9e225fed0f 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h @@ -23,9 +23,9 @@ class Gint_fvl : public Gint ModuleBase::matrix* svl) : nspin_(nspin), vr_eff_(vr_eff), dm_vec_(dm_vec), isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), - dr3_(gint_info_->get_mgrid_volume()) {}; + dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); @@ -44,7 +44,7 @@ class Gint_fvl : public Gint ModuleBase::matrix* svl_; // intermediate variables - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; double dr3_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp new file mode 100644 index 0000000000..1d90304d2c --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp @@ -0,0 +1,134 @@ +#include "gint_fvl_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_fvl_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_fvl"); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); + init_dm_gint_(); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); + cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); +} + +void Gint_fvl_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + 
dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void Gint_fvl_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + vr_eff_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + vr_eff_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_vec_[is].get_device_ptr(), vr_eff_[is], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } + if (isforce_) + { + fvl_d_ = CudaMemWrapper(gint_info_->get_nat() * 3, 0, true); + } + if (isstress_) + { + svl_d_ = CudaMemWrapper(6, 0, true); + } +} + +void Gint_fvl_gpu::transfer_gpu_to_cpu_() +{ + if (isforce_) + { + fvl_d_.copy_device_to_host_sync(); + for (int iat = 0; iat < gint_info_->get_nat(); iat++) + { + for (int j = 0; j < 3; j++) + { + fvl_[0](iat, j) += fvl_d_.get_host_ptr()[iat * 3 + j]; + } + } + } + if (isstress_) + { + svl_d_.copy_device_to_host_sync(); + svl_[0](0, 0) += svl_d_.get_host_ptr()[0]; + svl_[0](0, 1) += svl_d_.get_host_ptr()[1]; + svl_[0](0, 2) += svl_d_.get_host_ptr()[2]; + svl_[0](1, 1) += svl_d_.get_host_ptr()[3]; + svl_[0](1, 2) += svl_d_.get_host_ptr()[4]; + svl_[0](2, 2) += svl_d_.get_host_ptr()[5]; + } +} + +void Gint_fvl_gpu::cal_fvl_svl_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. 
+ checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), + dphi_y.get_device_ptr(), + dphi_z.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + const bool is_symm = false; + phi_op.phi_mul_vldr3(vr_eff_d_vec_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_dm(phi_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + dm_gint_vec_[is], is_symm, phi_vldr3_dm.get_device_ptr()); + if (isforce_) + { + phi_op.phi_dot_dphi(phi_vldr3_dm.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + dphi_z.get_device_ptr(), fvl_d_.get_device_ptr()); + } + if (isstress_) + { + phi_op.phi_dot_dphi_r(phi_vldr3_dm.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + dphi_z.get_device_ptr(), svl_d_.get_device_ptr()); + } + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h new file mode 100644 index 0000000000..6d3d341e64 --- 
/dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "source_base/matrix.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_fvl_gpu : public Gint +{ + public: + Gint_fvl_gpu( + const int nspin, + const std::vector& vr_eff, + const std::vector*>& dm_vec, + const bool isforce, + const bool isstress, + ModuleBase::matrix* fvl, + ModuleBase::matrix* svl) + : nspin_(nspin), vr_eff_(vr_eff), dm_vec_(dm_vec), + isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), + dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + void init_dm_gint_(); + + void cal_fvl_svl_(); + + void transfer_cpu_to_gpu_(); + void transfer_gpu_to_cpu_(); + // input + const int nspin_; + std::vector vr_eff_; + std::vector*> dm_vec_; + const bool isforce_; + const bool isstress_; + + // output + ModuleBase::matrix* fvl_; + ModuleBase::matrix* svl_; + + // intermediate variables + std::vector> dm_gint_vec_; + + double dr3_; + + // GPU memory + std::vector> vr_eff_d_vec_; + std::vector> dm_gint_d_vec_; + CudaMemWrapper fvl_d_; + CudaMemWrapper svl_d_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp index 15ca44b041..3299600c99 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void Gint_fvl_meta::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_fvl"); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); 
cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); } void Gint_fvl_meta::init_dm_gint_() @@ -93,10 +96,10 @@ void Gint_fvl_meta::cal_fvl_svl_() phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_x.data(), dphi_x_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_y.data(), dphi_y_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_z.data(), dphi_z_vldr3.data()); - phi_op.phi_mul_dm(phi_vldr3.data(), *dm_gint_vec_[is], false, phi_vldr3_dm.data()); - phi_op.phi_mul_dm(dphi_x_vldr3.data(), *dm_gint_vec_[is], false, dphi_x_vldr3_dm.data()); - phi_op.phi_mul_dm(dphi_y_vldr3.data(), *dm_gint_vec_[is], false, dphi_y_vldr3_dm.data()); - phi_op.phi_mul_dm(dphi_z_vldr3.data(), *dm_gint_vec_[is], false, dphi_z_vldr3_dm.data()); + phi_op.phi_mul_dm(phi_vldr3.data(), dm_gint_vec_[is], false, phi_vldr3_dm.data()); + phi_op.phi_mul_dm(dphi_x_vldr3.data(), dm_gint_vec_[is], false, dphi_x_vldr3_dm.data()); + phi_op.phi_mul_dm(dphi_y_vldr3.data(), dm_gint_vec_[is], false, dphi_y_vldr3_dm.data()); + phi_op.phi_mul_dm(dphi_z_vldr3.data(), dm_gint_vec_[is], false, dphi_z_vldr3_dm.data()); if(isforce_) { phi_op.phi_dot_dphi(phi_vldr3_dm.data(), dphi_x.data(), dphi_y.data(), dphi_z.data(), fvl_thread); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h index 636bbc47b5..1abeac9d11 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h @@ -23,9 +23,9 @@ class Gint_fvl_meta : public Gint ModuleBase::matrix* svl) : nspin_(nspin), vr_eff_(vr_eff), vofk_(vofk), dm_vec_(dm_vec), isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), - dr3_(gint_info_->get_mgrid_volume()) {}; + dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); @@ -45,7 +45,7 @@ class Gint_fvl_meta : public Gint ModuleBase::matrix* svl_; // intermediate 
variables - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; double dr3_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp new file mode 100644 index 0000000000..fa19925d04 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp @@ -0,0 +1,182 @@ +#include "gint_fvl_meta_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_fvl_meta_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_fvl"); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); + init_dm_gint_(); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); + cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); +} + +void Gint_fvl_meta_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void Gint_fvl_meta_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + vr_eff_d_vec_.resize(nspin_); + vofk_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + vr_eff_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_vec_[is].get_device_ptr(), vr_eff_[is], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + vofk_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vofk_d_vec_[is].get_device_ptr(), vofk_[is], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } + if (isforce_) + { + fvl_d_ = 
CudaMemWrapper(gint_info_->get_nat() * 3, 0, true); + } + if (isstress_) + { + svl_d_ = CudaMemWrapper(6, 0, true); + } +} + +void Gint_fvl_meta_gpu::transfer_gpu_to_cpu_() +{ + if (isforce_) + { + fvl_d_.copy_device_to_host_sync(); + for (int iat = 0; iat < gint_info_->get_nat(); iat++) + { + for (int j = 0; j < 3; j++) + { + fvl_[0](iat, j) += fvl_d_.get_host_ptr()[iat * 3 + j]; + } + } + } + if (isstress_) + { + svl_d_.copy_device_to_host_sync(); + svl_[0](0, 0) += svl_d_.get_host_ptr()[0]; + svl_[0](0, 1) += svl_d_.get_host_ptr()[1]; + svl_[0](0, 2) += svl_d_.get_host_ptr()[2]; + svl_[0](1, 1) += svl_d_.get_host_ptr()[3]; + svl_[0](1, 2) += svl_d_.get_host_ptr()[4]; + svl_[0](2, 2) += svl_d_.get_host_ptr()[5]; + } +} + +void Gint_fvl_meta_gpu::cal_fvl_svl_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3_dm(BatchBigGrid::get_max_phi_len(), 
stream, false); + CudaMemWrapper dphi_z_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_xx(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_xy(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_xz(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_yy(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_yz(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_zz(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), + dphi_y.get_device_ptr(), + dphi_z.get_device_ptr()); + phi_op.set_ddphi(ddphi_xx.get_device_ptr(), ddphi_xy.get_device_ptr(), + ddphi_xz.get_device_ptr(), ddphi_yy.get_device_ptr(), + ddphi_yz.get_device_ptr(), ddphi_zz.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + const bool is_symm = false; + phi_op.phi_mul_vldr3(vr_eff_d_vec_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_vec_[is].get_device_ptr(), dr3_, + dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_vec_[is].get_device_ptr(), dr3_, + dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_vec_[is].get_device_ptr(), dr3_, + dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); + phi_op.phi_mul_dm(phi_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + dm_gint_vec_[is], is_symm, phi_vldr3_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_x_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + dm_gint_vec_[is], is_symm, dphi_x_vldr3_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_y_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + 
dm_gint_vec_[is], is_symm, dphi_y_vldr3_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_z_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + dm_gint_vec_[is], is_symm, dphi_z_vldr3_dm.get_device_ptr()); + if (isforce_) + { + phi_op.phi_dot_dphi(phi_vldr3_dm.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + dphi_z.get_device_ptr(), fvl_d_.get_device_ptr()); + phi_op.phi_dot_dphi(dphi_x_vldr3_dm.get_device_ptr(), + ddphi_xx.get_device_ptr(), ddphi_xy.get_device_ptr(), + ddphi_xz.get_device_ptr(), fvl_d_.get_device_ptr()); + phi_op.phi_dot_dphi(dphi_y_vldr3_dm.get_device_ptr(), + ddphi_xy.get_device_ptr(), ddphi_yy.get_device_ptr(), + ddphi_yz.get_device_ptr(), fvl_d_.get_device_ptr()); + phi_op.phi_dot_dphi(dphi_z_vldr3_dm.get_device_ptr(), + ddphi_xz.get_device_ptr(), ddphi_yz.get_device_ptr(), + ddphi_zz.get_device_ptr(), fvl_d_.get_device_ptr()); + } + if (isstress_) + { + phi_op.phi_dot_dphi_r(phi_vldr3_dm.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + dphi_z.get_device_ptr(), svl_d_.get_device_ptr()); + phi_op.phi_dot_dphi_r(dphi_x_vldr3_dm.get_device_ptr(), + ddphi_xx.get_device_ptr(), ddphi_xy.get_device_ptr(), + ddphi_xz.get_device_ptr(), svl_d_.get_device_ptr()); + phi_op.phi_dot_dphi_r(dphi_y_vldr3_dm.get_device_ptr(), + ddphi_xy.get_device_ptr(), ddphi_yy.get_device_ptr(), + ddphi_yz.get_device_ptr(), svl_d_.get_device_ptr()); + phi_op.phi_dot_dphi_r(dphi_z_vldr3_dm.get_device_ptr(), + ddphi_xz.get_device_ptr(), ddphi_yz.get_device_ptr(), + ddphi_zz.get_device_ptr(), svl_d_.get_device_ptr()); + } + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h new file mode 100644 index 0000000000..22baba9d6d --- /dev/null +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h @@ -0,0 +1,64 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "source_base/matrix.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ +class Gint_fvl_meta_gpu : public Gint +{ + public: + Gint_fvl_meta_gpu( + const int nspin, + const std::vector& vr_eff, + const std::vector& vofk, + const std::vector*>& dm_vec, + const bool isforce, + const bool isstress, + ModuleBase::matrix* fvl, + ModuleBase::matrix* svl) + : nspin_(nspin), vr_eff_(vr_eff), vofk_(vofk), dm_vec_(dm_vec), + isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), + dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + void init_dm_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_fvl_svl_(); + + // input + const int nspin_; + std::vector vr_eff_; + std::vector vofk_; + std::vector*> dm_vec_; + const bool isforce_; + const bool isstress_; + + // output + ModuleBase::matrix* fvl_; + ModuleBase::matrix* svl_; + + // intermediate variables + std::vector> dm_gint_vec_; + + double dr3_; + + std::vector> vr_eff_d_vec_; + std::vector> vofk_d_vec_; + std::vector> dm_gint_d_vec_; + CudaMemWrapper fvl_d_; + CudaMemWrapper svl_d_; +}; + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h index 687c37df50..a017f81ba0 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h @@ -5,17 +5,13 @@ #include "gint_type.h" #include "source_base/timer.h" -template -std::shared_ptr toConstSharedPtr(std::shared_ptr ptr) { - return std::static_pointer_cast(ptr); -} - - +namespace ModuleGint +{ inline int 
index3Dto1D(const int id_x, const int id_y, const int id_z, const int dim_x, const int dim_y, const int dim_z) { return id_z + id_y * dim_z + id_x * dim_y * dim_z; -}; +} inline Vec3i index1Dto3D(const int index_1d, const int dim_x, const int dim_y, const int dim_z) @@ -24,7 +20,7 @@ inline Vec3i index1Dto3D(const int index_1d, int id_y = (index_1d - id_x * dim_y * dim_z) / dim_z; int id_z = index_1d % dim_z; return Vec3i(id_x, id_y, id_z); -}; +} // if exponent is an integer between 0 and 5 (the most common cases in gint) and // and exp is a variable that cannot be determined at compile time (which means the compiler cannot optimize the code), @@ -49,15 +45,17 @@ inline double pow_int(const double base, const int exp) double result = std::pow(base, exp); return result; } -}; +} inline int floor_div(const int a, const int b) { // a ^ b < 0 means a and b have different signs return a / b - (a % b != 0 && (a ^ b) < 0); -}; +} inline int ceil_div(const int a, const int b) { return a / b + (a % b != 0 && (a ^ b) > 0); -}; \ No newline at end of file +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp index 11f2a5d59e..b0738e28e4 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp @@ -41,25 +41,38 @@ GintInfo::GintInfo( biggrids_.push_back(std::make_shared(i)); } - // initialize the atoms + // initialize the atoms and the numerical orbital init_atoms_(ucell_->ntype, ucell_->atoms, Phi); + // initialize trace_lo_ and lgd_ + init_trace_lo_(ucell, PARAM.inp.nspin); + // initialize the ijr_info // this step needs to be done after init_atoms_, because it requires the information of is_atom_on_bgrid init_ijr_info_(ucell, gd); + + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + streams_num_ = PARAM.inp.nstream; // the default value of num_stream is 4 + const int 
batch_size = nbz_local; + init_bgrid_batches_(batch_size); + gpu_vars_ = std::make_shared(biggrid_info_, ucell, Phi); + } + #endif } template -std::shared_ptr> GintInfo::get_hr(int npol) const +HContainer GintInfo::get_hr(int npol) const { - auto p_hr = std::make_shared>(ucell_->nat); + auto hr = HContainer(ucell_->nat); if(PARAM.inp.gamma_only) { - p_hr->fix_gamma(); + hr.fix_gamma(); } - p_hr->insert_ijrs(&ijr_info_, *ucell_, npol); - p_hr->allocate(nullptr, true); - return p_hr; + hr.insert_ijrs(&ijr_info_, *ucell_, npol); + hr.allocate(nullptr, true); + return hr; } void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital* Phi) @@ -68,12 +81,14 @@ void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital int iat = 0; is_atom_in_proc_.resize(ucell_->nat, false); atoms_.resize(ucell_->nat); + orbs_.resize(ntype); // TODO: USE OPENMP TO PARALLELIZE THIS LOOP for(int i = 0; i < ntype; i++) { const auto& atom = atoms[i]; - const auto *orb = &Phi[i]; + orbs_[i] = Phi[i]; + const auto *orb = &orbs_[i]; // rcut extends to the maximum big grids in x, y, z directions Vec3i ext_bgrid = biggrid_info_->max_ext_bgrid_num(atom.Rcut); @@ -124,7 +139,7 @@ void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital atom_bgrid_idx.y - ucell_idx_bgrid.y * unitcell_info_->get_nby(), atom_bgrid_idx.z - ucell_idx_bgrid.z * unitcell_info_->get_nbz()); r_to_atom.insert(std::make_pair(ucell_idx_relative, - GintAtom(&atom, j, iat, ext_atom_bgrid_idx, ucell_idx_relative, tau_in_biggrid, orb))); + GintAtom(&atom, i, j, iat, ext_atom_bgrid_idx, ucell_idx_relative, tau_in_biggrid, orb, ucell_))); } if(biggrids_[bgrid_local_idx]->is_atom_on_bgrid(&r_to_atom.at(ucell_idx_relative))) { @@ -140,6 +155,47 @@ void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital ModuleBase::timer::tick("GintInfo", "init_atoms"); } +void GintInfo::init_trace_lo_(const UnitCell& ucell, const int nspin) +{ + this->trace_lo_ 
= std::vector(PARAM.globalv.nlocal, -1); + this->lgd_ = 0; + int iat = 0; + int iw_all = 0; + int iw_local = 0; + for (int it = 0; it < ucell.ntype; it++) + { + for (int ia = 0; ia < ucell.atoms[it].na; ia++) + { + if (is_atom_in_proc_[iat]) + { + int nw0 = ucell.atoms[it].nw; + if (nspin== 4) + { // added by zhengdy-soc, need to be double in soc + nw0 *= 2; + this->lgd_ += nw0; + } else { + this->lgd_ += nw0; + } + + for (int iw = 0; iw < nw0; iw++) + { + this->trace_lo_[iw_all] = iw_local; + ++iw_local; + ++iw_all; + } + } else { + // global index of atomic orbitals + iw_all += ucell.atoms[it].nw; + if (nspin == 4) + { + iw_all += ucell.atoms[it].nw; + } + } + ++iat; + } + } +} + void GintInfo::init_ijr_info_(const UnitCell& ucell, Grid_Driver& gd) { HContainer hr_gint_local(ucell.nat); @@ -207,6 +263,22 @@ void GintInfo::init_ijr_info_(const UnitCell& ucell, Grid_Driver& gd) return; } -template std::shared_ptr> GintInfo::get_hr(int npol) const; -template std::shared_ptr>> GintInfo::get_hr>(int npol) const; +#ifdef __CUDA +void GintInfo::init_bgrid_batches_(int batch_size) +{ + for (int i = 0; i < biggrids_.size(); i += batch_size) + { + std::vector> bgrid_vec; + for(int j = i; j < i + batch_size && j < biggrids_.size(); j++) + { + bgrid_vec.push_back(biggrids_[j]); + } + auto bgrid_batch = std::make_shared(bgrid_vec); + bgrid_batches_.push_back(bgrid_batch); + } +} +#endif + +template HContainer GintInfo::get_hr(int npol) const; +template HContainer> GintInfo::get_hr>(int npol) const; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h index c234ec165c..88f9b7c6bc 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h @@ -13,6 +13,11 @@ #include "localcell_info.h" #include "divide_info.h" +#ifdef __CUDA +#include "batch_biggrid.h" +#include 
"module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h" +#endif + namespace ModuleGint { @@ -29,20 +34,26 @@ class GintInfo const UnitCell& ucell, Grid_Driver& gd); // getter functions - std::vector> get_biggrids() const { return biggrids_; }; - double get_local_mgrid_num() const { return localcell_info_->get_mgrids_num(); }; - double get_mgrid_volume() const { return meshgrid_info_->get_volume(); }; + const std::vector>& get_biggrids() { return biggrids_; } + const std::vector& get_trace_lo() const{ return trace_lo_; } + int get_lgd() const { return lgd_; } + int get_nat() const { return ucell_->nat; } // return the number of atoms in the unitcell + int get_local_mgrid_num() const { return localcell_info_->get_mgrids_num(); } + double get_mgrid_volume() const { return meshgrid_info_->get_volume(); } //========================================= // functions about hcontainer //========================================= template - std::shared_ptr> get_hr(int npol = 1) const; + HContainer get_hr(int npol = 1) const; private: // initialize the atoms void init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital* Phi); + // initialize trace_lo_ and lgd_ + void init_trace_lo_(const UnitCell& ucell, const int nspin); + // initialize the ijr_info void init_ijr_info_(const UnitCell& ucell, Grid_Driver& gd); @@ -77,6 +88,30 @@ class GintInfo // format for storing atomic pair information in hcontainer, used for initializing hcontainer std::vector ijr_info_; + + // map the global index of atomic orbitals to local index + std::vector trace_lo_; + + // store the information about Numerical orbitals + std::vector orbs_; + + // total num of atomic orbitals on this proc + int lgd_ = 0; + + #ifdef __CUDA + public: + std::vector>& get_bgrid_batches() { return bgrid_batches_; }; + std::shared_ptr get_gpu_vars() const { return gpu_vars_; }; + int get_dev_id() const { return gpu_vars_->dev_id_; }; + int get_streams_num() const { return streams_num_; }; + + private: + 
void init_bgrid_batches_(int batch_size); + std::vector> bgrid_batches_; + std::shared_ptr gpu_vars_; + // More streams can improve parallelism and may speed up grid integration, at the cost of higher GPU memory usage. + int streams_num_; + #endif }; } // namespace ModuleGint diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index bd945b8b19..a66b061ab3 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -1,5 +1,6 @@ #include "gint_interface.h" #include "source_base/timer.h" +#include "module_parameter/parameter.h" #include "gint_vl.h" #include "gint_vl_metagga.h" #include "gint_vl_nspin4.h" @@ -9,6 +10,17 @@ #include "gint_rho.h" #include "gint_tau.h" +#ifdef __CUDA +#include "gint_vl_gpu.h" +#include "gint_rho_gpu.h" +#include "gint_fvl_gpu.h" +#include "gint_vl_nspin4_gpu.h" +#include "gint_vl_metagga_gpu.h" +#include "gint_vl_metagga_nspin4_gpu.h" +#include "gint_tau_gpu.h" +#include "gint_fvl_meta_gpu.h" +#endif + namespace ModuleGint { @@ -16,20 +28,35 @@ void cal_gint_vl( const double* vr_eff, HContainer* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl"); - Gint_vl gint_vl(vr_eff, hR); - gint_vl.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_vl"); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_gpu gint_vl(vr_eff, hR); + gint_vl.cal_gint(); + } else +#endif + { + Gint_vl gint_vl(vr_eff, hR); + gint_vl.cal_gint(); + } } +// nspin == 4 case void cal_gint_vl( std::vector vr_eff, HContainer>* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl"); - Gint_vl_nspin4 gint_vl_nspin4(vr_eff, hR); - gint_vl_nspin4.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_vl"); + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_nspin4_gpu gint_vl_nspin4(vr_eff, hR); + gint_vl_nspin4.cal_gint(); + } else + #endif + { + Gint_vl_nspin4 
gint_vl_nspin4(vr_eff, hR); + gint_vl_nspin4.cal_gint(); + } } void cal_gint_vl_metagga( @@ -37,32 +64,55 @@ void cal_gint_vl_metagga( const double* vfork, HContainer* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); - Gint_vl_metagga gint_vl_metagga(vr_eff, vfork, hR); - gint_vl_metagga.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_metagga_gpu gint_vl_metagga(vr_eff, vfork, hR); + gint_vl_metagga.cal_gint(); + } else +#endif + { + Gint_vl_metagga gint_vl_metagga(vr_eff, vfork, hR); + gint_vl_metagga.cal_gint(); + } } +// nspin == 4 case void cal_gint_vl_metagga( std::vector vr_eff, std::vector vofk, HContainer>* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); - Gint_vl_metagga_nspin4 gint_vl_metagga_nspin4(vr_eff, vofk, hR); - gint_vl_metagga_nspin4.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_metagga_nspin4_gpu gint_vl_metagga_nspin4(vr_eff, vofk, hR); + gint_vl_metagga_nspin4.cal_gint(); + } else +#endif + { + Gint_vl_metagga_nspin4 gint_vl_metagga_nspin4(vr_eff, vofk, hR); + gint_vl_metagga_nspin4.cal_gint(); + } } void cal_gint_rho( const std::vector*>& dm_vec, const int nspin, - double **rho) + double **rho, + bool is_dm_symm) { - ModuleBase::timer::tick("Gint", "cal_gint_rho"); - Gint_rho gint_rho(dm_vec, nspin, rho); - gint_rho.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_rho"); + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_rho_gpu gint_rho(dm_vec, nspin, rho, is_dm_symm); + gint_rho.cal_gint(); + } else + #endif + { + Gint_rho gint_rho(dm_vec, nspin, rho, is_dm_symm); + gint_rho.cal_gint(); + } } void cal_gint_tau( @@ -70,10 +120,17 @@ void cal_gint_tau( const int nspin, double** tau) { - ModuleBase::timer::tick("Gint", "cal_gint_tau"); - Gint_tau gint_tau(dm_vec, nspin, tau); - gint_tau.cal_gint(); - ModuleBase::timer::tick("Gint", 
"cal_gint_tau"); + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_tau_gpu gint_tau(dm_vec, nspin, tau); + gint_tau.cal_gint(); + } else + #endif + { + Gint_tau gint_tau(dm_vec, nspin, tau); + gint_tau.cal_gint(); + } } void cal_gint_fvl( @@ -85,10 +142,17 @@ void cal_gint_fvl( ModuleBase::matrix* fvl, ModuleBase::matrix* svl) { - ModuleBase::timer::tick("Gint", "cal_gint_fvl"); - Gint_fvl gint_fvl(nspin, vr_eff, dm_vec, isforce, isstress, fvl, svl); - gint_fvl.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_fvl"); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_fvl_gpu gint_fvl_gpu(nspin, vr_eff, dm_vec, isforce, isstress, fvl, svl); + gint_fvl_gpu.cal_gint(); + } else +#endif + { + Gint_fvl gint_fvl(nspin, vr_eff, dm_vec, isforce, isstress, fvl, svl); + gint_fvl.cal_gint(); + } } void cal_gint_fvl_meta( @@ -101,10 +165,36 @@ void cal_gint_fvl_meta( ModuleBase::matrix* fvl, ModuleBase::matrix* svl) { - ModuleBase::timer::tick("Gint", "cal_gint_fvl_meta"); - Gint_fvl_meta gint_fvl_meta(nspin, vr_eff, vofk, dm_vec, isforce, isstress, fvl, svl); - gint_fvl_meta.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_fvl_meta"); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_fvl_meta_gpu gint_fvl_meta(nspin, vr_eff, vofk, dm_vec, isforce, isstress, fvl, svl); + gint_fvl_meta.cal_gint(); + } else +#endif + { + Gint_fvl_meta gint_fvl_meta(nspin, vr_eff, vofk, dm_vec, isforce, isstress, fvl, svl); + gint_fvl_meta.cal_gint(); + } +} + +void cal_dvlocal_R_sparseMatrix( + const int nspin, + const int npol, + const int current_spin, + const int nlocal, + const double sparse_thr, + const double* vr_eff, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays) +{ + Gint_dvlocal gint_dvlocal(vr_eff, nspin, npol); + gint_dvlocal.cal_dvlocal(); + gint_dvlocal.cal_dvlocal_R_sparseMatrix( + nspin, current_spin, nlocal, sparse_thr, + pv, ucell, gdriver, hs_arrays); } } // namespace 
ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h index cec6b12e01..f674e24011 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h @@ -2,7 +2,7 @@ #include #include "module_hamilt_lcao/module_hcontainer/hcontainer.h" #include "gint_type.h" - +#include "gint_dvlocal.h" namespace ModuleGint { @@ -28,7 +28,8 @@ void cal_gint_vl_metagga( void cal_gint_rho( const std::vector*>& dm_vec, const int nspin, - double **rho); + double **rho, + bool is_dm_symm = true); void cal_gint_tau( const std::vector*>& dm_vec, @@ -54,6 +55,17 @@ void cal_gint_fvl_meta( ModuleBase::matrix* fvl, ModuleBase::matrix* svl); +void cal_dvlocal_R_sparseMatrix( + const int nspin, + const int npol, + const int current_spin, + const int nlocal, + const double sparse_thr, + const double* vr_eff, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays); } // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp index 2924487c7e..c96b10a731 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void Gint_rho::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_rho"); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_rho_(); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); } void Gint_rho::init_dm_gint_() @@ -43,7 +46,7 @@ void Gint_rho::cal_rho_() phi_op.set_phi(phi.data()); for (int is = 0; is < nspin_; is++) { - 
phi_op.phi_mul_dm(phi.data(), *dm_gint_vec_[is], true, phi_dm.data()); + phi_op.phi_mul_dm(phi.data(), dm_gint_vec_[is], is_dm_symm_, phi_dm.data()); phi_op.phi_dot_phi(phi.data(), phi_dm.data(), rho_[is]); } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h index 6bd2b51030..e0a15edbdc 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h @@ -15,10 +15,11 @@ class Gint_rho : public Gint Gint_rho( const std::vector*>& dm_vec, const int nspin, - double **rho) - : dm_vec_(dm_vec), nspin_(nspin), rho_(rho) {}; + double **rho, + bool is_dm_symm = true) + : dm_vec_(dm_vec), nspin_(nspin), rho_(rho), is_dm_symm_(is_dm_symm) {} - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); @@ -28,14 +29,16 @@ class Gint_rho : public Gint // input const std::vector*> dm_vec_; const int nspin_; + + // if true, it means the DMR matrix is symmetric, + // which leads to faster computations compared to the asymmetric case. 
+ const bool is_dm_symm_; // output double **rho_; - //======================== // Intermediate variables - //======================== - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp new file mode 100644 index 0000000000..ca24002579 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp @@ -0,0 +1,86 @@ +#include "gint_rho_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_rho_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_rho"); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); + init_dm_gint_(); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); + cal_rho_(); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); +} + +void Gint_rho_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void Gint_rho_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + rho_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); + rho_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_rho_gpu::transfer_gpu_to_cpu_() +{ + for (int is = 0; is < nspin_; is++) + { + checkCuda(cudaMemcpy(rho_[is], rho_d_vec_[is].get_device_ptr(), + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyDeviceToHost)); + } +} + +void Gint_rho_gpu::cal_rho_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + 
// 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_dm(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi(phi.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + phi_op.phi_mul_dm(phi.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], + is_dm_symm_, phi_dm.get_device_ptr()); + phi_op.phi_dot_phi(phi.get_device_ptr(), phi_dm.get_device_ptr(), rho_d_vec_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h new file mode 100644 index 0000000000..13db0f5a85 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_rho_gpu: public Gint +{ + public: + Gint_rho_gpu( + const std::vector*>& dm_vec, + const int nspin, + double **rho, + bool is_dm_symm = true) + : dm_vec_(dm_vec), nspin_(nspin), rho_(rho), is_dm_symm_(is_dm_symm) {} + + void cal_gint(); + + private: + void init_dm_gint_(); + + void cal_rho_(); + + void 
transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + // input + const std::vector*> dm_vec_; + const int nspin_; + + // if true, it means the DMR matrix is symmetric, + // which leads to faster computations compared to the asymmetric case. + const bool is_dm_symm_; + + // output + double **rho_; + + // Intermediate variables + std::vector> dm_gint_vec_; + + std::vector> dm_gint_d_vec_; + std::vector> rho_d_vec_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp index f5d0b70a0c..1b5e282384 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void Gint_tau::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_tau"); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_tau_(); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); } void Gint_tau::init_dm_gint_() @@ -51,9 +54,9 @@ void Gint_tau::cal_tau_() phi_op.set_phi_dphi(nullptr, dphi_x.data(), dphi_y.data(), dphi_z.data()); for (int is = 0; is < nspin_; is++) { - phi_op.phi_mul_dm(dphi_x.data(), *dm_gint_vec_[is], true, dphi_x_dm.data()); - phi_op.phi_mul_dm(dphi_y.data(), *dm_gint_vec_[is], true, dphi_y_dm.data()); - phi_op.phi_mul_dm(dphi_z.data(), *dm_gint_vec_[is], true, dphi_z_dm.data()); + phi_op.phi_mul_dm(dphi_x.data(), dm_gint_vec_[is], true, dphi_x_dm.data()); + phi_op.phi_mul_dm(dphi_y.data(), dm_gint_vec_[is], true, dphi_y_dm.data()); + phi_op.phi_mul_dm(dphi_z.data(), dm_gint_vec_[is], true, dphi_z_dm.data()); phi_op.phi_dot_phi(dphi_x.data(), dphi_x_dm.data(), kin_[is]); phi_op.phi_dot_phi(dphi_y.data(), dphi_y_dm.data(), kin_[is]); phi_op.phi_dot_phi(dphi_z.data(), dphi_z_dm.data(), kin_[is]); diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h index d36552a79e..b1d3b0664a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h @@ -15,9 +15,9 @@ class Gint_tau : public Gint const std::vector*>& dm_vec, const int nspin, double** tau) - : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {}; + : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {} - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); @@ -31,10 +31,8 @@ class Gint_tau : public Gint // output double **kin_; - //======================== // Intermediate variables - //======================== - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; }; } // namespace ModuleGint diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp new file mode 100644 index 0000000000..cbeeead322 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp @@ -0,0 +1,98 @@ +#include "gint_tau_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_tau_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_tau"); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); + init_dm_gint_(); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); + cal_tau_(); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); +} + +void Gint_tau_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void Gint_tau_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + kin_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); + kin_d_vec_[is] = 
CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_tau_gpu::transfer_gpu_to_cpu_() +{ + for (int is = 0; is < nspin_; is++) + { + checkCuda(cudaMemcpy(kin_[is], kin_d_vec_[is].get_device_ptr(), + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyDeviceToHost)); + } +} + +void Gint_tau_gpu::cal_tau_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_dm(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(nullptr, + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), dphi_z.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + const bool is_symm = true; + phi_op.phi_mul_dm(dphi_x.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], + is_symm, dphi_x_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_y.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], + is_symm, 
dphi_y_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_z.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], + is_symm, dphi_z_dm.get_device_ptr()); + phi_op.phi_dot_phi(dphi_x.get_device_ptr(), dphi_x_dm.get_device_ptr(), kin_d_vec_[is].get_device_ptr()); + phi_op.phi_dot_phi(dphi_y.get_device_ptr(), dphi_y_dm.get_device_ptr(), kin_d_vec_[is].get_device_ptr()); + phi_op.phi_dot_phi(dphi_z.get_device_ptr(), dphi_z_dm.get_device_ptr(), kin_d_vec_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h new file mode 100644 index 0000000000..bfac5a48a3 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h @@ -0,0 +1,47 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_tau_gpu : public Gint +{ + public: + Gint_tau_gpu( + const std::vector*>& dm_vec, + const int nspin, + double** tau) + : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {} + + void cal_gint(); + + private: + void init_dm_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_tau_(); + + // input + const std::vector*> dm_vec_; + const int nspin_; + + // output + double **kin_; + + // Intermediate variables + std::vector> dm_gint_vec_; + + std::vector> dm_gint_d_vec_; + std::vector> kin_d_vec_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h index 9cf623765b..4d1b2e8537 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h @@ -4,9 +4,12 @@ #include "source_base/vector3.h" #include "source_base/matrix3.h" -using Matrix3 = ModuleBase::Matrix3; -using Vec3d = ModuleBase::Vector3; -using Vec3i = ModuleBase::Vector3; +namespace ModuleGint +{ + using Matrix3 = ModuleBase::Matrix3; + using Vec3d = ModuleBase::Vector3; + using Vec3i = ModuleBase::Vector3; -template -using HContainer = hamilt::HContainer; \ No newline at end of file + template + using HContainer = hamilt::HContainer; +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp index b0107bf064..ee40327d72 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp @@ -9,10 +9,13 @@ namespace ModuleGint void Gint_vl::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_), hR_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } //======================== @@ -26,8 +29,6 @@ void Gint_vl::init_hr_gint_() void Gint_vl::cal_hr_gint_() { -// be careful!! 
-// each thread will have a copy of hr_gint_, this may cause a lot of memory usage #pragma omp parallel { PhiOperator phi_op; @@ -46,7 +47,7 @@ void Gint_vl::cal_hr_gint_() phi_vldr3.resize(phi_len); phi_op.set_phi(phi.data()); phi_op.phi_mul_vldr3(vr_eff_, dr3_, phi.data(), phi_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h index fa3f4b9888..fc717629c5 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h @@ -15,9 +15,9 @@ class Gint_vl : public Gint Gint_vl( const double* vr_eff, HContainer* hR) - : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: @@ -33,12 +33,10 @@ class Gint_vl : public Gint // output HContainer* hR_; - //======================== // Intermediate variables - //======================== double dr3_; - std::shared_ptr> hr_gint_; + HContainer hr_gint_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp new file mode 100644 index 0000000000..fe9162bc4e --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp @@ -0,0 +1,73 @@ +#include "gint_vl_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_vl_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); + init_hr_gint_(); + cal_hr_gint_(); + 
compose_hr_gint(hr_gint_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); +} + +void Gint_vl_gpu::init_hr_gint_() +{ + hr_gint_ = gint_info_->get_hr(); +} + +void Gint_vl_gpu::transfer_cpu_to_gpu_() +{ + hr_gint_d_ = CudaMemWrapper(hr_gint_.get_nnr(), 0, false); + vr_eff_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_.get_device_ptr(), vr_eff_, + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); +} + +void Gint_vl_gpu::transfer_gpu_to_cpu_() +{ + checkCuda(cudaMemcpy(hr_gint_.get_wrapper(), hr_gint_d_.get_device_ptr(), + hr_gint_.get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void Gint_vl_gpu::cal_hr_gint_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi(phi.get_device_ptr()); + phi_op.phi_mul_vldr3(vr_eff_d_.get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h new file mode 100644 index 0000000000..53290658c8 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_gpu : public Gint +{ + public: + Gint_vl_gpu( + const double* vr_eff, + HContainer* hR) + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_hr_gint_(); + + // input + const double* vr_eff_; + + + // output + HContainer* hR_; + + // Intermediate variables + double dr3_; + + HContainer hr_gint_; + + CudaMemWrapper hr_gint_d_; + CudaMemWrapper vr_eff_d_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp index fa651a89e1..2c885adca2 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp @@ -9,10 +9,13 @@ namespace ModuleGint void Gint_vl_metagga::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_), hR_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } //======================== @@ -26,8 +29,6 @@ void Gint_vl_metagga::init_hr_gint_() void Gint_vl_metagga::cal_hr_gint_() { -// be careful!! 
-// each thread will have a copy of hr_gint_, this may cause a lot of memory usage #pragma omp parallel { PhiOperator phi_op; @@ -61,10 +62,10 @@ void Gint_vl_metagga::cal_hr_gint_() phi_op.phi_mul_vldr3(vofk_, dr3_, dphi_x.data(), dphi_x_vldr3.data()); phi_op.phi_mul_vldr3(vofk_, dr3_, dphi_y.data(), dphi_y_vldr3.data()); phi_op.phi_mul_vldr3(vofk_, dr3_, dphi_z.data(), dphi_z_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_y.data(), dphi_y_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_y.data(), dphi_y_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h index 6ae267f7d4..01bef660a2 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h @@ -16,9 +16,9 @@ class Gint_vl_metagga : public Gint const double* vr_eff, const double* vofk, HContainer* hR) - : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: @@ -35,12 +35,10 @@ class Gint_vl_metagga : public Gint // output HContainer* hR_; - 
//======================== // Intermediate variables - //======================== double dr3_; - std::shared_ptr> hr_gint_; + HContainer hr_gint_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp new file mode 100644 index 0000000000..9c2dad8421 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp @@ -0,0 +1,98 @@ +#include "gint_vl_metagga_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_vl_metagga_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); + init_hr_gint_(); + cal_hr_gint_(); + compose_hr_gint(hr_gint_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); +} + +//======================== +// Private functions +//======================== + +void Gint_vl_metagga_gpu::init_hr_gint_() +{ + hr_gint_ = gint_info_->get_hr(); +} + +void Gint_vl_metagga_gpu::transfer_cpu_to_gpu_() +{ + hr_gint_d_ = CudaMemWrapper(hr_gint_.get_nnr(), 0, false); + vr_eff_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + vofk_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_.get_device_ptr(), vr_eff_, + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(vofk_d_.get_device_ptr(), vofk_, + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); +} + +void Gint_vl_metagga_gpu::transfer_gpu_to_cpu_() +{ + checkCuda(cudaMemcpy(hr_gint_.get_wrapper(), hr_gint_d_.get_device_ptr(), + hr_gint_.get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void Gint_vl_metagga_gpu::cal_hr_gint_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 
20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), dphi_z.get_device_ptr()); + phi_op.phi_mul_vldr3(vr_eff_d_.get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_.get_device_ptr(), dr3_, + dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_.get_device_ptr(), dr3_, + dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_.get_device_ptr(), dr3_, + dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi(dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi(dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr(), + hr_gint_, 
hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi(dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h new file mode 100644 index 0000000000..efdc01762a --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_metagga_gpu : public Gint +{ + public: + Gint_vl_metagga_gpu( + const double* vr_eff, + const double* vofk, + HContainer* hR) + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + // note that only the upper triangle matrix of hR is calculated + // that's why we need compose_hr_gint() to fill the lower triangle matrix. 
+ void cal_hr_gint_(); + + // input + const double* vr_eff_; + const double* vofk_; + + // output + HContainer* hR_; + + // Intermediate variables + double dr3_; + + HContainer hr_gint_; + + CudaMemWrapper hr_gint_d_; + CudaMemWrapper vr_eff_d_; + CudaMemWrapper vofk_d_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp index 986b182c09..5c6086031c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp @@ -11,10 +11,13 @@ namespace ModuleGint void Gint_vl_metagga_nspin4::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_full_), hR_); + transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } void Gint_vl_metagga_nspin4::init_hr_gint_() @@ -65,10 +68,10 @@ void Gint_vl_metagga_nspin4::cal_hr_gint_() phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_x.data(), dphi_x_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_y.data(), dphi_y_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_z.data(), dphi_z_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_y.data(), dphi_y_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + 
phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_y.data(), dphi_y_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h index 40bf386fa3..abdbde3f08 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h @@ -16,9 +16,9 @@ class Gint_vl_metagga_nspin4 : public Gint std::vector vr_eff, std::vector vofk, HContainer>* hR) - : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: void init_hr_gint_(); @@ -31,15 +31,13 @@ class Gint_vl_metagga_nspin4 : public Gint // output HContainer>* hR_; - //======================== // Intermediate variables - //======================== const double dr3_; const int nspin_ = 4; - std::vector>> hr_gint_part_; - std::shared_ptr>> hr_gint_full_; + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp new file mode 100644 index 0000000000..9adc4cb137 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp @@ -0,0 +1,113 @@ +#include "gint_vl_metagga_nspin4_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void 
Gint_vl_metagga_nspin4_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); + init_hr_gint_(); + cal_hr_gint_(); + compose_hr_gint(hr_gint_part_, hr_gint_full_); + transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); +} + +void Gint_vl_metagga_nspin4_gpu::init_hr_gint_() +{ + hr_gint_part_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + hr_gint_part_[i] = gint_info_->get_hr(); + } + const int npol = 2; + hr_gint_full_ = gint_info_->get_hr>(npol); +} + +void Gint_vl_metagga_nspin4_gpu::transfer_cpu_to_gpu_() +{ + vr_eff_d_.resize(nspin_); + vofk_d_.resize(nspin_); + hr_gint_part_d_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + hr_gint_part_d_[i] = CudaMemWrapper(hr_gint_part_[i].get_nnr(), 0, false); + vr_eff_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + vofk_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_[i].get_device_ptr(), vr_eff_[i], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(vofk_d_[i].get_device_ptr(), vofk_[i], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_vl_metagga_nspin4_gpu::transfer_gpu_to_cpu_() +{ + for(int i = 0; i < nspin_; i++) + { + checkCuda(cudaMemcpy(hr_gint_part_[i].get_wrapper(), hr_gint_part_d_[i].get_device_ptr(), + hr_gint_part_[i].get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); + } +} + +void Gint_vl_metagga_nspin4_gpu::cal_hr_gint_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. 
+ checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), dphi_z.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + phi_op.phi_mul_vldr3(vr_eff_d_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_[is].get_device_ptr(), dr3_, + dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_[is].get_device_ptr(), dr3_, + dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_[is].get_device_ptr(), dr3_, + dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi(dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi(dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr(), + hr_gint_part_[is], 
hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi(dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h new file mode 100644 index 0000000000..d38665dffa --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_metagga_nspin4_gpu : public Gint +{ + public: + Gint_vl_metagga_nspin4_gpu( + std::vector vr_eff, + std::vector vofk, + HContainer>* hR) + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_hr_gint_(); + + // input + std::vector vr_eff_; + std::vector vofk_; + // output + HContainer>* hR_; + + // Intermediate variables + const double dr3_; + + const int nspin_ = 4; + + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; + + std::vector> vr_eff_d_; + std::vector> vofk_d_; + std::vector> hr_gint_part_d_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp index db211570ca..27db0a7db3 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp @@ -10,10 +10,13 @@ namespace ModuleGint { void Gint_vl_nspin4::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_full_), hR_); + transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } void Gint_vl_nspin4::init_hr_gint_() @@ -49,7 +52,7 @@ void Gint_vl_nspin4::cal_hr_gint_() for(int is = 0; is < nspin_; is++) { phi_op.phi_mul_vldr3(vr_eff_[is], dr3_, phi.data(), phi_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h index 6338055db6..9436b5c397 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h @@ -15,9 +15,9 @@ class Gint_vl_nspin4 : public Gint Gint_vl_nspin4( std::vector vr_eff, HContainer>* hR) - : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: @@ -33,16 +33,13 @@ class Gint_vl_nspin4 : public Gint // output HContainer>* hR_; - //======================== // Intermediate variables - //======================== const double dr3_; const int nspin_ = 4; - std::vector>> hr_gint_part_; - std::shared_ptr>> hr_gint_full_; - + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; }; } // namespace ModuleGint \ No newline at end of file diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp new file mode 100644 index 0000000000..c070258db5 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp @@ -0,0 +1,91 @@ +#include "gint_vl_nspin4_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_vl_nspin4_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); + init_hr_gint_(); + cal_hr_gint_(); + compose_hr_gint(hr_gint_part_, hr_gint_full_); + transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); +} + +void Gint_vl_nspin4_gpu::init_hr_gint_() +{ + hr_gint_part_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + hr_gint_part_[i] = gint_info_->get_hr(); + } + const int npol = 2; + hr_gint_full_ = gint_info_->get_hr>(npol); +} + +void Gint_vl_nspin4_gpu::transfer_cpu_to_gpu_() +{ + vr_eff_d_.resize(nspin_); + hr_gint_part_d_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + hr_gint_part_d_[i] = CudaMemWrapper(hr_gint_part_[i].get_nnr(), 0, false); + vr_eff_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_[i].get_device_ptr(), vr_eff_[i], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_vl_nspin4_gpu::transfer_gpu_to_cpu_() +{ + for(int i = 0; i < nspin_; i++) + { + checkCuda(cudaMemcpy(hr_gint_part_[i].get_wrapper(), hr_gint_part_d_[i].get_device_ptr(), + hr_gint_part_[i].get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); + } +} + + +void Gint_vl_nspin4_gpu::cal_hr_gint_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in 
a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi(phi.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + phi_op.phi_mul_vldr3(vr_eff_d_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h new file mode 100644 index 0000000000..a7decea9e8 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_nspin4_gpu : public Gint +{ + public: + Gint_vl_nspin4_gpu( + std::vector vr_eff, + HContainer>* hR) + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + // note that only the upper triangle matrix of hR is calculated + // 
that's why we need compose_hr_gint() to fill the lower triangle matrix. + void cal_hr_gint_(); + + // input + std::vector vr_eff_; + + // output + HContainer>* hR_; + + // Intermediate variables + const double dr3_; + + const int nspin_ = 4; + + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; + + std::vector> vr_eff_d_; + std::vector> hr_gint_part_d_; +}; + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h new file mode 100644 index 0000000000..9b7ad27e83 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h @@ -0,0 +1,171 @@ +#pragma once +#include +#include "source_base/tool_quit.h" +#include "gint_helper.cuh" + +template +class CudaMemWrapper +{ + public: + + CudaMemWrapper() = default; + CudaMemWrapper(const CudaMemWrapper& other) = delete; + CudaMemWrapper& operator=(const CudaMemWrapper& other) = delete; + CudaMemWrapper(CudaMemWrapper&& other) noexcept + { + this->device_ptr_ = other.device_ptr_; + this->host_ptr_ = other.host_ptr_; + this->size_ = other.size_; + this->malloc_host_ = other.malloc_host_; + this->stream_ = other.stream_; + + other.device_ptr_ = nullptr; + other.host_ptr_ = nullptr; + other.size_ = 0; + other.malloc_host_ = false; + other.stream_ = 0; + } + + CudaMemWrapper& operator=(CudaMemWrapper&& other) noexcept + { + if (this != &other) + { + this->device_ptr_ = other.device_ptr_; + this->host_ptr_ = other.host_ptr_; + this->size_ = other.size_; + this->malloc_host_ = other.malloc_host_; + this->stream_ = other.stream_; + + other.device_ptr_ = nullptr; + other.host_ptr_ = nullptr; + other.size_ = 0; + other.malloc_host_ = false; + other.stream_ = 0; + } + return *this; + } + + CudaMemWrapper(size_t size, + cudaStream_t stream = 0, + bool malloc_host = true) + { + size_ = size; + malloc_host_ = malloc_host; + stream_ = stream; 
+ + if (malloc_host) + { + checkCuda(cudaMallocHost((void**)&host_ptr_, size_* sizeof(T))); + memset(host_ptr_, 0, size_ * sizeof(T)); + } + else + { host_ptr_ = nullptr; } + + checkCuda(cudaMalloc((void**)&device_ptr_, size_ * sizeof(T))); + checkCuda(cudaMemset(device_ptr_, 0, size_ * sizeof(T))); + } + + ~CudaMemWrapper() + { + free(); + } + + void copy_host_to_device_sync(size_t size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to device."); } + checkCuda(cudaMemcpy(device_ptr_, host_ptr_, size * sizeof(T), cudaMemcpyHostToDevice)); + } + + void copy_host_to_device_sync() + { + copy_host_to_device_sync(size_); + } + + void copy_host_to_device_async(size_t size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to device."); } + checkCuda(cudaMemcpyAsync(device_ptr_, host_ptr_, size * sizeof(T), cudaMemcpyHostToDevice, stream_)); + } + + void copy_host_to_device_async() + { + copy_host_to_device_async(size_); + } + + void copy_device_to_host_sync(size_t size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to host."); } + checkCuda(cudaMemcpy(host_ptr_, device_ptr_, size * sizeof(T), cudaMemcpyDeviceToHost)); + } + + void copy_device_to_host_sync() + { + copy_device_to_host_sync(size_); + } + + void copy_device_to_host_async(size_t size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to host."); } + checkCuda(cudaMemcpyAsync(host_ptr_, device_ptr_, size * sizeof(T), cudaMemcpyDeviceToHost, stream_)); + } + + void copy_device_to_host_async() + { + copy_device_to_host_async(size_); + } + + void memset_device_sync(const size_t size, const int value = 0) + { + checkCuda(cudaMemset(device_ptr_, value, size * sizeof(T))); + } + + void memset_device_sync(const int value = 0) + { + 
memset_device_sync(size_, value); + } + + void memset_device_async(const size_t size, const int value = 0) + { + checkCuda(cudaMemsetAsync(device_ptr_, value, size * sizeof(T), stream_)); + } + + void memset_device_async(const int value = 0) + { + memset_device_async(size_, value); + } + + void memset_host(const size_t size, const int value = 0) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot memset host."); } + checkCuda(cudaMemset(host_ptr_, value, size * sizeof(T))); + } + + void memset_host(const int value = 0) + { + memset_host(size_, value); + } + + void free() + { + checkCuda(cudaFree(device_ptr_)); + checkCuda(cudaFreeHost(host_ptr_)); + } + + T* get_device_ptr() { return device_ptr_; } + T* get_host_ptr() { return host_ptr_; } + const T* get_device_ptr() const { return device_ptr_; } + const T* get_host_ptr() const { return host_ptr_; } + size_t get_size() const { return size_; } + + private: + T* device_ptr_ = nullptr; + T* host_ptr_ = nullptr; + size_t size_ = 0; + bool malloc_host_ = false; + cudaStream_t stream_ = 0; +}; \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu new file mode 100644 index 0000000000..b35e0669b6 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu @@ -0,0 +1,38 @@ +#include "gemm_tn_vbatch.cuh" +#include "gemm_nn_vbatch.cuh" +#include "dgemm_vbatch.h" + +void dgemm_nn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha) +{ + vbatched_gemm_nn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, 
stream, alpha); + +} + +void dgemm_tn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha) +{ + vbatched_gemm_tn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); +} diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h new file mode 100644 index 0000000000..8589bcf62e --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +// C(batch_id) = alpha * A(batch_id) * B(batch_id) + C(batch_id) +void dgemm_nn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha = nullptr); + +// C(batch_id) = alpha * A(batch_id)^T * B(batch_id) + C(batch_id) +void dgemm_tn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha = nullptr); \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh new file mode 100644 index 0000000000..5ad934e305 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh @@ -0,0 +1,427 @@ +#ifndef GEMM_NN_VBATCH_CUH 
+#define GEMM_NN_VBATCH_CUH +#include // for assert +#include +#include // for CUDA_VERSION +#include +#include // for fprintf and stderr + +#include "gint_helper.cuh" +#include + + +#define sA(i, j) sA[(j)*slda + (i)] +#define sB(i, j) sB[(j)*sldb + (i)] +#define fetch(A, m, n, bound) offs_d##A[min(n * LD##A + m, bound)] + +template +static __device__ void vbatched_gemm_nn_device(int M, + int N, + int K, + const T* __restrict__ A, + int LDA, + const T* __restrict__ B, + int LDB, + T* __restrict__ C, + int LDC, + T* sA, + int slda, + T* sB, + int sldb, + T alpha) +{ + int idx = threadIdx.x; // thread's m dimension + int idy = threadIdx.y; // thread's n dimension + + int idt = DIM_X * idy + idx; // thread's global number + + int idxA = idt % DIM_XA; // idx within A + int idyA = idt / DIM_XA; // idy within A + + int idxB = idt % DIM_XB; // idx within B + int idyB = idt / DIM_XB; // idy within B + + int blx = blockIdx.x; // block's m dimension + int bly = blockIdx.y; // block's n dimension + + // Registers for the innermost loop + T rC[THR_N][THR_M]; + T rA[THR_M]; + T rB[THR_N]; + + // Registers for the dev->shmem copy + T ra[BLK_K / DIM_YA][BLK_M / DIM_XA]; + T rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; + + // bound is the correction to offs_d in order to not get out of memory bound + // so bound could be negative value since offs_d could be out of bound + const T* offs_dA = A + blx * BLK_M + idyA * LDA + idxA; + int boundA + = (LDA * (K - 1) + M) - (blx * BLK_M + idyA * LDA + idxA) - 1; + + const T* offs_dB = B + bly * BLK_N * LDB + idyB * LDB + idxB; + int boundB + = (LDB * (N - 1) + K) - (bly * BLK_N * LDB + idyB * LDB + idxB) - 1; + + int m, n, k, kk; + +// Zero C +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] = 0.0; + } + } + +// Load A dev->shmem +#pragma unroll + for (n = 0; n < BLK_K; n += DIM_YA) + { +#pragma unroll + for (m = 0; m < BLK_M; m += DIM_XA) + { + sA(m + idxA, n + idyA) = fetch(A, m, n, 
boundA); + } + } + +#pragma unroll + for (n = 0; n < BLK_N; n += DIM_YB) + { +#pragma unroll + for (m = 0; m < BLK_K; m += DIM_XB) + { + sB(m + idxB, n + idyB) = fetch(B, m, n, boundB); + } + } + + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; kk += BLK_K) + { + offs_dA += BLK_K * LDA; + boundA -= BLK_K * LDA; + + offs_dB += BLK_K; + boundB -= BLK_K; + +// Load A dev->regs +#pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + ra[n][m] = fetch(A, m * DIM_XA, n * DIM_YA, boundA); + } + } + +// Load B dev->regs +#pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) + { + rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB); + } + } + +// Multiply +#pragma unroll + for (k = 0; k < BLK_K; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + + __syncthreads(); + +// Load A regs->shmem +#pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + sA(m * DIM_XA + idxA, n * DIM_YA + idyA) = ra[n][m]; + } + } + +// Load B regs->shmem +#pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) + { + sB(m * DIM_XB + idxB, n * DIM_YB + idyB) = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of + // columns of op(A) and rows of op(B). + // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. 
+ kk = K - kk; +#pragma unroll + for (k = 0; k < kk; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + +// Store C regs->dev +#pragma unroll + for (n = 0; n < THR_N; n++) + { + int coord_dCn = bly * BLK_N + n * DIM_Y + idy; +#pragma unroll + for (m = 0; m < THR_M; m++) + { + int coord_dCm = blx * BLK_M + m * DIM_X + idx; + if (coord_dCm < M && coord_dCn < N) + { + int offsC = coord_dCn * LDC + coord_dCm; + + atomicAdd(C + offsC, rC[n][m] * alpha); + } + } + } +} + +/******************************************************************************/ +template +static __global__ void vbatched_gemm_nn_kernel(const int* M, + const int* N, + const int* K, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + const T* alpha) +{ + extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + T* shared_mem = reinterpret_cast(smem); + + int batchid = blockIdx.z; + int local_M = (int)M[batchid]; + int local_N = (int)N[batchid]; + int local_K = (int)K[batchid]; + + if (blockIdx.x >= (local_M + BLK_M - 1) / BLK_M) + return; + if (blockIdx.y >= (local_N + BLK_N - 1) / BLK_N) + return; + + int shared_lda = BLK_M + 1; + int shared_ldb = BLK_K + 1; + T* shared_A = (T*)shared_mem; + T* shared_B = shared_A + shared_lda * BLK_K; + double alpha_tmp = 1.0; + if (alpha != nullptr) + { + alpha_tmp = alpha[batchid]; + } + vbatched_gemm_nn_device(local_M, + local_N, + local_K, + global_A_array[batchid], + (int)global_lda[batchid], + global_B_array[batchid], + (int)global_ldb[batchid], + global_C_array[batchid], + (int)global_ldc[batchid], + 
shared_A, + shared_lda, + shared_B, + shared_ldb, + alpha_tmp); +} + +/** + * Performs a batched matrix multiplication using the vbatched_gemm_impl + * function. + * + * C = alpha * A * B + C + * @tparam T The data type of the matrices. + * @tparam DIM_X The number of threads in the x-dimension of each block. + * @tparam DIM_Y The number of threads in the y-dimension of each block. + * @tparam BLK_M The number of rows processed by each thread block. + * @tparam BLK_N The number of columns processed by each thread block. + * @tparam BLK_K The number of elements processed by each thread block along the + * K dimension. + * @tparam DIM_XA The number of threads in the x-dimension used for loading + * matrix A. + * @tparam DIM_YA The number of threads in the y-dimension used for loading + * matrix A. + * @tparam DIM_XB The number of threads in the x-dimension used for loading + * matrix B. + * @tparam DIM_YB The number of threads in the y-dimension used for loading + * matrix B. + * @param max_m The maximum number of rows in the matrices. + * @param max_n The maximum number of columns in the matrices. + * @param m An array of batch sizes for the number of rows in each matrix. + * @param n An array of batch sizes for the number of columns in each matrix. + * @param k An array of batch sizes for the number of elements in each matrix + * along the K dimension. + * @param global_A_array An array of pointers to the input matrices A. + * @param global_lda An array of leading dimensions for the input matrices A. + * @param global_B_array An array of pointers to the input matrices B. + * @param global_ldb An array of leading dimensions for the input matrices B. + * @param global_C_array An array of pointers to the output matrices C. + * @param global_ldc An array of leading dimensions for the output matrices C. + * @param batchCount The number of matrices in the batch. + * @param stream The CUDA stream to use for the computation. 
+ * @param alpha The scalar value to multiply the matrices by (optional, default + * is nullptr). generate by copilot + */ +template +void vbatched_gemm_nn_impl(int max_m, + int max_n, + const int* m, + const int* n, + const int* k, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + int batchCount, + cudaStream_t stream, + const T* alpha = nullptr) +{ + // The positions of A and B have been swapped here. + // This is because vbatch_gemm_nn_kernel is column major, + // but vatched_gemm_nn_impl is designed to be row major, + + size_t shared_mem_size = 0; + shared_mem_size += (BLK_M + 1) * BLK_K * sizeof(T); + shared_mem_size += (BLK_K + 1) * BLK_N * sizeof(T); + dim3 dimBlock(DIM_X, DIM_Y); + const int max_batch_count = 32768; + + for (int i = 0; i < batchCount; i += max_batch_count) + { + const int ibatch = min(max_batch_count, batchCount - i); + dim3 dimGrid(ceil_div(max_n, BLK_M), + ceil_div(max_m, BLK_N), + ibatch); + const T* alpha_tmp = nullptr; + if (alpha != nullptr) + { + alpha_tmp = alpha + i; + } + + vbatched_gemm_nn_kernel + <<>>( + n + i, m + i, k + i, + global_B_array + i, global_ldb + i, + global_A_array + i, global_lda + i, + global_C_array + i, global_ldc + i, + alpha_tmp); + checkCudaLastError(); + } +} + +#endif // GEMM_VBATCH_CUH \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh new file mode 100644 index 0000000000..701e93e81f --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh @@ -0,0 +1,452 @@ +#ifndef GEMM_TN_VBATCH_CUH +#define GEMM_TN_VBATCH_CUH +#include // for assert +#include +#include // for CUDA_VERSION +#include +#include // for fprintf and stderr + +#include "gint_helper.cuh" +#include + + +#define sA(i, j) sA[(j)*slda + (i)] +#define sB(i, 
j) sB[(j)*sldb + (i)] +#define fetch(A, m, n, bound) offs_d##A[min(n * LD##A + m, bound)] + +template +static __device__ void vbatched_gemm_nt_device(int M, + int N, + int K, + const T* __restrict__ A, + int LDA, + const T* __restrict__ B, + int LDB, + T* __restrict__ C, + int LDC, + T* sA, + int slda, + T* sB, + int sldb, + T alpha) +{ + int idx = threadIdx.x; // thread's m dimension + int idy = threadIdx.y; // thread's n dimension + + int idt = DIM_X * idy + idx; // thread's global number + + int idxA = idt % DIM_XA; // idx within A + int idyA = idt / DIM_XA; // idy within A + + int idxB = idt % DIM_XB; // idx within B + int idyB = idt / DIM_XB; // idy within B + + int blx = blockIdx.x; // block's m dimension + int bly = blockIdx.y; // block's n dimension + + // Registers for the innermost loop + T rC[THR_N][THR_M]; + T rA[THR_M]; + T rB[THR_N]; + + // Registers for the dev->shmem copy + T ra[BLK_K / DIM_YA][BLK_M / DIM_XA]; + T rb[BLK_K / DIM_YB][BLK_N / DIM_XB]; + + // bound is the correction to offs_d in order to not get out of memory bound + // so bound could be negative value since offs_d could be out of bound + const T* offs_dA = A + blx * BLK_M + idyA * LDA + idxA; + int boundA + = (LDA * (K - 1) + M) - (blx * BLK_M + idyA * LDA + idxA) - 1; + + const T* offs_dB = B + bly * BLK_N + idyB * LDB + idxB; + int boundB + = (LDB * (K - 1) + N) - (bly * BLK_N + idyB * LDB + idxB) - 1; + + int m, n, k, kk; + +// Zero C +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] = 0.0; + } + } + +// Load A dev->shmem +#pragma unroll + for (n = 0; n < BLK_K; n += DIM_YA) + { +#pragma unroll + for (m = 0; m < BLK_M; m += DIM_XA) + { + sA(m + idxA, n + idyA) = fetch(A, m, n, boundA); + } + } + +#pragma unroll + for (n = 0; n < BLK_K; n += DIM_YB) + { +#pragma unroll + for (m = 0; m < BLK_N; m += DIM_XB) + { + sB(n + idyB, m + idxB) = fetch(B, m, n, boundB); + } + } + + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; 
kk += BLK_K) + { + offs_dA += BLK_K * LDA; + boundA -= BLK_K * LDA; + + offs_dB += BLK_K * LDB; + boundB -= BLK_K * LDB; + +// Load A dev->regs +#pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + ra[n][m] = fetch(A, m * DIM_XA, n * DIM_YA, boundA); + } + } + +// Load B dev->regs +#pragma unroll + for (n = 0; n < BLK_K / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_N / DIM_XB; m++) + { + rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB); + } + } + +// Multiply +#pragma unroll + for (k = 0; k < BLK_K; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + + __syncthreads(); + +// Load A regs->shmem +#pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + sA(m * DIM_XA + idxA, n * DIM_YA + idyA) = ra[n][m]; + } + } + +// Load B regs->shmem +#pragma unroll + for (n = 0; n < BLK_K / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_N / DIM_XB; m++) + { + sB(n * DIM_YB + idyB, m * DIM_XB + idxB) = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of + // columns of op(A) and rows of op(B). + // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. 
+ kk = K - kk; +#pragma unroll + for (k = 0; k < kk; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + +// Store C regs->dev +#pragma unroll + for (n = 0; n < THR_N; n++) + { + int coord_dCn = bly * BLK_N + n * DIM_Y + idy; +#pragma unroll + for (m = 0; m < THR_M; m++) + { + int coord_dCm = blx * BLK_M + m * DIM_X + idx; + if (coord_dCm < M && coord_dCn < N) + { + int offsC = coord_dCn * LDC + coord_dCm; + + atomicAdd(C + offsC, rC[n][m] * alpha); + } + } + } +} + +/******************************************************************************/ +template +static __global__ void vbatched_gemm_nt_kernel(const int* M, + const int* N, + const int* K, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + const T* alpha) +{ + extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + T* shared_mem = reinterpret_cast(smem); + + int batchid = blockIdx.z; + int local_M = (int)M[batchid]; + int local_N = (int)N[batchid]; + int local_K = (int)K[batchid]; + + if (blockIdx.x >= (local_M + BLK_M - 1) / BLK_M) + return; + if (blockIdx.y >= (local_N + BLK_N - 1) / BLK_N) + return; + + int shared_lda = BLK_M + 1; + int shared_ldb = BLK_K + 1; + T* shared_A = (T*)shared_mem; + T* shared_B = shared_A + shared_lda * BLK_K; + double alpha_tmp = 1.0; + if (alpha != nullptr) + { + alpha_tmp = alpha[batchid]; + } + vbatched_gemm_nt_device(local_M, + local_N, + local_K, + global_A_array[batchid], + (int)global_lda[batchid], + global_B_array[batchid], + (int)global_ldb[batchid], + global_C_array[batchid], + (int)global_ldc[batchid], + 
shared_A, + shared_lda, + shared_B, + shared_ldb, + alpha_tmp); +} + +/** + * Performs a batched matrix multiplication using the vbatched_gemm_impl + * function. + * + * C = alpha * trans(A) * B + C + * @tparam T The data type of the matrices. + * @tparam DIM_X The number of threads in the x-dimension of each block. + * @tparam DIM_Y The number of threads in the y-dimension of each block. + * @tparam BLK_M The number of rows processed by each thread block. + * @tparam BLK_N The number of columns processed by each thread block. + * @tparam BLK_K The number of elements processed by each thread block along the + * K dimension. + * @tparam DIM_XA The number of threads in the x-dimension used for loading + * matrix A. + * @tparam DIM_YA The number of threads in the y-dimension used for loading + * matrix A. + * @tparam DIM_XB The number of threads in the x-dimension used for loading + * matrix B. + * @tparam DIM_YB The number of threads in the y-dimension used for loading + * matrix B. + * @param max_m The maximum number of rows in the matrices. + * @param max_n The maximum number of columns in the matrices. + * @param m An array of batch sizes for the number of rows in each matrix. + * @param n An array of batch sizes for the number of columns in each matrix. + * @param k An array of batch sizes for the number of elements in each matrix + * along the K dimension. + * @param global_A_array An array of pointers to the input matrices A. + * @param global_lda An array of leading dimensions for the input matrices A. + * @param global_B_array An array of pointers to the input matrices B. + * @param global_ldb An array of leading dimensions for the input matrices B. + * @param global_C_array An array of pointers to the output matrices C. + * @param global_ldc An array of leading dimensions for the output matrices C. + * @param batchCount The number of matrices in the batch. + * @param stream The CUDA stream to use for the computation. 
+ * @param alpha The scalar value to multiply the matrices by (optional, default + * is nullptr). generate by copilot + */ + +/* + * Why do we need to implement our own matrix multiplication based on the magma + * code? There are two main reasons. First is when we are doing batch matrix + * multiplication, since we need to accumulate the results of the + * multiplications, it is necessary to pass the same memory address of matrix C + * to different multiplications. This way, the accumulation can be done directly + * through atomic operations during the matrix multiplication, avoiding the + * reduction operations after the multiplication. Secondly, when calculating the + * charge density, where C = alpha * A * B + C, the value of alpha might be + * different for the same batch of matrices. Using the standard matrix + * multiplication interface would require breaking down the batch matrix + * multiplication into smaller batches. In practice, it is difficult to + * accumulate a batch. + * + * Moreover, taking into account the specific requirements of our application, + * especially the fact that we can relatively easily control the arrangement of + * the matrix elements, we have only implemented one type of requirement for + * matrix transposition. That is, we have implemented the operation C = alpha * + * A * trans(B) + C under the constraint of column-major order. + * + * Finally, we would like to thank Magma for its contributions to the field of + * scientific computing. + */ + +template +void vbatched_gemm_tn_impl(int max_m, + int max_n, + const int* m, + const int* n, + const int* k, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + int batchCount, + cudaStream_t stream, + const T* alpha = nullptr) +{ + // The positions of A and B have been swapped here. 
+ // This is because vbatched_gemm_nt_kernel is column major, + but vbatched_gemm_tn_impl is designed to be row major, + + size_t shared_mem_size = 0; + shared_mem_size += (BLK_M + 1) * BLK_K * sizeof(T); + shared_mem_size += (BLK_K + 1) * BLK_N * sizeof(T); + dim3 dimBlock(DIM_X, DIM_Y); + const int max_batch_count = 32768; + + for (int i = 0; i < batchCount; i += max_batch_count) + { + const int ibatch = min(max_batch_count, batchCount - i); + dim3 dimGrid(ceil_div(max_n, BLK_M), + ceil_div(max_m, BLK_N), + ibatch); + const T* alpha_tmp = nullptr; + if (alpha != nullptr) + { + alpha_tmp = alpha + i; + } + + vbatched_gemm_nt_kernel + <<>>( + n + i, m + i, k + i, + global_B_array + i, global_ldb + i, + global_A_array + i, global_lda + i, + global_C_array + i, global_ldc + i, + alpha_tmp); + checkCudaLastError(); + } +} + +#endif // GEMM_TN_VBATCH_CUH \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp new file mode 100644 index 0000000000..f4443762f0 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp @@ -0,0 +1,126 @@ +#include "gint_gpu_vars.h" +#include "source_base/module_device/device.h" + +namespace ModuleGint +{ + +GintGpuVars::GintGpuVars(std::shared_ptr biggrid_info, + const UnitCell& ucell, + const Numerical_Orbital* Phi) +{ +// set device +#ifdef __MPI + dev_id_ = base_device::information::set_device_by_rank(); +#endif + std::vector ylmcoef_h(100); + for (int i = 0; i < 100; i++) + { + ylmcoef_h[i] = ModuleBase::Ylm::ylmcoef[i]; + } + set_ylmcoe_d(ylmcoef_h.data(), &ylmcoef_d); + + const int ntype = ucell.ntype; + std::vector atom_nw_h(ntype); + std::vector ucell_atom_nwl_h(ntype); + for (int i = 0; i < ntype; i++) + { + atom_nw_h[i] = ucell.atoms[i].nw; + ucell_atom_nwl_h[i] = ucell.atoms[i].nwl; + } + checkCuda(cudaMalloc((void**)&atom_nw_d, sizeof(int) * ntype)); + 
checkCuda(cudaMemcpy(atom_nw_d, atom_nw_h.data(), sizeof(int) * ntype, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&ucell_atom_nwl_d, sizeof(int) * ntype)); + checkCuda(cudaMemcpy(ucell_atom_nwl_d, ucell_atom_nwl_h.data(), sizeof(int) * ntype, cudaMemcpyHostToDevice)); + + dr_uniform = Phi[0].PhiLN(0, 0).dr_uniform; + double max_rcut = 0; + std::vector rcut_h(ntype); + for (int i = 0; i < ntype; i++) + { + rcut_h[i] = Phi[i].getRcut(); + if (rcut_h[i] > max_rcut) + { + max_rcut = rcut_h[i]; + } + } + checkCuda(cudaMalloc((void**)&rcut_d, sizeof(double) * ntype)); + checkCuda(cudaMemcpy(rcut_d, rcut_h.data(), sizeof(double) * ntype, cudaMemcpyHostToDevice)); + nr_max = static_cast(1 / dr_uniform * max_rcut) + 10; + + nwmax = ucell.nwmax; + std::vector psi_u_h(ntype * nwmax * nr_max); + std::vector dpsi_u_h(ntype * nwmax * nr_max); + std::vector d2psi_u_h(ntype * nwmax * nr_max); + // std::vector cannot use data(), so std::vector is used instead + std::vector atom_iw2_new_h(ntype * nwmax); + std::vector atom_iw2_ylm_h(ntype * nwmax); + std::vector atom_iw2_l_h(ntype * nwmax); + for (int i = 0; i < ntype; i++) + { + Atom* atomx = &ucell.atoms[i]; + for (int j = 0; j < atomx->nw; j++) + { + atom_iw2_new_h[i * nwmax + j] = atomx->iw2_new[j]; + atom_iw2_ylm_h[i * nwmax + j] = atomx->iw2_ylm[j]; + atom_iw2_l_h[i * nwmax + j] = atomx->iw2l[j]; + const auto psi_ptr = &Phi[i].PhiLN(atomx->iw2l[j], atomx->iw2n[j]); + const int psi_size = psi_ptr->psi_uniform.size(); + int idx = i * nwmax * nr_max + j * nr_max; + for (int k = 0; k < psi_size; k++) + { + psi_u_h[idx + k] = psi_ptr->psi_uniform[k]; + dpsi_u_h[idx + k] = psi_ptr->dpsi_uniform[k]; + d2psi_u_h[idx + k] = psi_ptr->ddpsi_uniform[k]; + } + } + } + + checkCuda(cudaMalloc((void**)&atom_iw2_new_d, sizeof(bool) * ntype * nwmax)); + checkCuda(cudaMemcpy(atom_iw2_new_d, atom_iw2_new_h.data(), sizeof(bool) * ntype * nwmax, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&atom_iw2_ylm_d, sizeof(int) * 
ntype * nwmax)); + checkCuda(cudaMemcpy(atom_iw2_ylm_d, atom_iw2_ylm_h.data(), sizeof(int) * ntype * nwmax, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&atom_iw2_l_d, sizeof(int) * ntype * nwmax)); + checkCuda(cudaMemcpy(atom_iw2_l_d, atom_iw2_l_h.data(), sizeof(int) * ntype * nwmax, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&psi_u_d, sizeof(double) * ntype * nwmax * nr_max)); + checkCuda(cudaMemcpy(psi_u_d, psi_u_h.data(), sizeof(double) * ntype * nwmax * nr_max, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&dpsi_u_d, sizeof(double) * ntype * nwmax * nr_max)); + checkCuda(cudaMemcpy(dpsi_u_d, dpsi_u_h.data(), sizeof(double) * ntype * nwmax * nr_max, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&d2psi_u_d, sizeof(double) * ntype * nwmax * nr_max)); + checkCuda(cudaMemcpy(d2psi_u_d, d2psi_u_h.data(), sizeof(double) * ntype * nwmax * nr_max, cudaMemcpyHostToDevice)); + + const int mgrid_num = biggrid_info->get_mgrids_num(); + std::vector mgrids_pos_h(mgrid_num); + for(int i = 0; i < mgrid_num; i++) + { + mgrids_pos_h[i].x = biggrid_info->get_mgrid_coord(i).x; + mgrids_pos_h[i].y = biggrid_info->get_mgrid_coord(i).y; + mgrids_pos_h[i].z = biggrid_info->get_mgrid_coord(i).z; + } + checkCuda(cudaMalloc((void**)&mgrids_pos_d, sizeof(double3) * mgrid_num)); + checkCuda(cudaMemcpy(mgrids_pos_d, mgrids_pos_h.data(), sizeof(double3) * mgrid_num, cudaMemcpyHostToDevice)); + + checkCuda(cudaMalloc((void**)&iat2it_d, sizeof(int) * ucell.nat)); + checkCuda(cudaMemcpy(iat2it_d, ucell.iat2it, sizeof(int) * ucell.nat, cudaMemcpyHostToDevice)); + + gemm_algo_selector(mgrid_num, fastest_matrix_mul, ucell); +} + +GintGpuVars::~GintGpuVars() +{ +#ifdef __MPI + checkCuda(cudaSetDevice(dev_id_)); +#endif + checkCuda(cudaFree(rcut_d)); + checkCuda(cudaFree(atom_nw_d)); + checkCuda(cudaFree(ucell_atom_nwl_d)); + checkCuda(cudaFree(atom_iw2_new_d)); + checkCuda(cudaFree(atom_iw2_ylm_d)); + checkCuda(cudaFree(atom_iw2_l_d)); + 
checkCuda(cudaFree(psi_u_d)); + checkCuda(cudaFree(dpsi_u_d)); + checkCuda(cudaFree(d2psi_u_d)); + checkCuda(cudaFree(mgrids_pos_d)); + checkCuda(cudaFree(iat2it_d)); +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h new file mode 100644 index 0000000000..7d2515b3b0 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include "set_const_mem.cuh" +#include "source_base/ylm.h" +#include "source_cell/unitcell.h" +#include "source_cell/atom_spec.h" +#include "module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h" +#include "gint_helper.cuh" +#include "module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cuh" + +namespace ModuleGint +{ + +class GintGpuVars +{ + public: + GintGpuVars(std::shared_ptr bgrid_info, + const UnitCell& ucell, + const Numerical_Orbital* Phi); + ~GintGpuVars(); + + int nwmax; + double dr_uniform; + double nr_max; + // ylmcoef_d is __constant__ memory, no need to cudaFree + double* ylmcoef_d = nullptr; + double* rcut_d = nullptr; + int* atom_nw_d = nullptr; + int* ucell_atom_nwl_d = nullptr; + bool* atom_iw2_new_d = nullptr; + int* atom_iw2_ylm_d = nullptr; + int* atom_iw2_l_d = nullptr; + double* psi_u_d = nullptr; + double* dpsi_u_d = nullptr; + double* d2psi_u_d = nullptr; + double3* mgrids_pos_d = nullptr; + int* iat2it_d = nullptr; + + // the index of gpu device + int dev_id_ = 0; + matrix_multiple_func_type fastest_matrix_mul; + +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh new file mode 100644 index 0000000000..7a6e925531 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh @@ -0,0 +1,75 @@ +#pragma once +#include + +// if 
exponent is an integer between 0 and 5 (the most common cases in gint) and +// exp is a variable that cannot be determined at compile time (which means the compiler cannot optimize the code), +// pow_int is much faster than std::pow +template +__forceinline__ __device__ T pow_int(const T base, const int exp) +{ + switch (exp) + { + case 0: + return 1.0; + case 1: + return base; + case 2: + return base * base; + case 3: + return base * base * base; + case 4: + return base * base * base * base; + case 5: + return base * base * base * base * base; + default: + double result = std::pow(base, exp); + return result; + } +} + +template +__forceinline__ __device__ T warpReduceSum(T val) +{ + val += __shfl_xor_sync(0xffffffff, val, 16, 32); + val += __shfl_xor_sync(0xffffffff, val, 8, 32); + val += __shfl_xor_sync(0xffffffff, val, 4, 32); + val += __shfl_xor_sync(0xffffffff, val, 2, 32); + val += __shfl_xor_sync(0xffffffff, val, 1, 32); + return val; +} + +inline int ceil_div(const int a, const int b) +{ + return a / b + (a % b != 0 && (a ^ b) > 0); +} + +inline void check(cudaError_t result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), cudaGetErrorString(result), func); + exit(EXIT_FAILURE); + } +} + +inline void __getLastCudaError(const char *file, + const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " (%d) %s.\n", + file, line, static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCuda(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define checkCudaLastError() __getLastCudaError(__FILE__, __LINE__) \ No newline at 
end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu new file mode 100644 index 0000000000..edc07959d4 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu @@ -0,0 +1,466 @@ +#include "phi_operator_gpu.h" +#include "phi_operator_kernel.cuh" +#include "dgemm_vbatch.h" +#include + +namespace ModuleGint +{ +PhiOperatorGpu::PhiOperatorGpu(std::shared_ptr gint_gpu_vars, cudaStream_t stream) +:gint_gpu_vars_(gint_gpu_vars), stream_(stream), +mgrids_num_(BatchBigGrid::get_bgrid_info()->get_mgrids_num()), +atoms_num_info_(BatchBigGrid::get_max_batch_size(), stream_, true), +bgrids_phi_len_(BatchBigGrid::get_max_batch_size(), stream_, true), +bgrids_phi_start_(BatchBigGrid::get_max_batch_size(), stream_, true), +atoms_iat_(BatchBigGrid::get_max_atoms_num(), stream_, true), +atoms_bgrids_rcoords_(BatchBigGrid::get_max_atoms_num(), stream_, true), +atoms_phi_start_(BatchBigGrid::get_max_atoms_num(), stream_, true), +mgrids_local_idx_batch_(BatchBigGrid::get_max_batch_size() + * BatchBigGrid::get_bgrid_info()->get_mgrids_num(), stream_, true), +gemm_m_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_n_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_k_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_lda_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_ldb_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_ldc_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_A_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_B_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_C_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_alpha_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true) +{ + checkCuda(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); +} + +PhiOperatorGpu::~PhiOperatorGpu() +{ + 
checkCuda(cudaEventDestroy(event_)); +} + +void PhiOperatorGpu::set_bgrid_batch(std::shared_ptr bgrid_batch) +{ + bgrid_batch_ = bgrid_batch; + auto atoms_num_info_h = atoms_num_info_.get_host_ptr(); + auto bgrids_phi_len_h = bgrids_phi_len_.get_host_ptr(); + auto bgrids_phi_start_h = bgrids_phi_start_.get_host_ptr(); + auto atoms_iat_h = atoms_iat_.get_host_ptr(); + auto atoms_bgrids_rcoords_h = atoms_bgrids_rcoords_.get_host_ptr(); + auto atoms_phi_start_h = atoms_phi_start_.get_host_ptr(); + auto mgrids_local_idx_batch_h = mgrids_local_idx_batch_.get_host_ptr(); + int i = 0; + int j = 0; + int atoms_accum = 0; + phi_len_ = 0; + int phi_start = 0; + std::vector mgrids_local_idx; + checkCuda(cudaEventSynchronize(event_)); + for (const auto& bgrid : bgrid_batch->get_bgrids()) + { + atoms_num_info_h[i] = make_int2(bgrid->get_atoms_num(), atoms_accum); + atoms_accum += bgrid->get_atoms_num(); + bgrids_phi_start_h[i] = phi_start; + bgrid->set_mgrids_local_idx(mgrids_local_idx); + std::copy(mgrids_local_idx.begin(), mgrids_local_idx.end(), + mgrids_local_idx_batch_h + i * mgrids_num_); + int phi_len_bgrid = 0; + for (const auto& atom : bgrid->get_atoms()) + { + atoms_iat_h[j] = atom->get_iat(); + Vec3d rcoord = bgrid->get_bgrid_atom_rcoord(atom); + atoms_bgrids_rcoords_h[j] = make_double3(rcoord.x, rcoord.y, rcoord.z); + atoms_phi_start_h[j] = phi_len_ + phi_len_bgrid; + phi_len_bgrid += atom->get_nw(); + j++; + } + bgrids_phi_len_h[i] = phi_len_bgrid; + phi_len_ += phi_len_bgrid * bgrid->get_mgrids_num(); + phi_start += phi_len_bgrid * bgrid->get_mgrids_num(); + i++; + } + + atoms_num_info_.copy_host_to_device_async(bgrid_batch->get_batch_size()); + bgrids_phi_len_.copy_host_to_device_async(bgrid_batch->get_batch_size()); + bgrids_phi_start_.copy_host_to_device_async(bgrid_batch->get_batch_size()); + atoms_iat_.copy_host_to_device_async(bgrid_batch->get_atoms_num()); + atoms_bgrids_rcoords_.copy_host_to_device_async(bgrid_batch->get_atoms_num()); + 
atoms_phi_start_.copy_host_to_device_async(bgrid_batch->get_atoms_num()); + mgrids_local_idx_batch_.copy_host_to_device_async(bgrid_batch->get_batch_size() * mgrids_num_); + checkCuda(cudaEventRecord(event_, stream_)); +} + +void PhiOperatorGpu::set_phi(double* phi_d) const +{ + // checkCuda(cudaMemsetAsync(phi_d, 0, phi_len_ * sizeof(double), stream_)); + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + set_phi_kernel<<>>( + gint_gpu_vars_->nwmax, + mgrids_num_, + gint_gpu_vars_->nr_max, + gint_gpu_vars_->dr_uniform, + gint_gpu_vars_->ylmcoef_d, + gint_gpu_vars_->ucell_atom_nwl_d, + gint_gpu_vars_->atom_iw2_new_d, + gint_gpu_vars_->atom_iw2_ylm_d, + gint_gpu_vars_->atom_nw_d, + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->rcut_d, + gint_gpu_vars_->psi_u_d, + gint_gpu_vars_->dpsi_u_d, + gint_gpu_vars_->mgrids_pos_d, + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + phi_d); + checkCudaLastError(); +} + +void PhiOperatorGpu::set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_y_d, double* dphi_z_d) const +{ + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + set_phi_dphi_kernel<<>>( + gint_gpu_vars_->nwmax, + mgrids_num_, + gint_gpu_vars_->nr_max, + gint_gpu_vars_->dr_uniform, + gint_gpu_vars_->ylmcoef_d, + gint_gpu_vars_->ucell_atom_nwl_d, + gint_gpu_vars_->atom_iw2_new_d, + gint_gpu_vars_->atom_iw2_ylm_d, + gint_gpu_vars_->atom_iw2_l_d, + gint_gpu_vars_->atom_nw_d, + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->rcut_d, + gint_gpu_vars_->psi_u_d, + gint_gpu_vars_->dpsi_u_d, + gint_gpu_vars_->mgrids_pos_d, + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + phi_d, + dphi_x_d, + dphi_y_d, + dphi_z_d); + 
checkCudaLastError(); +} + +void PhiOperatorGpu::set_ddphi(double* ddphi_xx_d, double* ddphi_xy_d, double* ddphi_xz_d, + double* ddphi_yy_d, double* ddphi_yz_d, double* ddphi_zz_d) const +{ + // Since the underlying implementation of `set_ddphi` uses `ddphi +=` instead of `ddphi =`, + // the ddphi array needs to be zeroed out at the beginning of the function. + checkCuda(cudaMemsetAsync(ddphi_xx_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_xy_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_xz_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_yy_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_yz_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_zz_d, 0, phi_len_ * sizeof(double), stream_)); + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + set_ddphi_kernel<<>>( + gint_gpu_vars_->nwmax, + mgrids_num_, + gint_gpu_vars_->nr_max, + gint_gpu_vars_->dr_uniform, + gint_gpu_vars_->ylmcoef_d, + gint_gpu_vars_->ucell_atom_nwl_d, + gint_gpu_vars_->atom_iw2_new_d, + gint_gpu_vars_->atom_iw2_ylm_d, + gint_gpu_vars_->atom_iw2_l_d, + gint_gpu_vars_->atom_nw_d, + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->rcut_d, + gint_gpu_vars_->psi_u_d, + gint_gpu_vars_->dpsi_u_d, + gint_gpu_vars_->mgrids_pos_d, + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + ddphi_xx_d, + ddphi_xy_d, + ddphi_xz_d, + ddphi_yy_d, + ddphi_yz_d, + ddphi_zz_d); + checkCudaLastError(); +} + +void PhiOperatorGpu::phi_mul_vldr3( + const double* vl_d, + const double dr3, + const double* phi_d, + double* result_d) const +{ + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + phi_mul_vldr3_kernel<<>>( + vl_d, + dr3, + phi_d, + mgrids_num_, + 
mgrids_local_idx_batch_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + bgrids_phi_start_.get_device_ptr(), + result_d); + checkCudaLastError(); +} + +void PhiOperatorGpu::phi_mul_phi( + const double* phi_d, + const double* phi_vldr3_d, + HContainer& hRGint, + double* hr_d) const +{ + // ap_num means number of atom pairs + int ap_num = 0; + int max_m = 0; + int max_n = 0; + int max_k = mgrids_num_; + checkCuda(cudaEventSynchronize(event_)); + for (int i = 0; i < bgrid_batch_->get_batch_size(); i++) + { + auto bgrid = bgrid_batch_->get_bgrids()[i]; + // the length of phi on a mesh grid + const int phi_len_mgrid = bgrid->get_phi_len(); + const int pre_atoms = atoms_num_info_.get_host_ptr()[i].y; + for (int ia_1 = 0; ia_1 < bgrid->get_atoms_num(); ia_1++) + { + auto atom_1 = bgrid->get_atoms()[ia_1]; + const int iat_1 = atom_1->get_iat(); + const auto& r_1 = atom_1->get_R(); + const int nw1 = atom_1->get_nw(); + const int phi_1_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_1]; + + for (int ia_2 = 0; ia_2 < bgrid->get_atoms_num(); ia_2++) + { + auto atom_2 = bgrid->get_atoms()[ia_2]; + const int iat_2 = atom_2->get_iat(); + const auto& r_2 = atom_2->get_R(); + const int nw2 = atom_2->get_nw(); + + if(iat_1 > iat_2) + { continue; } + + int hr_offset = hRGint.find_matrix_offset(iat_1, iat_2, r_1 - r_2); + if (hr_offset == -1) + { continue; } + + const int phi_2_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_2]; + + gemm_A_.get_host_ptr()[ap_num] = phi_d + phi_1_offset; + gemm_B_.get_host_ptr()[ap_num] = phi_vldr3_d + phi_2_offset; + gemm_C_.get_host_ptr()[ap_num] = hr_d + hr_offset; + gemm_lda_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_ldb_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_ldc_.get_host_ptr()[ap_num] = nw2; + gemm_m_.get_host_ptr()[ap_num] = nw1; + gemm_n_.get_host_ptr()[ap_num] = nw2; + gemm_k_.get_host_ptr()[ap_num] = bgrid->get_mgrids_num(); + ap_num++; + + max_m = std::max(max_m, nw1); + max_n = std::max(max_n, nw2); + } 
+ } + } + + gemm_A_.copy_host_to_device_async(ap_num); + gemm_B_.copy_host_to_device_async(ap_num); + gemm_C_.copy_host_to_device_async(ap_num); + gemm_lda_.copy_host_to_device_async(ap_num); + gemm_ldb_.copy_host_to_device_async(ap_num); + gemm_ldc_.copy_host_to_device_async(ap_num); + gemm_m_.copy_host_to_device_async(ap_num); + gemm_n_.copy_host_to_device_async(ap_num); + gemm_k_.copy_host_to_device_async(ap_num); + checkCuda(cudaEventRecord(event_, stream_)); + + dgemm_tn_vbatch(max_m, + max_n, + max_k, + gemm_m_.get_device_ptr(), + gemm_n_.get_device_ptr(), + gemm_k_.get_device_ptr(), + gemm_A_.get_device_ptr(), + gemm_lda_.get_device_ptr(), + gemm_B_.get_device_ptr(), + gemm_ldb_.get_device_ptr(), + gemm_C_.get_device_ptr(), + gemm_ldc_.get_device_ptr(), + ap_num, + stream_, + nullptr); +} + +void PhiOperatorGpu::phi_mul_dm( + const double* phi_d, + const double* dm_d, + const HContainer& dm, + const bool is_symm, + double* phi_dm_d) +{ + checkCuda(cudaMemsetAsync(phi_dm_d, 0, phi_len_ * sizeof(double), stream_)); + // ap_num means number of atom pairs + int ap_num = 0; + int max_m = mgrids_num_; + int max_n = 0; + int max_k = 0; + checkCuda(cudaEventSynchronize(event_)); + for (int i = 0; i < bgrid_batch_->get_batch_size(); i++) + { + auto bgrid = bgrid_batch_->get_bgrids()[i]; + // the length of phi on a mesh grid + const int phi_len_mgrid = bgrid->get_phi_len(); + const int pre_atoms = atoms_num_info_.get_host_ptr()[i].y; + for (int ia_1 = 0; ia_1 < bgrid->get_atoms_num(); ia_1++) + { + auto atom_1 = bgrid->get_atoms()[ia_1]; + const int iat_1 = atom_1->get_iat(); + const auto& r_1 = atom_1->get_R(); + const int nw1 = atom_1->get_nw(); + const int phi_1_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_1]; + int ia_2 = is_symm ? 
ia_1 : 0; + for (; ia_2 < bgrid->get_atoms_num(); ia_2++) + { + auto atom_2 = bgrid->get_atoms()[ia_2]; + const int iat_2 = atom_2->get_iat(); + const auto& r_2 = atom_2->get_R(); + const int nw2 = atom_2->get_nw(); + + int dm_offset = dm.find_matrix_offset(iat_1, iat_2, r_1-r_2); + if (dm_offset == -1) + { continue; } + + const int phi_dm_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_2]; + + gemm_A_.get_host_ptr()[ap_num] = phi_d + phi_1_offset; + gemm_B_.get_host_ptr()[ap_num] = dm_d + dm_offset; + gemm_C_.get_host_ptr()[ap_num] = phi_dm_d + phi_dm_offset; + gemm_lda_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_ldb_.get_host_ptr()[ap_num] = nw2; + gemm_ldc_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_m_.get_host_ptr()[ap_num] = mgrids_num_; + gemm_n_.get_host_ptr()[ap_num] = nw2; + gemm_k_.get_host_ptr()[ap_num] = nw1; + gemm_alpha_.get_host_ptr()[ap_num] = ia_1 == ia_2 ? 1.0 : 2.0; + ap_num++; + + max_n = std::max(max_n, nw2); + max_k = std::max(max_k, nw1); + } + } + } + + gemm_A_.copy_host_to_device_async(ap_num); + gemm_B_.copy_host_to_device_async(ap_num); + gemm_C_.copy_host_to_device_async(ap_num); + gemm_lda_.copy_host_to_device_async(ap_num); + gemm_ldb_.copy_host_to_device_async(ap_num); + gemm_ldc_.copy_host_to_device_async(ap_num); + gemm_m_.copy_host_to_device_async(ap_num); + gemm_n_.copy_host_to_device_async(ap_num); + gemm_k_.copy_host_to_device_async(ap_num); + if(is_symm) + { + // if is_symm == false, gemm_alpha_ always equals 1.0, + // so we don't need to copy it to device + gemm_alpha_.copy_host_to_device_async(ap_num); + } + checkCuda(cudaEventRecord(event_, stream_)); + + auto alpha_ptr = is_symm ? 
gemm_alpha_.get_device_ptr() : nullptr; + dgemm_nn_vbatch(max_m, + max_n, + max_k, + gemm_m_.get_device_ptr(), + gemm_n_.get_device_ptr(), + gemm_k_.get_device_ptr(), + gemm_A_.get_device_ptr(), + gemm_lda_.get_device_ptr(), + gemm_B_.get_device_ptr(), + gemm_ldb_.get_device_ptr(), + gemm_C_.get_device_ptr(), + gemm_ldc_.get_device_ptr(), + ap_num, + stream_, + alpha_ptr); +} + +void PhiOperatorGpu::phi_dot_phi( + const double* phi_i_d, + const double* phi_j_d, + double* rho_d) const +{ + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + phi_dot_phi_kernel<<>>( + phi_i_d, + phi_j_d, + mgrids_num_, + mgrids_local_idx_batch_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + bgrids_phi_start_.get_device_ptr(), + rho_d); + checkCudaLastError(); +} + +void PhiOperatorGpu::phi_dot_dphi( + const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* fvl_d) const +{ + dim3 grid_dim(bgrid_batch_->get_max_atoms_num_per_bgrid(), + bgrid_batch_->get_batch_size()); + dim3 threads_per_block(32); + phi_dot_dphi_kernel<<>>( + phi_d, + dphi_x_d, + dphi_y_d, + dphi_z_d, + mgrids_num_, + bgrids_phi_len_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + atoms_iat_.get_device_ptr(), + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->atom_nw_d, + fvl_d); + checkCudaLastError(); +} + +void PhiOperatorGpu::phi_dot_dphi_r( + const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* svl_d) const +{ + dim3 grid_dim(mgrids_num_, + bgrid_batch_->get_batch_size()); + dim3 threads_per_block(32); + phi_dot_dphi_r_kernel<<>>( + phi_d, + dphi_x_d, + dphi_y_d, + dphi_z_d, + mgrids_num_, + bgrids_phi_len_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + gint_gpu_vars_->mgrids_pos_d, + 
gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->atom_nw_d, + svl_d); + checkCudaLastError(); +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h new file mode 100644 index 0000000000..4988e265ce --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h @@ -0,0 +1,110 @@ +#pragma once +#include +#include + +#include "module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h" +#include "gint_helper.cuh" +#include "gint_gpu_vars.h" +#include "cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class PhiOperatorGpu +{ + +public: + PhiOperatorGpu(std::shared_ptr gint_gpu_vars, cudaStream_t stream = 0); + ~PhiOperatorGpu(); + + void set_bgrid_batch(std::shared_ptr bgrid_batch); + + void set_phi(double* phi_d) const; + + void set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_y_d, double* dphi_z_d) const; + + void set_ddphi(double* ddphi_xx_d, double* ddphi_xy_d, double* ddphi_xz_d, + double* ddphi_yy_d, double* ddphi_yz_d, double* ddphi_zz_d) const; + + void phi_mul_vldr3( + const double* vl_d, + const double dr3, + const double* phi_d, + double* result_d) const; + + void phi_mul_phi( + const double* phi_d, + const double* phi_vldr3_d, + HContainer& hRGint, + double* hr_d) const; + + void phi_mul_dm( + const double* phi_d, + const double* dm_d, + const HContainer& dm, + const bool is_symm, + double* phi_dm_d); + + void phi_dot_phi( + const double* phi_i_d, + const double* phi_j_d, + double* rho_d) const; + + void phi_dot_dphi( + const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* fvl_d) const; + + void phi_dot_dphi_r( + const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* svl_d) const; + +private: + std::shared_ptr bgrid_batch_; + std::shared_ptr gint_gpu_vars_; + + // the number 
of meshgrids on a biggrid + int mgrids_num_; + + int phi_len_; + + cudaStream_t stream_ = 0; + cudaEvent_t event_; + + // The first number in every group of two represents the number of atoms on that bigcell. + // The second number represents the cumulative number of atoms up to that bigcell. + CudaMemWrapper atoms_num_info_; + + // the iat of each atom + CudaMemWrapper atoms_iat_; + + // atoms_bgrids_rcoords_ here represents the relative coordinates from the big grid to the atoms + CudaMemWrapper atoms_bgrids_rcoords_; + + // the start index of the phi array for each atom + CudaMemWrapper atoms_phi_start_; + // The length of phi for a single meshgrid on each big grid. + CudaMemWrapper bgrids_phi_len_; + // The start index of the phi array for each big grid. + CudaMemWrapper bgrids_phi_start_; + // Mapping of the index of meshgrid in the batch of biggrids to the index of meshgrid in the local cell + CudaMemWrapper mgrids_local_idx_batch_; + + mutable CudaMemWrapper gemm_m_; + mutable CudaMemWrapper gemm_n_; + mutable CudaMemWrapper gemm_k_; + mutable CudaMemWrapper gemm_lda_; + mutable CudaMemWrapper gemm_ldb_; + mutable CudaMemWrapper gemm_ldc_; + mutable CudaMemWrapper gemm_A_; + mutable CudaMemWrapper gemm_B_; + mutable CudaMemWrapper gemm_C_; + mutable CudaMemWrapper gemm_alpha_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu new file mode 100644 index 0000000000..5db767f501 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu @@ -0,0 +1,580 @@ +#include "phi_operator_kernel.cuh" +#include "gint_helper.cuh" +#include "sph.cuh" + +namespace ModuleGint +{ + +__global__ void set_phi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + 
const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + + for (int atom_id = threadIdx.x; atom_id < atoms_num; atom_id += blockDim.x) + { + const int atom_type = iat2it[atoms_iat[atom_id + pre_atoms_num]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; // rcoord is the ralative coordinate of an atom and a biggrid + const double3 coord = make_double3(mgrid_pos.x-rcoord.x, // coord is the relative coordinate of an atom and a meshgrid + mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z); + double dist = norm3d(coord.x, coord.y, coord.z); + if (dist < rcut[atom_type]) + { + if (dist < 1.0E-9) + { dist += 1.0E-9; } + // since nwl is less or equal than 5, the size of ylma is (5+1)^2 + double ylma[36]; + const int nwl = ucell_atom_nwl[atom_type]; + sph_harm(nwl, ylmcoef, coord.x/dist, coord.y/dist, coord.z/dist, ylma); + + const double pos = dist / dr_uniform; + const int ip = static_cast(pos); + const double dx = pos - ip; + const double dx2 = dx * dx; + const double dx3 = dx2 * dx; + + const double c3 = 3.0 * dx2 - 2.0 * dx3; + const double c1 = 1.0 - c3; + const double c2 = (dx - 2.0 * dx2 + dx3) * dr_uniform; + const double c4 = (dx3 - dx2) * dr_uniform; + + double psi = 0; + const int it_nw = atom_type * nwmax; + int iw_nr = it_nw * nrmax + 
ip; + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + + for (int iw = 0; iw < atom_nw[atom_type]; iw++, iw_nr += nrmax) + { + if (atom_iw2_new[it_nw + iw]) + { + psi = c1 * psi_u[iw_nr] + c2 * dpsi_u[iw_nr] + + c3 * psi_u[iw_nr + 1] + c4 * dpsi_u[iw_nr + 1]; + } + phi[phi_idx + iw] = psi * ylma[atom_iw2_ylm[it_nw + iw]]; + } + } + else + { + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for (int iw = 0; iw < atom_nw[atom_type]; iw++) + { + phi[phi_idx + iw] = 0.0; + } + } + } +} + +__global__ void set_phi_dphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi, + double* __restrict__ dphi_x, + double* __restrict__ dphi_y, + double* __restrict__ dphi_z) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + + for (int atom_id = threadIdx.x; atom_id < atoms_num; atom_id += blockDim.x) + { + const int atom_type = iat2it[atoms_iat[atom_id + pre_atoms_num]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; + const double3 coord = make_double3(mgrid_pos.x-rcoord.x, + 
mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z); + double dist = norm3d(coord.x, coord.y, coord.z); + if (dist < rcut[atom_type]) + { + if (dist < 1.0E-9) + { dist += 1.0E-9; } + // since nwl is less or equal than 5, the size of rly is (5+1)^2 + // size of grly = 36 * 3 + double rly[36]; + double grly[36 * 3]; + const int nwl = ucell_atom_nwl[atom_type]; + grad_rl_sph_harm(nwl, ylmcoef, coord.x, coord.y, coord.z, rly, grly); + + // interpolation + const double pos = dist / dr_uniform; + const int ip = static_cast(pos); + const double x0 = pos - ip; + const double x1 = 1.0 - x0; + const double x2 = 2.0 - x0; + const double x3 = 3.0 - x0; + const double x12 = x1 * x2 / 6; + const double x03 = x0 * x3 / 2; + double tmp = 0; + double dtmp = 0; + const int it_nw = atom_type * nwmax; + int iw_nr = it_nw * nrmax + ip; + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for (int iw = 0; iw < atom_nw[atom_type]; iw++, iw_nr += nrmax) + { + if (atom_iw2_new[it_nw + iw]) + { + tmp = x12 * (psi_u[iw_nr] * x3 + psi_u[iw_nr + 3] * x0) + + x03 * (psi_u[iw_nr + 1] * x2 - psi_u[iw_nr + 2] * x1); + dtmp = x12 * (dpsi_u[iw_nr] * x3 + dpsi_u[iw_nr + 3] * x0) + + x03 * (dpsi_u[iw_nr + 1] * x2 - dpsi_u[iw_nr + 2] * x1); + } + const int iw_l = atom_iw2_l[it_nw + iw]; + const int idx_ylm = atom_iw2_ylm [it_nw + iw]; + const double rl = pow_int(dist, iw_l); + const double tmprl = tmp / rl; + + // if phi == nullptr, it means that we only need dphi. + if(phi != nullptr) + { + phi[phi_idx + iw] = tmprl * rly[idx_ylm]; + } + // derivative of wave functions with respect to atom positions. 
+ const double tmpdphi_rly = (dtmp - tmp * iw_l / dist) / rl * rly[idx_ylm] / dist; + + dphi_x[phi_idx + iw] = tmpdphi_rly * coord.x + tmprl * grly[idx_ylm * 3 + 0]; + dphi_y[phi_idx + iw] = tmpdphi_rly * coord.y + tmprl * grly[idx_ylm * 3 + 1]; + dphi_z[phi_idx + iw] = tmpdphi_rly * coord.z + tmprl * grly[idx_ylm * 3 + 2]; + } + } + else + { + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for (int iw = 0; iw < atom_nw[atom_type]; iw++) + { + if(phi != nullptr) + { + phi[phi_idx + iw] = 0.0; + } + dphi_x[phi_idx + iw] = 0.0; + dphi_y[phi_idx + iw] = 0.0; + dphi_z[phi_idx + iw] = 0.0; + } + } + } +} + +// The code for `set_ddphi_kernel` is quite difficult to understand. +// To grasp it, you better refer to the CPU function `set_ddphi` +__global__ void set_ddphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ ddphi_xx, + double* __restrict__ ddphi_xy, + double* __restrict__ ddphi_xz, + double* __restrict__ ddphi_yy, + double* __restrict__ ddphi_yz, + double* __restrict__ ddphi_zz) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + + for (int 
atom_id = threadIdx.x; atom_id < atoms_num; atom_id += blockDim.x) + { + const int atom_type = iat2it[atoms_iat[atom_id + pre_atoms_num]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; + double coord[3]{mgrid_pos.x-rcoord.x, + mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z}; + double dist = norm3d(coord[0], coord[1], coord[2]); + if (dist < rcut[atom_type]) + { + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for(int i = 0; i < 6; i++) + { + coord[i/2] += std::pow(-1, i%2) * 0.0001; + double dist = norm3d(coord[0], coord[1], coord[2]); + if (dist < 1.0E-9) + { dist += 1.0E-9; } + // since nwl is less or equal than 5, the size of rly is (5+1)^2 + // size of grly = 36 * 3 + double rly[36]; + double grly[36 * 3]; + const int nwl = ucell_atom_nwl[atom_type]; + grad_rl_sph_harm(nwl, ylmcoef, coord[0], coord[1], coord[2], rly, grly); + + // interpolation + const double pos = dist / dr_uniform; + const int ip = static_cast(pos); + const double x0 = pos - ip; + const double x1 = 1.0 - x0; + const double x2 = 2.0 - x0; + const double x3 = 3.0 - x0; + const double x12 = x1 * x2 / 6; + const double x03 = x0 * x3 / 2; + double tmp = 0; + double dtmp = 0; + const int it_nw = atom_type * nwmax; + int iw_nr = it_nw * nrmax + ip; + for (int iw = 0; iw < atom_nw[atom_type]; iw++, iw_nr += nrmax) + { + if (atom_iw2_new[it_nw + iw]) + { + tmp = x12 * (psi_u[iw_nr] * x3 + psi_u[iw_nr + 3] * x0) + + x03 * (psi_u[iw_nr + 1] * x2 - psi_u[iw_nr + 2] * x1); + dtmp = x12 * (dpsi_u[iw_nr] * x3 + dpsi_u[iw_nr + 3] * x0) + + x03 * (dpsi_u[iw_nr + 1] * x2 - dpsi_u[iw_nr + 2] * x1); + } + const int iw_l = atom_iw2_l[it_nw + iw]; + const int idx_ylm = atom_iw2_ylm [it_nw + iw]; + const double rl = pow_int(dist, iw_l); + const double tmprl = tmp / rl; + const double tmpdphi_rly = (dtmp - tmp * iw_l / dist) / rl * rly[idx_ylm] / dist; + + double dphi[3]; + dphi[0] = tmpdphi_rly * coord[0] + tmprl * grly[idx_ylm * 3 + 0]; + 
dphi[1] = tmpdphi_rly * coord[1] + tmprl * grly[idx_ylm * 3 + 1]; + dphi[2] = tmpdphi_rly * coord[2] + tmprl * grly[idx_ylm * 3 + 2]; + + if (i == 0) + { + ddphi_xx[phi_idx + iw] += dphi[0]; + ddphi_xy[phi_idx + iw] += dphi[1]; + ddphi_xz[phi_idx + iw] += dphi[2]; + } else if (i == 1) + { + ddphi_xx[phi_idx + iw] -= dphi[0]; + ddphi_xy[phi_idx + iw] -= dphi[1]; + ddphi_xz[phi_idx + iw] -= dphi[2]; + } else if (i == 2) + { + ddphi_xy[phi_idx + iw] += dphi[0]; + ddphi_yy[phi_idx + iw] += dphi[1]; + ddphi_yz[phi_idx + iw] += dphi[2]; + } else if (i == 3) + { + ddphi_xy[phi_idx + iw] -= dphi[0]; + ddphi_yy[phi_idx + iw] -= dphi[1]; + ddphi_yz[phi_idx + iw] -= dphi[2]; + } else if (i == 4) + { + ddphi_xz[phi_idx + iw] += dphi[0]; + ddphi_yz[phi_idx + iw] += dphi[1]; + ddphi_zz[phi_idx + iw] += dphi[2]; + } else // i == 5 + { + ddphi_xz[phi_idx + iw] -= dphi[0]; + ddphi_yz[phi_idx + iw] -= dphi[1]; + ddphi_zz[phi_idx + iw] -= dphi[2]; + } + } + coord[i/2] -= std::pow(-1, i%2) * 0.0001; // recover coord + } + + for (int iw = 0; iw < atom_nw[atom_type]; iw++) + { + ddphi_xx[phi_idx + iw] /= 0.0002; + ddphi_xy[phi_idx + iw] /= 0.0004; + ddphi_xz[phi_idx + iw] /= 0.0004; + ddphi_yy[phi_idx + iw] /= 0.0002; + ddphi_yz[phi_idx + iw] /= 0.0004; + ddphi_zz[phi_idx + iw] /= 0.0002; + } + } + } +} + +__global__ void phi_mul_vldr3_kernel( + const double* __restrict__ vl, + const double dr3, + const double* __restrict__ phi, + const int mgrids_per_bgrid, + const int* __restrict__ mgrids_local_idx, + const int* __restrict__ bgrids_phi_len, + const int* __restrict__ bgrids_phi_start, + double* __restrict__ result) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int phi_len = bgrids_phi_len[bgrid_id]; + const int phi_start = bgrids_phi_start[bgrid_id] + mgrid_id * phi_len; + const int mgrid_id_in_batch = bgrid_id * mgrids_per_bgrid + mgrid_id; + const double vldr3 = vl[mgrids_local_idx[mgrid_id_in_batch]] * dr3; + for(int i = threadIdx.x; i < phi_len; i 
+= blockDim.x) + { + result[phi_start + i] = phi[phi_start + i] * vldr3; + } +} + +// rho(ir) = \sum_{iwt} \phi_i(ir,iwt) * \phi_j^*(ir,iwt) +// each block calculate the dot product of phi_i and phi_j of a meshgrid +__global__ void phi_dot_phi_kernel( + const double* __restrict__ phi_i, + const double* __restrict__ phi_j, + const int mgrids_per_bgrid, + const int* __restrict__ mgrids_local_idx, + const int* __restrict__ bgrids_phi_len, + const int* __restrict__ bgrids_phi_start, + double* __restrict__ rho) +{ + __shared__ double s_data[32]; // the length of s_data equals the max warp num of a block + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int phi_len = bgrids_phi_len[bgrid_id]; + const int phi_start = bgrids_phi_start[bgrid_id] + mgrid_id * phi_len; + const double* phi_i_mgrid = phi_i + phi_start; + const double* phi_j_mgrid = phi_j + phi_start; + const int mgrid_id_in_batch = bgrid_id * mgrids_per_bgrid + mgrid_id; + const int mgrid_local_idx = mgrids_local_idx[mgrid_id_in_batch]; + const int tid = threadIdx.x; + const int warp_id = tid / 32; + const int lane_id = tid % 32; + double tmp_sum = 0; + + for (int i = tid; i < phi_len; i += blockDim.x) + { + tmp_sum += phi_i_mgrid[i] * phi_j_mgrid[i]; + } + + tmp_sum = warpReduceSum(tmp_sum); + + if (lane_id == 0) + { + s_data[warp_id] = tmp_sum; + } + __syncthreads(); + + tmp_sum = (tid < blockDim.x / 32) ? 
s_data[tid] : 0; + if(warp_id == 0) + { + tmp_sum = warpReduceSum(tmp_sum); + } + + if(tid == 0) + { + rho[mgrid_local_idx] += tmp_sum; + } +} + +__global__ void phi_dot_dphi_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ atoms_iat, + const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* force) +{ + __shared__ double s_data[32 * 3]; // the length of s_data equals the max warp num of a block times 3 + const int bgrid_id = blockIdx.y; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const int bgrid_phi_len = bgrids_phi_len[bgrid_id]; + const int tid = threadIdx.x; + const int warp_id = tid / 32; + const int lane_id = tid % 32; + + for (int atom_id = blockIdx.x; atom_id < atoms_num; atom_id += gridDim.x) + { + const int atom_phi_start = atoms_phi_start[atom_id + pre_atoms_num]; + const int iat = atoms_iat[atom_id + pre_atoms_num]; + const int nw = atom_nw[iat2it[iat]]; + double f[3] = {0.0, 0.0, 0.0}; + for (int mgrid_id = 0; mgrid_id < mgrids_per_bgrid; mgrid_id++) + { + const int phi_start = atom_phi_start + mgrid_id * bgrid_phi_len; + for (int iw = tid; iw < nw; iw += blockDim.x) + { + int phi_idx = phi_start + iw; + f[0] += phi[phi_idx] * dphi_x[phi_idx]; + f[1] += phi[phi_idx] * dphi_y[phi_idx]; + f[2] += phi[phi_idx] * dphi_z[phi_idx]; + } + } + + // reduce the force in each block + for (int i = 0; i < 3; i++) + { + f[i] = warpReduceSum(f[i]); + } + + if (lane_id == 0) + { + for (int i = 0; i < 3; i++) + { + s_data[warp_id * 3 + i] = f[i]; + } + } + __syncthreads(); + + for (int i = 0; i < 3; i++) + { + f[i] = (tid < blockDim.x / 32) ? 
s_data[tid * 3 + i] : 0; + } + if (warp_id == 0) + { + for (int i = 0; i < 3; i++) + { + f[i] = warpReduceSum(f[i]); + } + } + if (tid == 0) + { + for (int i = 0; i < 3; i++) + { + atomicAdd(&force[iat * 3 + i], f[i] * 2); + } + } + } +} + +__global__ void phi_dot_dphi_r_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* __restrict__ svl) +{ + __shared__ double s_data[32 * 6]; // the length of s_data equals the max warp num of a block times 6 + const int tid = threadIdx.x; + const int bgrid_id = blockIdx.y; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const int bgrid_phi_len = bgrids_phi_len[bgrid_id]; + const int warp_id = tid / 32; + const int lane_id = tid % 32; + + double stress[6]{0.0}; + for (int mgrid_id = blockIdx.x; mgrid_id < mgrids_per_bgrid; mgrid_id += gridDim.x) + { + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + for (int atom_id = 0; atom_id < atoms_num; atom_id++) + { + const int atom_phi_start = atoms_phi_start[atom_id + pre_atoms_num] + mgrid_id * bgrid_phi_len; + const int iat = atoms_iat[atom_id + pre_atoms_num]; + const int nw = atom_nw[iat2it[iat]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; // rcoord is the ralative coordinate of an atom and a biggrid + const double3 coord = make_double3(mgrid_pos.x-rcoord.x, // coord is the relative coordinate of an atom and a meshgrid + mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z); + for (int iw = tid; iw < nw; iw += blockDim.x) + { + int phi_idx = 
atom_phi_start + iw; + stress[0] += phi[phi_idx] * dphi_x[phi_idx] * coord.x; + stress[1] += phi[phi_idx] * dphi_x[phi_idx] * coord.y; + stress[2] += phi[phi_idx] * dphi_x[phi_idx] * coord.z; + stress[3] += phi[phi_idx] * dphi_y[phi_idx] * coord.y; + stress[4] += phi[phi_idx] * dphi_y[phi_idx] * coord.z; + stress[5] += phi[phi_idx] * dphi_z[phi_idx] * coord.z; + } + } + } + + // reduce the stress in each block + for (int i = 0; i < 6; i++) + { + stress[i] = warpReduceSum(stress[i]); + } + + if (lane_id == 0) + { + for (int i = 0; i < 6; i++) + { + s_data[warp_id * 6 + i] = stress[i]; + } + } + __syncthreads(); + + for (int i = 0; i < 6; i++) + { + stress[i] = (tid < blockDim.x / 32) ? s_data[tid * 6 + i] : 0; + } + if (warp_id == 0) + { + for (int i = 0; i < 6; i++) + { + stress[i] = warpReduceSum(stress[i]); + } + } + if (tid == 0) + { + for (int i = 0; i < 6; i++) + { + atomicAdd(&svl[i], stress[i] * 2); + } + } +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh new file mode 100644 index 0000000000..4d32475542 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh @@ -0,0 +1,135 @@ +#pragma once + +#include + +namespace ModuleGint +{ + +__global__ void set_phi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + 
const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi); + +__global__ void set_phi_dphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi, + double* __restrict__ dphi_x, + double* __restrict__ dphi_y, + double* __restrict__ dphi_z); + +__global__ void set_ddphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ ddphi_xx, + double* __restrict__ ddphi_xy, + double* __restrict__ ddphi_xz, + double* __restrict__ ddphi_yy, + double* __restrict__ ddphi_yz, + double* __restrict__ ddphi_zz); + +__global__ void phi_mul_vldr3_kernel( 
+ const double* __restrict__ vl, + const double dr3, + const double* __restrict__ phi, + const int mgrids_per_bgrid, + const int* __restrict__ mgrids_local_idx, + const int* __restrict__ bgrids_phi_len, + const int* __restrict__ bgrids_phi_start, + double* __restrict__ result); + +// rho(ir) = \sum_{iwt} \phi_i(ir,iwt) * \phi_j^*(ir,iwt) +// each block calculate the dot product of phi_i and phi_j of a meshgrid +__global__ void phi_dot_phi_kernel( + const double* __restrict__ phi_i, // phi_i(ir,iwt) + const double* __restrict__ phi_j, // phi_j(ir,iwt) + const int mgrids_per_bgrid, // the number of mgrids of each biggrid + const int* __restrict__ mgrids_local_idx, // the idx of mgrid in local cell + const int* __restrict__ bgrids_phi_len, // the length of phi on a mgrid of a biggrid + const int* __restrict__ bgrids_phi_start, // the start idx in phi of each biggrid + double* __restrict__ rho); // rho(ir) + +__global__ void phi_dot_dphi_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ atoms_iat, + const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* force); + +__global__ void phi_dot_dphi_r_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* __restrict__ svl); + +} diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu new file mode 100644 index 0000000000..38fba5de00 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu @@ -0,0 +1,13 @@ +#include "set_const_mem.cuh" +#include "gint_helper.cuh" + +__constant__ double ylmcoe_d[100]; + +namespace ModuleGint +{ + __host__ void set_ylmcoe_d(const double* ylmcoe_h, double** ylmcoe_d_addr) + { + checkCuda(cudaMemcpyToSymbol(ylmcoe_d, ylmcoe_h, sizeof(double) * 100)); + checkCuda(cudaGetSymbolAddress((void**)ylmcoe_d_addr, ylmcoe_d)); + } +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh new file mode 100644 index 0000000000..715fa98cde --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh @@ -0,0 +1,7 @@ +#pragma once +#include + +namespace ModuleGint +{ +__host__ void set_ylmcoe_d(const double* ylmcoe_h, double** ylmcoe_d_addr); +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh new file mode 100644 index 0000000000..b36828222b --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh @@ -0,0 +1,396 @@ +#pragma once + +namespace ModuleGint +{ + +static __device__ void sph_harm( + const int nwl, + const double* __restrict__ ylmcoef, + const double x, + const double y, + const double z, + double* __restrict__ ylma +) +{ + /*************************** + L = 0 + ***************************/ + ylma[0] = ylmcoef[0]; // l=0, m=0 + double tmp0; + if (nwl == 0) + return; + + /*************************** + L = 1 + ***************************/ + ylma[1] = ylmcoef[1] * z; // l=1, m=0 + ylma[2] = -ylmcoef[1] * x; // l=1, m=1 + ylma[3] = 
-ylmcoef[1] * y; // l=1, m=-1 + if (nwl == 1) + return; + + /*************************** + L = 2 + ***************************/ + tmp0=ylmcoef[3] * ylma[0]; + ylma[4] = ylmcoef[2] * z * ylma[1] - tmp0 ; // l=2, m=0 + tmp0 = ylmcoef[4] * z; + ylma[5] = tmp0 * ylma[2]; // l=2,m=1 + ylma[6] = tmp0 * ylma[3]; // l=2,m=-1 + + tmp0 = ylmcoef[4] * x; + ylma[7] = ylmcoef[5] * ylma[4] - ylmcoef[6] * ylma[0] + - tmp0 * ylma[2]; // l=2,m=2 + ylma[8] = -tmp0 * ylma[3]; + if (nwl == 2) + return; + + /*************************** + L = 3 + ***************************/ + tmp0=ylmcoef[8] * ylma[1]; + ylma[9] = ylmcoef[7] * z * ylma[4] - tmp0; // l=3, m=0 + + tmp0 = ylmcoef[9] * z; + ylma[10] = tmp0 * ylma[5] - ylmcoef[10] * ylma[2]; // l=3,m=1 + ylma[11] = tmp0 * ylma[6] - ylmcoef[10] * ylma[3]; // l=3,m=-1 + + tmp0 = ylmcoef[11] * z; + ylma[12] = tmp0 * ylma[7]; // l=3,m=2 + ylma[13] = tmp0 * ylma[8]; // l=3,m=-2 + + tmp0 = ylmcoef[14] * x; + ylma[14] = ylmcoef[12] * ylma[10] - ylmcoef[13] * ylma[2] + - tmp0 * ylma[7]; // l=3,m=3 + ylma[15] = ylmcoef[12] * ylma[11] - ylmcoef[13] * ylma[3] + - tmp0 * ylma[8]; // l=3,m=-3 + if (nwl == 3) + return; + + /*************************** + L = 4 + ***************************/ + tmp0=ylmcoef[16] * ylma[4]; + ylma[16] = ylmcoef[15] * z * ylma[9] - tmp0; // l=4,m=0 + + tmp0 = ylmcoef[17] * z; + ylma[17] = tmp0 * ylma[10] - ylmcoef[18] * ylma[5]; // l=4,m=1 + ylma[18] = tmp0 * ylma[11] - ylmcoef[18] * ylma[6]; // l=4,m=-1 + + tmp0 = ylmcoef[19] * z; + ylma[19] = tmp0 * ylma[12] - ylmcoef[20] * ylma[7]; // l=4,m=2 + ylma[20] = tmp0 * ylma[13] - ylmcoef[20] * ylma[8]; // l=4,m=-2 + + tmp0 = 3.0 * z; + ylma[21] = tmp0 * ylma[14]; // l=4,m=3 + ylma[22] = tmp0 * ylma[15]; // l=4,m=-3 + + tmp0 = ylmcoef[23] * x; + ylma[23] = ylmcoef[21] * ylma[19] - ylmcoef[22] * ylma[7] + - tmp0 * ylma[14]; // l=4,m=4 + ylma[24] = ylmcoef[21] * ylma[20] - ylmcoef[22] * ylma[8] + - tmp0 * ylma[15]; // l=4,m=-4 + if (nwl == 4) + return; + + 
/*************************** + L = 5 + ***************************/ + tmp0=ylmcoef[25] * ylma[9]; + ylma[25] + = ylmcoef[24] * z * ylma[16] - tmp0; // l=5,m=0 + + tmp0 = ylmcoef[26] * z; + ylma[26] = tmp0 * ylma[17] - ylmcoef[27] * ylma[10]; // l=5,m=1 + ylma[27] = tmp0 * ylma[18] - ylmcoef[27] * ylma[11]; // l=5,m=-1 + + tmp0 = ylmcoef[28] * z; + ylma[28] = tmp0 * ylma[19] - ylmcoef[29] * ylma[12]; // l=5,m=2 + ylma[29] = tmp0 * ylma[20] - ylmcoef[29] * ylma[13]; // l=5,m=-2 + + tmp0 = ylmcoef[30] * z; + ylma[30] = tmp0 * ylma[21] - ylmcoef[31] * ylma[14]; // l=5,m=3 + ylma[31] = tmp0 * ylma[22] - ylmcoef[31] * ylma[15]; // l=5,m=-3 + + tmp0 = ylmcoef[32] * z; + ylma[32] = tmp0 * ylma[23]; // l=5,m=4 + ylma[33] = tmp0 * ylma[24]; // l=5,m=-4 + + tmp0 = ylmcoef[35] * x; + ylma[34] = ylmcoef[33] * ylma[30] - ylmcoef[34] * ylma[14] + - tmp0 * ylma[23]; // l=5,m=5 + ylma[35] = ylmcoef[33] * ylma[31] - ylmcoef[34] * ylma[15] + - tmp0 * ylma[24]; // l=5,m=-5 + if (nwl == 5) + return; + /* + // if nwl > 5 + for (int il = 6; il <= nwl; il++) + { + int istart = il * il; + int istart1 = (il - 1) * (il - 1); + int istart2 = (il - 2) * (il - 2); + + double fac2 = sqrt(4.0 * istart - 1.0); + double fac4 = sqrt(4.0 * istart1 - 1.0); + + for (int im = 0; im < 2 * il - 1; im++) + { + int imm = (im + 1) / 2; + ylma[istart + im] = fac2 / sqrt((double)istart - imm * imm) * (z + * ylma[istart1 + im] - sqrt((double)istart1 - imm * imm) / fac4 * + ylma[istart2 + im]); + } + + double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0)); + double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0)); + double bl3 = sqrt(2.0) / fac2; + + ylma[istart + 2 * il - 1] = (bl3 * ylma[istart + 2 * il - 5] - bl2 * + ylma[istart2 + 2 * il - 5] - 2.0 * x * ylma[istart1 + 2 * il - 3]) / + bl1; ylma[istart + 2 * il] = (bl3 * ylma[istart + 2 * il - 4] - bl2 * + ylma[istart2 + 2 * il - 4] - 2.0 * x * ylma[istart1 + 2 * il - 2]) / + bl1; + }*/ +} + +static __device__ void grad_rl_sph_harm( + const int nwl, + const double* 
__restrict__ ylmcoef, + const double x, + const double y, + const double z, + double* __restrict__ rly, + double* __restrict__ grly +) +{ + double r2 = x * x + y * y + z * z; + double tx = x * 2; + double ty = y * 2; + double tz = z * 2; + + //begin calculation + /*************************** + L = 0 + ***************************/ + rly[0] = ylmcoef[0]; //l=0, m=0 + grly[0] = grly[1] = grly[2] = 0.0; + if (nwl == 0) return; + + /*************************** + L = 1 + ***************************/ + rly[1] = ylmcoef[1]*z; //l=1, m=0 + grly[3] = grly[4] = 0.0; + grly[5] = ylmcoef[1]; + + rly[2] = -ylmcoef[1]*x; //l=1, m=1 + grly[7] = grly[8] = 0.0; + grly[6] = -ylmcoef[1]; + + rly[3] = -ylmcoef[1]*y; //l=1, m=-1 + grly[9] = grly[11] = 0.0; + grly[10] = -ylmcoef[1]; + + if (nwl == 1) return; + + /*************************** + L = 2 + ***************************/ + rly[4] = ylmcoef[2]*z*rly[1]-ylmcoef[3]*rly[0]*r2;//l=2, m=0 + grly[12] = ylmcoef[2]*z*grly[3]-ylmcoef[3]*(grly[0]*r2+rly[0]*tx);//l=2, m=0 + grly[13] = ylmcoef[2]*z*grly[4]-ylmcoef[3]*(grly[1]*r2+rly[0]*ty);//l=2, m=0 + grly[14] = ylmcoef[2]*(z*grly[5]+rly[1])-ylmcoef[3]*(grly[2]*r2+rly[0]*tz);//l=2, m=0 + + + double tmp0 = ylmcoef[4]*z; + rly[5] = tmp0*rly[2];//l=2,m=1 + grly[15] = tmp0*grly[6]; + grly[16] = tmp0*grly[7]; + grly[17] = ylmcoef[4]*(rly[2]+z*grly[8]); + + rly[6] = tmp0*rly[3];//l=2,m=-1 + grly[18] = tmp0*grly[9]; + grly[19] = tmp0*grly[10]; + grly[20] = ylmcoef[4]*(rly[3]+z*grly[11]); + + double tmp2 = ylmcoef[4]*x; + rly[7]= ylmcoef[5]*rly[4]-ylmcoef[6]*rly[0]*r2 - tmp2*rly[2];//l=2,m=2 + grly[21] = ylmcoef[5]*grly[12]-ylmcoef[6]*(rly[0]*tx+grly[0]*r2)-ylmcoef[4]*(x*grly[6]+rly[2]); + +// std::cout << "\np1 = "<< ylmcoef[5]*grly[12] << " p2 = " << -ylmcoef[6]*rly[0]*tx +// << " p3 = " << -ylmcoef[4]*x*grly[6] << " p4 = " << -ylmcoef[4]*rly[2] << std::endl; + + grly[22] = ylmcoef[5]*grly[13]-ylmcoef[6]*(rly[0]*ty+grly[1]*r2)-tmp2*grly[7]; + grly[23] = 
ylmcoef[5]*grly[14]-ylmcoef[6]*(rly[0]*tz+grly[2]*r2)-tmp2*grly[8]; + + rly[8] = -tmp2*rly[3]; + grly[24] = -ylmcoef[4]*(rly[3]+x*grly[9]); + grly[25] = -tmp2*grly[10]; + grly[26] = -tmp2*grly[11]; +// rly[8] = tmp1+tmp2*rly[3];//l=2,m=-2 + if (nwl == 2) return; + + /*************************** + L = 3 + ***************************/ + rly[9] = ylmcoef[7]*z*rly[4]-ylmcoef[8]*rly[1]*r2; //l=3, m=0 + grly[27] = ylmcoef[7]*z*grly[12]-ylmcoef[8]*(rly[1]*tx+grly[3]*r2); + grly[28] = ylmcoef[7]*z*grly[13]-ylmcoef[8]*(rly[1]*ty+grly[4]*r2); + grly[29] = ylmcoef[7]*(rly[4]+z*grly[14])-ylmcoef[8]*(rly[1]*tz+grly[5]*r2); + + double tmp3 = ylmcoef[9]*z; + rly[10] = tmp3*rly[5]-ylmcoef[10]*rly[2]*r2;//l=3,m=1 + grly[30] = tmp3*grly[15]-ylmcoef[10]*(grly[6]*r2+rly[2]*tx); + grly[31] = tmp3*grly[16]-ylmcoef[10]*(grly[7]*r2+rly[2]*ty); + grly[32] = ylmcoef[9]*(z*grly[17]+rly[5])-ylmcoef[10]*(grly[8]*r2+rly[2]*tz); + + rly[11] = tmp3*rly[6]-ylmcoef[10]*rly[3]*r2;//l=3,m=-1 + grly[33] = tmp3*grly[18]-ylmcoef[10]*(grly[9]*r2+rly[3]*tx); + grly[34] = tmp3*grly[19]-ylmcoef[10]*(grly[10]*r2+rly[3]*ty); + grly[35] = ylmcoef[9]*(z*grly[20]+rly[6])-ylmcoef[10]*(grly[11]*r2+rly[3]*tz); + + double tmp4 = ylmcoef[11]*z; + rly[12] = tmp4*rly[7];//l=3,m=2 + grly[36] = tmp4*grly[21]; + grly[37] = tmp4*grly[22]; + grly[38] = ylmcoef[11]*(z*grly[23]+rly[7]); + + rly[13] = tmp4*rly[8];//l=3,m=-2 + grly[39] = tmp4*grly[24]; + grly[40] = tmp4*grly[25]; + grly[41] = ylmcoef[11]*(z*grly[26]+rly[8]); + + double tmp5 = ylmcoef[14]*x; + rly[14] = ylmcoef[12]*rly[10]-ylmcoef[13]*rly[2]*r2-tmp5*rly[7];//l=3,m=3 + grly[42] = ylmcoef[12]*grly[30]-ylmcoef[13]*(rly[2]*tx+grly[6]*r2)-ylmcoef[14]*(rly[7]+x*grly[21]); + grly[43] = ylmcoef[12]*grly[31]-ylmcoef[13]*(rly[2]*ty+grly[7]*r2)-tmp5*grly[22]; + grly[44] = ylmcoef[12]*grly[32]-ylmcoef[13]*(rly[2]*tz+grly[8]*r2)-tmp5*grly[23]; + + rly[15] = ylmcoef[12]*rly[11]-ylmcoef[13]*rly[3]*r2-tmp5*rly[8];//l=3,m=-3 + grly[45] = 
ylmcoef[12]*grly[33]-ylmcoef[13]*(rly[3]*tx+grly[9]*r2)-ylmcoef[14]*(rly[8]+x*grly[24]); + grly[46] = ylmcoef[12]*grly[34]-ylmcoef[13]*(rly[3]*ty+grly[10]*r2)-tmp5*grly[25]; + grly[47] = ylmcoef[12]*grly[35]-ylmcoef[13]*(rly[3]*tz+grly[11]*r2)-tmp5*grly[26]; + if (nwl == 3) return; + + /*************************** + L = 4 + ***************************/ + rly[16] = ylmcoef[15]*z*rly[9]-ylmcoef[16]*rly[4]*r2;//l=4,m=0 + grly[48] = ylmcoef[15]*z*grly[27]-ylmcoef[16]*(rly[4]*tx+grly[12]*r2); + grly[49] = ylmcoef[15]*z*grly[28]-ylmcoef[16]*(rly[4]*ty+grly[13]*r2); + grly[50] = ylmcoef[15]*(z*grly[29]+rly[9])-ylmcoef[16]*(rly[4]*tz+grly[14]*r2); + + double tmp6 = ylmcoef[17]*z; + rly[17] = tmp6*rly[10]-ylmcoef[18]*rly[5]*r2;//l=4,m=1 + grly[51] = tmp6*grly[30]-ylmcoef[18]*(rly[5]*tx+grly[15]*r2); + grly[52] = tmp6*grly[31]-ylmcoef[18]*(rly[5]*ty+grly[16]*r2); + grly[53] = ylmcoef[17]*(z*grly[32]+rly[10])-ylmcoef[18]*(rly[5]*tz+grly[17]*r2); + + rly[18] = tmp6*rly[11]-ylmcoef[18]*rly[6]*r2;//l=4,m=-1 + grly[54] = tmp6*grly[33]-ylmcoef[18]*(rly[6]*tx+grly[18]*r2); + grly[55] = tmp6*grly[34]-ylmcoef[18]*(rly[6]*ty+grly[19]*r2); + grly[56] = ylmcoef[17]*(z*grly[35]+rly[11])-ylmcoef[18]*(rly[6]*tz+grly[20]*r2); + + double tmp7 = ylmcoef[19]*z; + rly[19] = tmp7*rly[12]-ylmcoef[20]*rly[7]*r2;//l=4,m=2 + grly[57] = tmp7*grly[36]-ylmcoef[20]*(rly[7]*tx+grly[21]*r2); + grly[58] = tmp7*grly[37]-ylmcoef[20]*(rly[7]*ty+grly[22]*r2); + grly[59] = ylmcoef[19]*(z*grly[38]+rly[12])-ylmcoef[20]*(rly[7]*tz+grly[23]*r2); + + rly[20] = tmp7*rly[13]-ylmcoef[20]*rly[8]*r2;//l=4,m=-2 + grly[60] = tmp7*grly[39]-ylmcoef[20]*(rly[8]*tx+grly[24]*r2); + grly[61] = tmp7*grly[40]-ylmcoef[20]*(rly[8]*ty+grly[25]*r2); + grly[62] = ylmcoef[19]*(z*grly[41]+rly[13])-ylmcoef[20]*(rly[8]*tz+grly[26]*r2); + + double tmp8 = 3.0*z; + rly[21] = tmp8*rly[14];//l=4,m=3 + grly[63] = tmp8*grly[42]; + grly[64] = tmp8*grly[43]; + grly[65] = 3.0*(z*grly[44]+rly[14]); + + + rly[22] = tmp8*rly[15];//l=4,m=-3 + grly[66] = 
tmp8*grly[45]; + grly[67] = tmp8*grly[46]; + grly[68] = 3.0*(z*grly[47]+rly[15]); + + double tmp9 = ylmcoef[23]*x; + rly[23] = ylmcoef[21]*rly[19]-ylmcoef[22]*rly[7]*r2-tmp9*rly[14];//l=4,m=4 + grly[69] = ylmcoef[21]*grly[57]-ylmcoef[22]*(rly[7]*tx+grly[21]*r2)-ylmcoef[23]*(x*grly[42]+rly[14]); + grly[70] = ylmcoef[21]*grly[58]-ylmcoef[22]*(rly[7]*ty+grly[22]*r2)-tmp9*grly[43]; + grly[71] = ylmcoef[21]*grly[59]-ylmcoef[22]*(rly[7]*tz+grly[23]*r2)-tmp9*grly[44]; + + rly[24] = ylmcoef[21]*rly[20]-ylmcoef[22]*rly[8]*r2-tmp9*rly[15];//l=4,m=-4 + grly[72] = ylmcoef[21]*grly[60]-ylmcoef[22]*(rly[8]*tx+grly[24]*r2)-ylmcoef[23]*(x*grly[45]+rly[15]); + grly[73] = ylmcoef[21]*grly[61]-ylmcoef[22]*(rly[8]*ty+grly[25]*r2)-tmp9*grly[46]; + grly[74] = ylmcoef[21]*grly[62]-ylmcoef[22]*(rly[8]*tz+grly[26]*r2)-tmp9*grly[47]; + + if (nwl == 4) return; + + /*************************** + L = 5 + ***************************/ + rly[25] = ylmcoef[24]*z*rly[16]-ylmcoef[25]*rly[9]*r2;//l=5,m=0 + grly[75] = ylmcoef[24]*z*grly[48]-ylmcoef[25]*(rly[9]*tx+grly[27]*r2); + grly[76] = ylmcoef[24]*z*grly[49]-ylmcoef[25]*(rly[9]*ty+grly[28]*r2); + grly[77] = ylmcoef[24]*(z*grly[50]+rly[16])-ylmcoef[25]*(rly[9]*tz+grly[29]*r2); + + double tmp10 = ylmcoef[26]*z; + rly[26] = tmp10*rly[17]-ylmcoef[27]*rly[10]*r2;//l=5,m=1 + grly[78] = tmp10*grly[51]-ylmcoef[27]*(rly[10]*tx+grly[30]*r2); + grly[79] = tmp10*grly[52]-ylmcoef[27]*(rly[10]*ty+grly[31]*r2); + grly[80] = ylmcoef[26]*(z*grly[53]+rly[17])-ylmcoef[27]*(rly[10]*tz+grly[32]*r2); + + rly[27] = tmp10*rly[18]-ylmcoef[27]*rly[11]*r2;//l=5,m=-1 + grly[81] = tmp10*grly[54]-ylmcoef[27]*(rly[11]*tx+grly[33]*r2); + grly[82] = tmp10*grly[55]-ylmcoef[27]*(rly[11]*ty+grly[34]*r2); + grly[83] = ylmcoef[26]*(z*grly[56]+rly[18])-ylmcoef[27]*(rly[11]*tz+grly[35]*r2); + + double tmp11 = ylmcoef[28]*z; + rly[28] = tmp11*rly[19]-ylmcoef[29]*rly[12]*r2;//l=5,m=2 + grly[84] = tmp11*grly[57]-ylmcoef[29]*(rly[12]*tx+grly[36]*r2); + grly[85] = 
tmp11*grly[58]-ylmcoef[29]*(rly[12]*ty+grly[37]*r2); + grly[86] = ylmcoef[28]*(z*grly[59]+rly[19])-ylmcoef[29]*(rly[12]*tz+grly[38]*r2); + + rly[29] = tmp11*rly[20]-ylmcoef[29]*rly[13]*r2;//l=5,m=-2 + grly[87] = tmp11*grly[60]-ylmcoef[29]*(rly[13]*tx+grly[39]*r2); + grly[88] = tmp11*grly[61]-ylmcoef[29]*(rly[13]*ty+grly[40]*r2); + grly[89] = ylmcoef[28]*(z*grly[62]+rly[20])-ylmcoef[29]*(rly[13]*tz+grly[41]*r2); + + double tmp12 = ylmcoef[30]*z; + rly[30] = tmp12*rly[21]-ylmcoef[31]*rly[14]*r2;//l=5,m=3 + grly[90] = tmp12*grly[63]-ylmcoef[31]*(grly[42]*r2+rly[14]*tx); + grly[91] = tmp12*grly[64]-ylmcoef[31]*(grly[43]*r2+rly[14]*ty); + grly[92] = ylmcoef[30]*(z*grly[65]+rly[21])-ylmcoef[31]*(grly[44]*r2+rly[14]*tz); + + rly[31] = tmp12*rly[22]-ylmcoef[31]*rly[15]*r2;//l=5,m=-3 + grly[93] = tmp12*grly[66]-ylmcoef[31]*(grly[45]*r2+rly[15]*tx); + grly[94] = tmp12*grly[67]-ylmcoef[31]*(grly[46]*r2+rly[15]*ty); + grly[95] = ylmcoef[30]*(z*grly[68]+rly[22])-ylmcoef[31]*(grly[47]*r2+rly[15]*tz); + + double tmp13 = ylmcoef[32]*z; + rly[32] = tmp13*rly[23];//l=5,m=4 + grly[96] = tmp13*grly[69]; + grly[97] = tmp13*grly[70]; + grly[98] = ylmcoef[32]*(rly[23]+z*grly[71]); + + rly[33] = tmp13*rly[24];//l=5,m=-4 + grly[99] = tmp13*grly[72]; + grly[100] = tmp13*grly[73]; + grly[101] = ylmcoef[32]*(rly[24]+z*grly[74]); + + double tmp14 = ylmcoef[35]*x; + rly[34] = ylmcoef[33]*rly[30]-ylmcoef[34]*rly[14]*r2-tmp14*rly[23];//l=5,m=5 + grly[102] = ylmcoef[33]*grly[90]-ylmcoef[34]*(rly[14]*tx+grly[42]*r2)-ylmcoef[35]*(x*grly[69]+rly[23]); + grly[103] = ylmcoef[33]*grly[91]-ylmcoef[34]*(rly[14]*ty+grly[43]*r2)-tmp14*grly[70]; + grly[104] = ylmcoef[33]*grly[92]-ylmcoef[34]*(rly[14]*tz+grly[44]*r2)-tmp14*grly[71]; + + rly[35] = ylmcoef[33]*rly[31]-ylmcoef[34]*rly[15]*r2-tmp14*rly[24];//l=5,m=-5 + grly[105] = ylmcoef[33]*grly[93]-ylmcoef[34]*(rly[15]*tx+grly[45]*r2)-ylmcoef[35]*(x*grly[72]+rly[24]); + grly[106] = ylmcoef[33]*grly[94]-ylmcoef[34]*(rly[15]*ty+grly[46]*r2)-tmp14*grly[73]; + 
grly[107] = ylmcoef[33]*grly[95]-ylmcoef[34]*(rly[15]*tz+grly[47]*r2)-tmp14*grly[74]; + + if (nwl == 5) return; +} +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h index f24d1194b4..0c146b86ab 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h @@ -17,16 +17,16 @@ class LocalCellInfo std::shared_ptr unitcell_info); // getter functions - int get_startidx_bx() const { return startidx_bx_; }; - int get_startidx_by() const { return startidx_by_; }; - int get_startidx_bz() const { return startidx_bz_; }; - int get_nbx() const { return nbx_; }; - int get_nby() const { return nby_; }; - int get_nbz() const { return nbz_; }; - int get_bgrids_num() const { return nbxyz_; }; - int get_mgrids_num() const { return nmxyz_; }; - std::shared_ptr get_unitcell_info() const { return unitcell_info_; }; - std::shared_ptr get_bgrid_info() const { return unitcell_info_->get_bgrid_info(); }; + int get_startidx_bx() const { return startidx_bx_; } + int get_startidx_by() const { return startidx_by_; } + int get_startidx_bz() const { return startidx_bz_; } + int get_nbx() const { return nbx_; } + int get_nby() const { return nby_; } + int get_nbz() const { return nbz_; } + int get_bgrids_num() const { return nbxyz_; } + int get_mgrids_num() const { return nmxyz_; } + std::shared_ptr get_unitcell_info() const { return unitcell_info_; } + std::shared_ptr get_bgrid_info() const { return unitcell_info_->get_bgrid_info(); } //==================================================================== // functions related to the big grid diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h index 99376c9a20..a8307b1048 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h 
+++ b/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h @@ -35,10 +35,10 @@ class MeshGridInfo meshgrid_GT_ = meshgrid_latvec0_.Inverse(); meshgrid_volume_ = std::abs(meshgrid_latvec0_.Det()); - }; - - double get_volume() const { return meshgrid_volume_; }; - Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * meshgrid_latvec0_; }; + } + + double get_volume() const { return meshgrid_volume_; } + Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * meshgrid_latvec0_; } Vec3d get_direct_coord(const Vec3d& cart_coord) const { return cart_coord * meshgrid_GT_; } private: diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp index d714546864..5df52f9453 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp @@ -9,7 +9,7 @@ void PhiOperator::set_bgrid(std::shared_ptr biggrid) { biggrid_ = biggrid; rows_ = biggrid_->get_mgrids_num(); - cols_ = biggrid_->get_mgrid_phi_len(); + cols_ = biggrid_->get_phi_len(); biggrid_->set_atoms_startidx(atoms_startidx_); biggrid_->set_atoms_phi_len(atoms_phi_len_); @@ -18,14 +18,13 @@ void PhiOperator::set_bgrid(std::shared_ptr biggrid) // init is_atom_on_mgrid_ and atoms_relative_coords_ const int atoms_num = biggrid_->get_atoms_num(); atoms_relative_coords_.resize(atoms_num); - is_atom_on_mgrid_.resize(atoms_num); + is_atom_on_mgrid_.resize(biggrid_->get_mgrids_num() * atoms_num); for(int i = 0; i < atoms_num; ++i) { biggrid_->set_atom_relative_coords(biggrid_->get_atom(i), atoms_relative_coords_[i]); - is_atom_on_mgrid_[i].resize(rows_); for(int j = 0; j < rows_; ++j) { - is_atom_on_mgrid_[i][j] = atoms_relative_coords_[i][j].norm() <= biggrid_->get_atom(i)->get_rcut(); + is_atom_on_mgrid_[i * rows_ + j] = atoms_relative_coords_[i][j].norm() <= biggrid_->get_atom(i)->get_rcut(); } } @@ -109,10 
+108,10 @@ void PhiOperator::phi_dot_dphi_r( for(int j = 0; j < biggrid_->get_atoms_num(); ++j) { const int start_idx = atoms_startidx_[j]; + const Vec3d& r3 = atoms_relative_coords_[j][i]; for(int k = 0; k < atoms_phi_len_[j]; ++k) { const int idx = i * cols_ + start_idx + k; - const Vec3d& r3 = atoms_relative_coords_[j][i]; const double phi_val = phi[idx]; sxx += phi_val * dphi_x[idx] * r3[0]; sxy += phi_val * dphi_x[idx] * r3[1]; @@ -131,6 +130,86 @@ void PhiOperator::phi_dot_dphi_r( svl[0](2, 2) += szz * 2; } +void PhiOperator::cal_env_gamma( + const double* phi, + const double* wfc, + const vector& trace_lo, + double* rho) const +{ + for(int i = 0; i < biggrid_->get_atoms_num(); ++i) + { + const auto atom = biggrid_->get_atom(i); + const int iw_start = atom->get_start_iw(); + const int start_idx = atoms_startidx_[i]; + for(int j = 0; j < biggrid_->get_mgrids_num(); ++j) + { + if(is_atom_on_mgrid(i, j)) + { + double tmp = 0.0; + int iw_lo = trace_lo[iw_start]; + for(int iw = 0; iw < atom->get_nw(); ++iw, ++iw_lo) + { + tmp += phi[j * cols_ + start_idx + iw] * wfc[iw_lo]; + } + rho[meshgrids_local_idx_[j]] += tmp; + } + } + } +} + +void PhiOperator::cal_env_k( + const double* phi, + const std::complex* wfc, + const vector& trace_lo, + const int ik, + const int nspin, + const int npol, + const int lgd, + const std::vector& kvec_c, + const std::vector& kvec_d, + double* rho) const +{ + for(int i = 0; i < biggrid_->get_atoms_num(); ++i) + { + const auto atom = biggrid_->get_atom(i); + const int iw_start = atom->get_start_iw(); + const Vec3d R(atom->get_unitcell_idx()); + const double arg = (kvec_d[ik] * R) * ModuleBase::TWO_PI; + const std::complex kphase = std::complex(cos(arg), sin(arg)); + const int start_idx = atoms_startidx_[i]; + for(int j = 0; j < biggrid_->get_mgrids_num(); ++j) + { + if(is_atom_on_mgrid(i, j)) + { + std::complex tmp{0.0, 0.0}; + int phi_start_idx = j * cols_ + start_idx; + + int iw_lo = 0; + if (nspin == 4) // is it a simple add of 2 
spins? + { + for (int is = 0; is < 2; ++is) + { + iw_lo = trace_lo[iw_start] / npol + lgd / npol * is; + for (int iw = 0; iw < atom->get_nw(); ++iw, ++iw_lo) + { + tmp += std::complex(phi[phi_start_idx + iw], 0.0) * wfc[iw_lo] * kphase; + } + } + } + else + { + iw_lo = trace_lo[iw_start]; + for (int iw = 0; iw < atom->get_nw(); ++iw, ++iw_lo) + { + tmp += std::complex(phi[phi_start_idx + iw], 0.0) * wfc[iw_lo] * kphase; + } + } + rho[meshgrids_local_idx_[j]] += tmp.real(); + } + } + } +} + //=============================== // private methods @@ -150,7 +229,7 @@ void PhiOperator::init_atom_pair_start_end_idx_() int end_idx = -1; for(int mgrid_idx = 0; mgrid_idx < mgrids_num; ++mgrid_idx) { - if(is_atom_on_mgrid_[i][mgrid_idx] && is_atom_on_mgrid_[j][mgrid_idx]) + if(is_atom_on_mgrid(i, mgrid_idx) && is_atom_on_mgrid(j, mgrid_idx)) { start_idx = mgrid_idx; break; @@ -158,7 +237,7 @@ void PhiOperator::init_atom_pair_start_end_idx_() } for(int mgrid_idx = mgrids_num - 1; mgrid_idx >= 0; --mgrid_idx) { - if(is_atom_on_mgrid_[i][mgrid_idx] && is_atom_on_mgrid_[j][mgrid_idx]) + if(is_atom_on_mgrid(i, mgrid_idx) && is_atom_on_mgrid(j, mgrid_idx)) { end_idx = mgrid_idx; break; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h index 5b5366e701..48044e0014 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" #include "big_grid.h" namespace ModuleGint @@ -28,8 +28,8 @@ class PhiOperator void set_bgrid(std::shared_ptr biggrid); // getter - int get_rows() const {return rows_;}; - int get_cols() const {return cols_;}; + int get_rows() const {return rows_;} + int get_cols() const {return cols_;} // get phi of the big grid // the dimension of phi is num_mgrids * 
(\sum_{i=0}^{atoms_->size()} atoms_[i]->nw) @@ -93,6 +93,24 @@ class PhiOperator const double* dphi_z, ModuleBase::matrix *svl) const; + void cal_env_gamma( + const double* phi, + const double* wfc, + const vector& trace_lo, + double* rho) const; + + void cal_env_k( + const double* phi, + const std::complex* wfc, + const vector& trace_lo, + const int ik, + const int nspin, + const int npol, + const int lgd, + const std::vector& kvec_c, + const std::vector& kvec_d, + double* rho) const; + private: void init_atom_pair_start_end_idx_(); @@ -103,14 +121,19 @@ class PhiOperator int x = std::min(a, b); int y = std::abs(a - b); return atom_pair_start_end_idx_[(2 * biggrid_->get_atoms_num() - x + 1) * x / 2 + y]; - }; + } + + bool is_atom_on_mgrid(int atom_idx, int mgrid_idx) const + { + return is_atom_on_mgrid_[atom_idx * rows_ + mgrid_idx]; + } // the row number of the phi matrix // rows_ = biggrid_->get_mgrids_num() int rows_; // the column number of the phi matrix - // cols_ = biggrid_->get_mgrid_phi_len() + // cols_ = biggrid_->get_phi_len() int cols_; // the local index of the meshgrids @@ -124,9 +147,8 @@ class PhiOperator std::vector> atoms_relative_coords_; // record whether the atom affects the meshgrid - // is_atom_on_mgrid_[i][j] = true if the ith atom affects the jth meshgrid, otherwise false - // FIXME,std::vector> is not a efficient data structure, we can use a 1D array to replace it. 
- std::vector> is_atom_on_mgrid_; + // is_atom_on_mgrid_[i * rows_ + j] = true if the ith atom affects jhe ith meshgrid, otherwise false + std::vector is_atom_on_mgrid_; // the start index of the phi of each atom std::vector atoms_startidx_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp index 44603560d2..79c4b29c23 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp @@ -103,7 +103,7 @@ void PhiOperator::phi_mul_vldr3( } } -// hr(iwt_i,iwt_j) = \sum_{ir} phi_i(ir,iwt_i) * phi_i(ir,iwt_j) +// hr(iwt_i,iwt_j) += \sum_{ir} phi_i(ir,iwt_i) * phi_i(ir,iwt_j) // this is a thread-safe function template void PhiOperator::phi_mul_phi( diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp index 4d01acc262..c84f087487 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp @@ -20,24 +20,6 @@ void GintAtom::set_ddphi( // orb_ does not have the member variable dr_uniform const double dr_uniform = orb_->PhiLN(0, 0).dr_uniform; - // store the pointer to reduce repeated address fetching - std::vector p_psi_uniform(atom_->nw); - std::vector p_dpsi_uniform(atom_->nw); - std::vector p_ddpsi_uniform(atom_->nw); - std::vector phi_nr_uniform(atom_->nw); - for (int iw=0; iw< atom_->nw; ++iw) - { - if ( atom_->iw2_new[iw] ) - { - int l = atom_->iw2l[iw]; - int n = atom_->iw2n[iw]; - p_psi_uniform[iw] = orb_->PhiLN(l, n).psi_uniform.data(); - p_dpsi_uniform[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); - p_ddpsi_uniform[iw] = orb_->PhiLN(l, n).ddpsi_uniform.data(); - phi_nr_uniform[iw] = orb_->PhiLN(l, n).nr_uniform; - } - } - std::vector rly(std::pow(atom_->nwl + 1, 2)); ModuleBase::Array_Pool grly(std::pow(atom_->nwl + 1, 2), 
3); // TODO: A better data structure such as a 3D tensor can be used to store dphi @@ -96,24 +78,15 @@ void GintAtom::set_ddphi( { if(atom_->iw2_new[iw]) { - auto psi_uniform = p_psi_uniform[iw]; - auto dpsi_uniform = p_dpsi_uniform[iw]; - - if(ip >= phi_nr_uniform[iw] - 4) - { - tmp = dtmp = 0.0; - } - else - { - // use Polynomia Interpolation method to get the - // wave functions - - tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) - + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); - - dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) - + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); - } + auto psi_uniform = p_psi_uniform_[iw]; + auto dpsi_uniform = p_dpsi_uniform_[iw]; + // use Polynomia Interpolation method to get the + // wave functions + tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) + + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); + + dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) + + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); } // get the 'l' of this localized wave function diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h index b75806fa2a..df1e88b38c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h @@ -21,16 +21,16 @@ class UnitCellInfo int nmx, int nmy, int nmz); // getter functions - int get_nbx() const { return nbx_; }; - int get_nby() const { return nby_; }; - int get_nbz() const { return nbz_; }; - int get_bgrids_num() const { return nbxyz_; }; - int get_nmx() const { return nmx_; }; - int get_nmy() const { return nmy_; }; - int get_nmz() const { return nmz_; }; - int get_mgrids_num() const { return nmxyz_; }; - std::shared_ptr get_bgrid_info() const { return biggrid_info_; }; - std::shared_ptr get_mgrid_info() const { return 
meshgrid_info_; }; + int get_nbx() const { return nbx_; } + int get_nby() const { return nby_; } + int get_nbz() const { return nbz_; } + int get_bgrids_num() const { return nbxyz_; } + int get_nmx() const { return nmx_; } + int get_nmy() const { return nmy_; } + int get_nmz() const { return nmz_; } + int get_mgrids_num() const { return nmxyz_; } + std::shared_ptr get_bgrid_info() const { return biggrid_info_; } + std::shared_ptr get_mgrid_info() const { return meshgrid_info_; } //==================================================================== // functions related to the big grid @@ -40,25 +40,25 @@ class UnitCellInfo Vec3i bgrid_idx_1Dto3D(const int index_1d) const { return index1Dto3D(index_1d, nbx_, nby_, nbz_); - }; + } // transform the 3D index of a biggrid in the unit cell to the 1D index int bgrid_idx_3Dto1D(const Vec3i index_3d) const { return index3Dto1D(index_3d.x, index_3d.y, index_3d.z, nbx_, nby_, nbz_); - }; + } // get the cartesian coordinate of a big grid in the unit cell from the 3D index Vec3d get_bgrid_coord(Vec3i index_3d) const { return biggrid_info_->get_cartesian_coord(index_3d); - }; + } // get the cartesian coordinate of a big grid in the unit cell from the 1D index Vec3d get_bgrid_coord(int index_1d) const { return get_bgrid_coord(bgrid_idx_1Dto3D(index_1d)); - }; + } // get the 3D index of a big grid in the unit cell from the cartesian coordinate Vec3i get_bgrid_idx_3d(const Vec3d coord) const @@ -68,7 +68,7 @@ class UnitCellInfo static_cast(floor(direct_coord.x)), static_cast(floor(direct_coord.y)), static_cast(floor(direct_coord.z))); - }; + } // Get the relative Cartesian coordinates of big grid A relative to big grid B // returned vector = coordinates of point A - coordinates of point B @@ -77,7 +77,7 @@ class UnitCellInfo Vec3d get_relative_coord(Vec3i index_3d_a, Vec3i index_3d_b) const { return get_bgrid_coord(index_3d_a - index_3d_b); - }; + } // get the extended unitcell index of a big grid Vec3i get_unitcell_idx(const Vec3i 
index_3d) const @@ -85,7 +85,7 @@ class UnitCellInfo return Vec3i(floor_div(index_3d.x, nbx_), floor_div(index_3d.y, nby_), floor_div(index_3d.z, nbz_)); - }; + } // map the extended big grid index to the big grid index in unitcell Vec3i map_ext_idx_to_ucell(const Vec3i index_3d) const @@ -93,7 +93,7 @@ class UnitCellInfo return Vec3i(index_3d.x - floor_div(index_3d.x, nbx_) * nbx_, index_3d.y - floor_div(index_3d.y, nby_) * nby_, index_3d.z - floor_div(index_3d.z, nbz_) * nbz_); - }; + } //==================================================================== @@ -116,7 +116,7 @@ class UnitCellInfo Vec3d get_mgrid_coord(Vec3i index_3d) const { return meshgrid_info_->get_cartesian_coord(index_3d); - }; + } // get the cartesian coordinate of a meshgrid in the unit cell from the 1D index Vec3d get_mgrid_coord(int index_1d) const diff --git a/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp b/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp index 54250a2ce2..89b71306a1 100644 --- a/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp +++ b/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp @@ -16,6 +16,9 @@ HContainer::~HContainer() } } +template +HContainer::HContainer() {} + // copy constructor template HContainer::HContainer(const HContainer& HR_in, T* data_array) @@ -35,17 +38,38 @@ HContainer::HContainer(const HContainer& HR_in, T* data_array) // move constructor template -HContainer::HContainer(HContainer&& HR_in) +HContainer::HContainer(HContainer&& HR_in) noexcept { this->atom_pairs = std::move(HR_in.atom_pairs); this->sparse_ap = std::move(HR_in.sparse_ap); this->sparse_ap_index = std::move(HR_in.sparse_ap_index); + this->wrapper_pointer = HR_in.wrapper_pointer; this->gamma_only = HR_in.gamma_only; this->paraV = HR_in.paraV; this->current_R = -1; + HR_in.wrapper_pointer = nullptr; // tmp terms not moved } +// move assignment +template +HContainer& HContainer::operator=(HContainer&& HR_in) noexcept +{ + if (this != &HR_in) + { + 
this->atom_pairs = std::move(HR_in.atom_pairs); + this->sparse_ap = std::move(HR_in.sparse_ap); + this->sparse_ap_index = std::move(HR_in.sparse_ap_index); + this->wrapper_pointer = HR_in.wrapper_pointer; + this->gamma_only = HR_in.gamma_only; + this->paraV = HR_in.paraV; + this->current_R = -1; + + HR_in.wrapper_pointer = nullptr; + } + return *this; +} + // simple constructor template HContainer::HContainer(int natom) diff --git a/source/module_hamilt_lcao/module_hcontainer/hcontainer.h b/source/module_hamilt_lcao/module_hcontainer/hcontainer.h index cf50e7c263..edaca9577e 100644 --- a/source/module_hamilt_lcao/module_hcontainer/hcontainer.h +++ b/source/module_hamilt_lcao/module_hcontainer/hcontainer.h @@ -146,6 +146,8 @@ class HContainer // Destructor of class HContainer ~HContainer(); + HContainer(); + /** * @brief copy constructor * when data_array is not nullptr, new HContainer will be wrapper for data_array @@ -154,7 +156,9 @@ class HContainer HContainer(const HContainer& HR_in, T* data_array = nullptr); // move constructor - HContainer(HContainer&& HR_in); + HContainer(HContainer&& HR_in) noexcept; + // move assignment + HContainer& operator=(HContainer&& HR_in) noexcept; // simple constructor HContainer(int natom); diff --git a/source/module_io/cal_ldos.cpp b/source/module_io/cal_ldos.cpp index 4826e65cc0..31133cfa81 100644 --- a/source/module_io/cal_ldos.cpp +++ b/source/module_io/cal_ldos.cpp @@ -60,7 +60,7 @@ void Cal_ldos::cal_ldos_lcao(const elecstate::ElecStateLCAO* pelec, } // calculate ldos -#ifndef __NEW_GINT +#ifdef __OLD_GINT ModuleBase::WARNING_QUIT("Cal_ldos::dm2ldos", "do not support old grid integral, please recompile with __NEW_GINT"); #else diff --git a/source/module_io/get_pchg_lcao.cpp b/source/module_io/get_pchg_lcao.cpp index b8b84f3125..316ffdb3f6 100644 --- a/source/module_io/get_pchg_lcao.cpp +++ b/source/module_io/get_pchg_lcao.cpp @@ -3,6 +3,7 @@ #include "module_io/cube_io.h" #include "source_estate/module_charge/symmetry_rho.h" 
#include "source_estate/module_dm/cal_dm_psi.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" Get_pchg_lcao::Get_pchg_lcao(psi::Psi* psi_gamma_in, const Parallel_Orbitals* ParaV_in) : psi_gamma(psi_gamma_in), ParaV(ParaV_in) @@ -69,10 +70,14 @@ void Get_pchg_lcao::begin(Gint_Gamma& gg, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(); +#ifdef __OLD_GINT gg.initialize_pvpR(*ucell_in, GridD_in, nspin); gg.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); gg.cal_gint(&inout); +#else + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); +#endif // A solution to replace the original implementation of the following code: // pelec->charge->save_rho_before_sum_band(); @@ -164,10 +169,15 @@ void Get_pchg_lcao::begin(Gint_k& gk, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(ik); +#ifdef __OLD_GINT gk.initialize_pvpR(*ucell_in, GridD_in, nspin); gk.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); gk.cal_gint(&inout); +#else + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); +#endif + // Using std::vector to replace the original double** rho_save std::vector> rho_save(nspin, std::vector(rhopw_nrxx)); @@ -206,11 +216,14 @@ void Get_pchg_lcao::begin(Gint_k& gk, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(); +#ifdef __OLD_GINT gk.initialize_pvpR(*ucell_in, GridD_in, nspin); gk.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); gk.cal_gint(&inout); - +#else + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); +#endif // Using std::vector to replace the original double** rho_save std::vector> rho_save(nspin, std::vector(rhopw_nrxx)); diff --git a/source/module_io/get_wf_lcao.cpp b/source/module_io/get_wf_lcao.cpp index 039b2231de..197068ed7b 100644 --- a/source/module_io/get_wf_lcao.cpp +++ b/source/module_io/get_wf_lcao.cpp @@ -4,6 +4,11 @@ #include "module_io/write_wfc_pw.h" #include 
"source_base/memory.h" +#ifndef __OLD_GINT +#include "module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h" +#endif + Get_wf_lcao::Get_wf_lcao(const elecstate::ElecState* pes) { pes_ = pes; @@ -40,6 +45,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, prepare_get_wf(ofs_running); +#ifdef __OLD_GINT // allocate grid wave functions for gamma_only std::vector wfc_gamma_grid(nspin); for (int is = 0; is < nspin; ++is) @@ -50,6 +56,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, wfc_gamma_grid[is][ib] = new double[gg.gridt->lgd]; } } +#endif // for pw_wfc in G space psi::Psi> psi_g; @@ -57,42 +64,48 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // if (out_wfc_pw || out_wfc_r) psi_g.resize(nspin, nbands, kv.ngk[0]); +#ifdef __OLD_GINT const double mem_size = sizeof(double) * double(gg.gridt->lgd) * double(nbands) * double(nspin) / 1024.0 / 1024.0; ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size); - ModuleBase::GlobalFunc::OUT(ofs_running, "On-the-fly memory consumption (MB)", mem_size); + ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "On-the-fly memory consumption (MB)", mem_size); +#endif // Set this->bands_picked_ this->select_bands(out_wfc_norm, nbands, fermi_band); // Calculate out_wfc_norm - for (int ib = 0; ib < nbands; ++ib) + for (int is = 0; is < nspin; ++is) { - if (bands_picked_[ib]) + psid->fix_k(is); +#ifdef __OLD_GINT + #ifdef __MPI + wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + #else + // if not MPI enabled, it is the case psid holds a global matrix. 
+ // use fix_k to switch between different spin channels (actually kpoints, + // because now the same kpoint in different spin channels are treated + // as distinct kpoints) + for (int i = 0; i < nbands; ++i) { - for (int is = 0; is < nspin; ++is) + for (int j = 0; j < nlocal; ++j) { - ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); - - psid->fix_k(is); -#ifdef __MPI - wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + wfc_gamma_grid[is][i][j] = psid[0](i, j); + } + } + #endif #else - // if not MPI enabled, it is the case psid holds a global matrix. - // use fix_k to switch between different spin channels (actually kpoints, - // because now the same kpoint in different spin channels are treated - // as distinct kpoints) - - for (int i = 0; i < nbands; ++i) - { - for (int j = 0; j < nlocal; ++j) - { - wfc_gamma_grid[is][i][j] = psid[0](i, j); - } - } + ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, nlocal, pes_->charge->rho[is]); #endif - + for (int ib = 0; ib < nbands; ++ib) + { + if (bands_picked_[ib]) + { + #ifdef __OLD_GINT + ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); - + #else + gint_env.cal_env_band(ib); + #endif pes_->charge->save_rho_before_sum_band(); // pint out information @@ -124,33 +137,37 @@ void Get_wf_lcao::begin(const UnitCell& ucell, this->select_bands(out_wfc_re_im, nbands, fermi_band); // Calculate out_wfc_re_im - for (int ib = 0; ib < nbands; ++ib) + for (int is = 0; is < nspin; ++is) { - if (bands_picked_[ib]) + psid->fix_k(is); +#ifdef __OLD_GINT + #ifdef __MPI + wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + #else + // if not MPI enabled, it is the case psid holds a global matrix. 
use fix_k to switch between + // different spin channels (actually kpoints, because now the same kpoint in different spin channels + // are treated as distinct kpoints) + for (int i = 0; i < nbands; ++i) { - for (int is = 0; is < nspin; ++is) + for (int j = 0; j < nlocal; ++j) { - ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); - - psid->fix_k(is); -#ifdef __MPI - wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + wfc_gamma_grid[is][i][j] = psid[0](i, j); + } + } + #endif #else - // if not MPI enabled, it is the case psid holds a global matrix. use fix_k to switch between - // different spin channels (actually kpoints, because now the same kpoint in different spin channels - // are treated as distinct kpoints) - - for (int i = 0; i < nbands; ++i) - { - for (int j = 0; j < nlocal; ++j) - { - wfc_gamma_grid[is][i][j] = psid[0](i, j); - } - } + ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, nlocal, pes_->charge->rho[is]); #endif - + for (int ib = 0; ib < nbands; ++ib) + { + if (bands_picked_[ib]) + { +#ifdef __OLD_GINT + ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); - +#else + gint_env.cal_env_band(ib); +#endif pes_->charge->save_rho_before_sum_band(); const double ef_tmp = this->pes_->eferm.get_efval(is); @@ -202,6 +219,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, pw_wfc, ofs_running); +#ifdef __OLD_GINT for (int is = 0; is < nspin; ++is) { for (int ib = 0; ib < nbands; ++ib) @@ -210,6 +228,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } delete[] wfc_gamma_grid[is]; } +#endif return; } @@ -240,6 +259,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // allocate grid wave functions for multi-k const int nks = kv.get_nks(); std::vector**> wfc_k_grid(nks); +#ifdef __OLD_GINT for (int ik = 0; ik < nks; ++ik) { wfc_k_grid[ik] = new std::complex*[nbands]; @@ -252,7 +272,8 @@ void 
Get_wf_lcao::begin(const UnitCell& ucell, const double mem_size = sizeof(std::complex) * double(gk.gridt->lgd) * double(nbands) * double(nks) / 1024.0 / 1024.0; ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size); - ModuleBase::GlobalFunc::OUT(ofs_running, "On-the-fly memory consumption (MB)", mem_size); + ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "On-the-fly memory consumption (MB)", mem_size); +#endif // for pw_wfc in G space psi::Psi> psi_g; @@ -263,34 +284,44 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // Set this->bands_picked_ this->select_bands(out_wfc_norm, nbands, fermi_band); - // Calculate out_wfc_norm - for (int ib = 0; ib < nbands; ++ib) + // Calculate out_wfc_norm + const int nspin0 = (nspin == 2) ? 2 : 1; + for (int ik = 0; ik < nks; ++ik) // the loop of nspin0 is included { - if (bands_picked_[ib]) + const int ispin = kv.isk[ik]; + // 2d-to-grid conversion is unified into `wfc_2d_to_grid`. + psi->fix_k(ik); + +#ifdef __OLD_GINT + #ifdef __MPI // need to deal with NSPIN=4 !!!! + wfc_2d_to_grid(psi->get_pointer(), para_orb, wfc_k_grid[ik], gk.gridt->trace_lo); + #else + for (int i = 0; i < nbands; ++i) { - const int nspin0 = (nspin == 2) ? 2 : 1; - for (int ik = 0; ik < nks; ++ik) // the loop of nspin0 is included + for (int j = 0; j < nlocal; ++j) { - const int ispin = kv.isk[ik]; + wfc_k_grid[ik][i][j] = psi[0](i, j); + } + } + #endif +#else + ModuleGint::Gint_env_k gint_env(psi->get_pointer(), ¶_orb, kv.kvec_c, kv.kvec_d, + nbands, nlocal, ik, PARAM.inp.nspin, PARAM.globalv.npol, pes_->charge->rho[ispin]); +#endif + + for (int ib = 0; ib < nbands; ++ib) + { + if (bands_picked_[ib]) + { +#ifdef __OLD_GINT ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[ispin], pw_wfc->nrxx); // terrible, you make changes on another instance's data??? - // 2d-to-grid conversion is unified into `wfc_2d_to_grid`. - psi->fix_k(ik); - -#ifdef __MPI // need to deal with NSPIN=4 !!!! 
- wfc_2d_to_grid(psi->get_pointer(), para_orb, wfc_k_grid[ik], gk.gridt->trace_lo); -#else - for (int i = 0; i < nbands; ++i) - { - for (int j = 0; j < nlocal; ++j) - { - wfc_k_grid[ik][i][j] = psi[0](i, j); - } - } -#endif // deal with NSPIN=4 gk.cal_env_k(ik, wfc_k_grid[ik][ib], pes_->charge->rho[ispin], kv.kvec_c, kv.kvec_d, ucell); +#else + gint_env.cal_env_band(ib); +#endif // ik0 is the real k-point index, starting from 0 int ik0 = kv.ik2iktot[ik]; @@ -404,7 +435,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } } } - +#ifdef __OLD_GINT for (int ik = 0; ik < nks; ++ik) { for (int ib = 0; ib < nbands; ++ib) @@ -413,7 +444,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } delete[] wfc_k_grid[ik]; } - +#endif return; } diff --git a/source/module_io/read_input_item_other.cpp b/source/module_io/read_input_item_other.cpp index 74b1e25221..5694c7b33b 100644 --- a/source/module_io/read_input_item_other.cpp +++ b/source/module_io/read_input_item_other.cpp @@ -501,6 +501,12 @@ void ReadInput::item_others() item.annotation = "whether to perform rdmft calculation, default is false"; read_sync_bool(input.rdmft); this->add_item(item); + item.check_value = [](const Input_Item& item, const Parameter& para) { + if (para.input.rdmft && para.input.nspin == 4) + { + ModuleBase::WARNING_QUIT("ReadInput", "rdmft is not available for nspin = 4"); + } + }; } { Input_Item item("rdmft_power_alpha"); diff --git a/source/module_io/write_HS_R.cpp b/source/module_io/write_HS_R.cpp index ebd91a3d35..e2540dca40 100644 --- a/source/module_io/write_HS_R.cpp +++ b/source/module_io/write_HS_R.cpp @@ -146,8 +146,6 @@ void ModuleIO::output_dHR(const int& istep, GlobalV::ofs_running << " | |" << std::endl; GlobalV::ofs_running << " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl; - gint_k.allocate_pvdpR(); - const int nspin = PARAM.inp.nspin; if (nspin == 1 || nspin == 4) @@ -163,28 +161,13 @@ void ModuleIO::output_dHR(const int& istep, orb, cspin, 
sparse_thr, + v_eff, gint_k); } else if (nspin == 2) { for (int cspin = 0; cspin < 2; cspin++) { - // note: some MPI process will not have grids when MPI cores are too - // many, v_eff in these processes are empty - const double* vr_eff1 - = v_eff.nc * v_eff.nr > 0 ? &(v_eff(cspin, 0)) : nullptr; - - if (!PARAM.globalv.gamma_only_local) - { - if (PARAM.inp.vl_in_h) - { - Gint_inout inout(vr_eff1, - cspin, - Gint_Tools::job_type::dvlocal); - gint_k.cal_gint(&inout); - } - } - sparse_format::cal_dH(ucell, pv, HS_Arrays, @@ -193,6 +176,7 @@ void ModuleIO::output_dHR(const int& istep, orb, cspin, sparse_thr, + v_eff, gint_k); } } @@ -201,8 +185,6 @@ void ModuleIO::output_dHR(const int& istep, sparse_format::destroy_dH_R_sparse(HS_Arrays); - gint_k.destroy_pvdpR(); - ModuleBase::timer::tick("ModuleIO", "output_dHR"); return; } diff --git a/source/module_lr/esolver_lrtd_lcao.cpp b/source/module_lr/esolver_lrtd_lcao.cpp index 1e09216303..d51fbea6a7 100644 --- a/source/module_lr/esolver_lrtd_lcao.cpp +++ b/source/module_lr/esolver_lrtd_lcao.cpp @@ -241,7 +241,7 @@ LR::ESolver_LR::ESolver_LR(ModuleESolver::ESolver_KS_LCAO&& ks_sol this->nupdown = cal_nupdown_form_occ(ks_sol.pelec->wg); reset_dim_spin2(); } - +#ifdef __OLD_GINT //grid integration this->gt_ = std::move(ks_sol.GridT); @@ -255,7 +255,9 @@ LR::ESolver_LR::ESolver_LR(ModuleESolver::ESolver_KS_LCAO&& ks_sol } this->set_gint(); this->gint_->reset_DMRGint(1); - +#else + this->gint_info_ = std::move(ks_sol.gint_info_); +#endif // move pw basis if (this->pw_rho_flag) { @@ -393,6 +395,7 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu this->ucell, search_radius, PARAM.inp.test_atom_input); +#ifdef __OLD_GINT this->set_gint(); this->gint_->gridt = &this->gt_; @@ -451,7 +454,26 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu &ucell, &orb); this->gint_->initialize_pvpR(ucell, &this->gd, 1); // always use nspin=1 for transition density - +#else + gint_info_.reset( + 
new ModuleGint::GintInfo( + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbz, + this->pw_rho->nx, + this->pw_rho->ny, + this->pw_rho->nz, + 0, + 0, + this->pw_big->nbzp_start, + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbzp, + orb.Phi, + ucell, + this->gd)); + ModuleGint::Gint::set_gint_info(gint_info_.get()); +#endif // if EXX from scratch, init 2-center integral and calculate Cs, Vs #ifdef __EXX if ((xc_kernel == "hf" || xc_kernel == "hse") && this->input.lr_solver != "spectrum") diff --git a/source/module_lr/esolver_lrtd_lcao.h b/source/module_lr/esolver_lrtd_lcao.h index e6335ab69c..17e2a64a55 100644 --- a/source/module_lr/esolver_lrtd_lcao.h +++ b/source/module_lr/esolver_lrtd_lcao.h @@ -17,6 +17,7 @@ #include "source_estate/module_dm/density_matrix.h" #include "module_lr/potentials/pot_hxc_lrtd.h" #include "module_lr/hamilt_casida.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_info.h" #ifdef __EXX // #include #include "module_ri/Exx_LRI.h" @@ -93,6 +94,9 @@ namespace LR Gint_Gamma gint_g_; Gint_k gint_k_; typename TGint::type* gint_ = nullptr; + #ifndef __OLD_GINT + std::unique_ptr gint_info_ = nullptr; + #endif void set_gint(); /// @brief variables for parallel distribution of KS orbitals diff --git a/source/module_lr/lr_spectrum.cpp b/source/module_lr/lr_spectrum.cpp index d2fb10ab02..28c23270e7 100644 --- a/source/module_lr/lr_spectrum.cpp +++ b/source/module_lr/lr_spectrum.cpp @@ -6,6 +6,7 @@ #include "module_lr/utils/lr_util.h" #include "module_lr/utils/lr_util_hcontainer.h" #include "module_lr/utils/lr_util_print.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" template elecstate::DensityMatrix LR::LR_Spectrum::cal_transition_density_matrix(const int istate, const T* X_in, const bool need_R) @@ -34,6 +35,7 @@ elecstate::DensityMatrix LR::LR_Spectrum::cal_transition_density_matrix return DM_trans; } +#ifdef __OLD_GINT template void LR::LR_Spectrum::cal_gint_rho(double** rho, const int& nrxx) 
{ @@ -41,6 +43,7 @@ void LR::LR_Spectrum::cal_gint_rho(double** rho, const int& nrxx) Gint_inout inout_rho(rho, Gint_Tools::job_type::rho, 1, false); this->gint->cal_gint(&inout_rho); } +#endif inline void check_sum_rule(const double& osc_tot) { @@ -59,12 +62,16 @@ ModuleBase::Vector3 LR::LR_Spectrum::cal_transition_dipole_istat const elecstate::DensityMatrix& DM_trans = this->cal_transition_density_matrix(istate); for (int is = 0;is < this->nspin_x;++is) { - this->gint->transfer_DM2DtoGrid({ DM_trans.get_DMR_vector().at(is) }); - // 2. transition density double** rho_trans; LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, this->rho_basis.nrxx); +#ifdef __OLD_GINT + this->gint->transfer_DM2DtoGrid({ DM_trans.get_DMR_vector().at(is) }); this->cal_gint_rho(rho_trans, this->rho_basis.nrxx); +#else + ModuleBase::GlobalFunc::ZEROS(rho_trans[0], this->rho_basis.nrxx); + ModuleGint::cal_gint_rho({ DM_trans.get_DMR_vector().at(is) }, 1, rho_trans, false); +#endif // 3. transition dipole moment for (int ir = 0; ir < rho_basis.nrxx; ++ir) @@ -79,7 +86,7 @@ ModuleBase::Vector3 LR::LR_Spectrum::cal_transition_dipole_istat } LR_Util::_deallocate_2order_nested_ptr(rho_trans, 1); } - trans_dipole *= (ucell.omega / static_cast(gint->get_ncxyz())); // dv + trans_dipole *= (ucell.omega / static_cast(rho_basis.nxyz)); // dv trans_dipole *= static_cast(this->nk); // nk is divided inside DM_trans, now recover it if (this->nspin_x == 1) { trans_dipole *= sqrt(2.0); } // *2 for 2 spins, /sqrt(2) for the halfed dimension of X in the normalizaiton Parallel_Reduce::reduce_all(trans_dipole.x); @@ -108,14 +115,24 @@ ModuleBase::Vector3> LR::LR_Spectrum>: // real part LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'R'); +#ifdef __OLD_GINT this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); this->cal_gint_rho(rho_trans_real, this->rho_basis.nrxx); +#else + ModuleBase::GlobalFunc::ZEROS(rho_trans_real[0], this->rho_basis.nrxx); + 
ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_real, false); +#endif // LR_Util::print_grid_nonzero(rho_trans_real[0], this->rho_basis.nrxx, 10, "rho_trans"); // imag part LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'I'); +#ifdef __OLD_GINT this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); this->cal_gint_rho(rho_trans_imag, this->rho_basis.nrxx); +#else + ModuleBase::GlobalFunc::ZEROS(rho_trans_imag[0], this->rho_basis.nrxx); + ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_imag, false); +#endif // LR_Util::print_grid_nonzero(rho_trans_imag[0], this->rho_basis.nrxx, 10, "rho_trans"); // 3. transition dipole moment @@ -133,7 +150,7 @@ ModuleBase::Vector3> LR::LR_Spectrum>: LR_Util::_deallocate_2order_nested_ptr(rho_trans_real, 1); LR_Util::_deallocate_2order_nested_ptr(rho_trans_imag, 1); } - trans_dipole *= (ucell.omega / static_cast(gint->get_ncxyz())); // dv + trans_dipole *= (ucell.omega / static_cast(rho_basis.nxyz)); // dv trans_dipole *= static_cast(this->nk); // nk is divided inside DM_trans, now recover it if (this->nspin_x == 1) { trans_dipole *= sqrt(2.0); } // *2 for 2 spins, /sqrt(2) for the halfed dimension of X in the normalizaiton Parallel_Reduce::reduce_all(trans_dipole.x); diff --git a/source/module_lr/operator_casida/operator_lr_hxc.cpp b/source/module_lr/operator_casida/operator_lr_hxc.cpp index ebff00e5f1..8ec9fece42 100644 --- a/source/module_lr/operator_casida/operator_lr_hxc.cpp +++ b/source/module_lr/operator_casida/operator_lr_hxc.cpp @@ -10,6 +10,7 @@ #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h" #include "module_lr/ao_to_mo_transformer/ao_to_mo.h" #include "source_pw/hamilt_pwdft/global.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" inline double conj(double a) { return a; } inline std::complex conj(std::complex a) { return std::conj(a); } @@ -22,7 +23,6 @@ namespace LR 
ModuleBase::TITLE("OperatorLRHxc", "act"); const int& sl = ispin_ks[0]; const auto psil_ks = LR_Util::get_psi_spin(psi_ks, sl, nk); - const int& lgd = gint->gridt->lgd; this->DM_trans->cal_DMR(); //DM_trans->get_DMR_vector() is 2d-block parallized // LR_Util::print_DMR(*DM_trans, ucell.nat, "DMR"); @@ -55,7 +55,6 @@ namespace LR { ModuleBase::TITLE("OperatorLRHxc", "grid_calculation(real)"); ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation"); - this->gint->transfer_DM2DtoGrid(this->DM_trans->get_DMR_vector()); // 2d block to grid // 2. transition electron density // \f[ \tilde{\rho}(r)=\sum_{\mu_j, \mu_b}\tilde{\rho}_{\mu_j,\mu_b}\phi_{\mu_b}(r)\phi_{\mu_j}(r) \f] @@ -63,20 +62,28 @@ namespace LR const int& nrxx = this->pot.lock()->nrxx; LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // currently gint_kernel_rho uses PARAM.inp.nspin, it needs refactor ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx); +#ifdef __OLD_GINT + this->gint->transfer_DM2DtoGrid(this->DM_trans->get_DMR_vector()); // 2d block to grid Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); this->gint->cal_gint(&inout_rho); - +#else + ModuleGint::cal_gint_rho(this->DM_trans->get_DMR_vector(), 1, rho_trans, false); +#endif // 3. v_hxc = f_hxc * rho_trans ModuleBase::matrix vr_hxc(1, nrxx); //grid this->pot.lock()->cal_v_eff(rho_trans, ucell, vr_hxc, ispin_ks); LR_Util::_deallocate_2order_nested_ptr(rho_trans, 1); // 4. 
V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r) + this->hR->set_zero(); // clear hR for each bands +#ifdef __OLD_GINT Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal); this->gint->get_hRGint()->set_zero(); this->gint->cal_gint(&inout_vlocal); - this->hR->set_zero(); // clear hR for each bands this->gint->transfer_pvpR(&*this->hR, &ucell); //grid to 2d block +#else + ModuleGint::cal_gint_vl(vr_hxc.c, &*this->hR); +#endif ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation"); } @@ -96,8 +103,6 @@ namespace LR LR_Util::get_DMR_real_imag_part(*this->DM_trans, DM_trans_real_imag, ucell.nat, type); // if (this->first_print)LR_Util::print_DMR(DM_trans_real_imag, ucell.nat, "DMR(2d, real)"); - this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); - // LR_Util::print_HR(*this->gint->get_DMRGint()[0], this->ucell.nat, "DMR(grid, real)"); // 2. transition electron density double** rho_trans; @@ -105,8 +110,14 @@ namespace LR LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // nspin=1 for transition density ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx); +#ifdef __OLD_GINT + this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); + // LR_Util::print_HR(*this->gint->get_DMRGint()[0], this->ucell.nat, "DMR(grid, real)"); Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); this->gint->cal_gint(&inout_rho); +#else + ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans, false); +#endif // print_grid_nonzero(rho_trans[0], nrxx, 10, "rho_trans"); // 3. v_hxc = f_hxc * rho_trans @@ -117,13 +128,16 @@ namespace LR LR_Util::_deallocate_2order_nested_ptr(rho_trans, 1); // 4. 
V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r) + HR_real_imag.set_zero(); +#ifdef __OLD_GINT Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal); this->gint->get_hRGint()->set_zero(); this->gint->cal_gint(&inout_vlocal); - // LR_Util::print_HR(*this->gint->get_hRGint(), this->ucell.nat, "VR(grid)"); - HR_real_imag.set_zero(); this->gint->transfer_pvpR(&HR_real_imag, &ucell, &this->gd); +#else + ModuleGint::cal_gint_vl(vr_hxc.c, &HR_real_imag); +#endif // LR_Util::print_HR(HR_real_imag, this->ucell.nat, "VR(real, 2d)"); LR_Util::set_HR_real_imag_part(HR_real_imag, *this->hR, ucell.nat, type); }; diff --git a/source/module_rdmft/rdmft_tools.cpp b/source/module_rdmft/rdmft_tools.cpp index bd45a49631..bff3229fa0 100644 --- a/source/module_rdmft/rdmft_tools.cpp +++ b/source/module_rdmft/rdmft_tools.cpp @@ -12,6 +12,7 @@ #include "source_estate/module_pot/pot_local.h" #include "source_estate/module_pot/pot_xc.h" #include "source_pw/hamilt_pwdft/structure_factor.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" #include #include @@ -240,8 +241,9 @@ void Veff_rdmft::initialize_HR(const UnitCell* ucell_in, const Grid_Driv // this part of the code is copying from class Veff and do some modifications. 
-template -void Veff_rdmft::contributeHR() +// nspin == 1 or 2 case +template<> +void Veff_rdmft, double>::contributeHR() { ModuleBase::TITLE("Veff", "contributeHR"); ModuleBase::timer::tick("Veff", "contributeHR"); @@ -261,8 +263,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_hartree(is, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else if( potential_ == "local" ) @@ -276,8 +282,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_local(0, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } else if( potential_ == "xc" ) { @@ -296,8 +306,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_XC(is, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else @@ -307,7 +321,9 @@ void Veff_rdmft::contributeHR() // get HR for 2D-block parallel format // this->GK->transfer_pvpR(this->hR); +#ifdef __OLD_GINT this->GK->transfer_pvpR(this->hR,this->ucell,this->gd); +#endif if(this->nspin == 2) { @@ -318,6 +334,12 @@ void Veff_rdmft::contributeHR() return; } +template<> +void Veff_rdmft, std::complex>::contributeHR() +{ + // nspin = 4 case not implemented currently. +} + // this part of the code is copying from class Veff and do some modifications. 
// special case of gamma-only template<> @@ -343,8 +365,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_hartree(is, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GG->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else if( potential_ == "local" ) @@ -358,12 +384,16 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_local(0, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal); // because in gamma_only, cal_gint would not set hRGint zero first // so must use cal_vlocal(), and in rdmft_test.h, calculate V_hartree->contributeHR() first this->GG->cal_vlocal(&inout, false); // cal_gint ??? +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } else if( potential_ == "xc" ) { @@ -381,8 +411,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_XC(is, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GG->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else @@ -390,9 +424,10 @@ void Veff_rdmft::contributeHR() std::cout << "\n\n!!!!!!\n there may be something wrong when use class Veff_rdmft\n\n!!!!!!\n"; } +#ifdef __OLD_GINT // get HR for 2D-block parallel format this->GG->transfer_pvpR(this->hR,this->ucell); - +#endif this->new_e_iteration = false; if(this->nspin == 2) diff --git a/source/module_rdmft/rdmft_tools.h b/source/module_rdmft/rdmft_tools.h index ac3db2744d..e8a5c52e5f 100644 --- a/source/module_rdmft/rdmft_tools.h +++ b/source/module_rdmft/rdmft_tools.h @@ -284,8 +284,9 @@ class Veff_rdmft : public hamilt::OperatorLCAO this->cal_type = hamilt::calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); - +#ifdef __OLD_GINT GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin); 
+#endif } Veff_rdmft(Gint_Gamma* GG_in, hamilt::HS_Matrix_K* hsk_in, @@ -310,8 +311,9 @@ class Veff_rdmft : public hamilt::OperatorLCAO this->cal_type = hamilt::calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); - +#ifdef __OLD_GINT GG_in->initialize_pvpR(*ucell_in, GridD_in, nspin); +#endif } ~Veff_rdmft(){}; diff --git a/source/module_rdmft/update_state_rdmft.cpp b/source/module_rdmft/update_state_rdmft.cpp index 0cdffed794..183bfba6a1 100644 --- a/source/module_rdmft/update_state_rdmft.cpp +++ b/source/module_rdmft/update_state_rdmft.cpp @@ -8,6 +8,7 @@ #include "source_estate/module_dm/cal_dm_psi.h" #include "source_estate/module_dm/density_matrix.h" #include "source_estate/module_charge/symmetry_rho.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" namespace rdmft @@ -105,10 +106,13 @@ void RDMFT::update_charge(UnitCell& ucell) { ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx); } - +#ifdef __OLD_GINT GG->transfer_DM2DtoGrid(DM_gamma_only.get_DMR_vector()); Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin); GG->cal_gint(&inout); +#else + ModuleGint::cal_gint_rho(DM_gamma_only.get_DMR_vector(), nspin, charge->rho); +#endif if (XC_Functional::get_ked_flag()) { @@ -136,9 +140,13 @@ void RDMFT::update_charge(UnitCell& ucell) ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx); } +#ifdef __OLD_GINT GK->transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin); GK->cal_gint(&inout); +#else + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, charge->rho); +#endif if (XC_Functional::get_ked_flag()) { diff --git a/source/source_base/intarray.cpp b/source/source_base/intarray.cpp index a2e3dcce4d..10c3b7f39d 100644 --- a/source/source_base/intarray.cpp +++ b/source/source_base/intarray.cpp @@ -6,8 +6,6 @@ namespace ModuleBase { -int IntArray::arrayCount = 0; - void IntArrayAlloc() { std::cout << "\n Allocation error for IntArray " << std::endl; 
@@ -23,7 +21,6 @@ IntArray::IntArray(const int d1,const int d2) size = bound1 * bound2; ptr = new int[size];zero_out(); assert( ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3) @@ -37,7 +34,6 @@ IntArray::IntArray(const int d1,const int d2,const int d3) size = bound1 * bound2 * bound3 ; //* sizeof(float); ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3,const int d4) @@ -52,7 +48,6 @@ IntArray::IntArray(const int d1,const int d2,const int d3,const int d4) size = bound1 * bound2 * bound3 * bound4 ; //* sizeof(float); ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3, @@ -68,7 +63,6 @@ IntArray::IntArray(const int d1,const int d2,const int d3, size = bound1 * bound2 * bound3 * bound4 * bound5; ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3, @@ -85,7 +79,6 @@ IntArray::IntArray(const int d1,const int d2,const int d3, size = bound1 * bound2 * bound3 * bound4 * bound5 * bound6; ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } //******************************** diff --git a/source/source_base/intarray.h b/source/source_base/intarray.h index 96996b5b22..9147dc184e 100644 --- a/source/source_base/intarray.h +++ b/source/source_base/intarray.h @@ -48,17 +48,30 @@ class IntArray void create(const int d1, const int d2, const int d3, const int d4, const int d5, const int d6); /** - * @brief Equal an IntArray to another one + * @brief copy assignment * * @param right * @return const IntArray& */ - const IntArray &operator=(const IntArray &right) - { - assert( this->size == right.size ); - for (int i = 0;i < size;i++) ptr[i] = right.ptr[i]; - return *this;// enables x = y = z; - }; + IntArray &operator=(const IntArray &other) + { + if(this != &other) + { + delete[] 
ptr; + size = other.size; + dim = other.dim; + bound1 = other.bound1; + bound2 = other.bound2; + bound3 = other.bound3; + bound4 = other.bound4; + bound5 = other.bound5; + bound6 = other.bound6; + ptr = new int[size]; + for (int i = 0;i < size;i++) + { ptr[i] = other.ptr[i]; } + } + return *this; + } /** * @brief Equal all elements of an IntArray to an @@ -71,7 +84,7 @@ class IntArray { for (int i = 0;i < size;i++) ptr[i] = right; return *this;// enables x = y = z; - }; + } /** * @brief Access elements by using operator "()" @@ -85,14 +98,14 @@ class IntArray assert( d1 < bound1 ); assert( d2 < bound2 ); return ptr[ d1 * bound2 + d2 ]; - }; + } int &operator()(const int d1, const int d2, const int d3) { assert( d1 < bound1 ); assert( d2 < bound2 ); assert( d3 < bound3 ); return ptr[ (d1 * bound2 + d2) * bound3 + d3 ]; - }; + } int &operator()(const int d1, const int d2, const int d3, const int d4) { assert( d1 < bound1 ); @@ -100,7 +113,7 @@ class IntArray assert( d3 < bound3 ); assert( d4 < bound4 ); return ptr[ ((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4 ]; - }; + } int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5) { assert( d1 < bound1 ); @@ -109,7 +122,7 @@ class IntArray assert( d4 < bound4 ); assert( d5 < bound5 ); return ptr[ (((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5 ]; - }; + } int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5, const int d6) { assert( d1 < bound1 ); @@ -119,7 +132,7 @@ class IntArray assert( d5 < bound5 ); assert( d6 < bound6 ); return ptr[ ((((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5) * bound6 + d6 ]; - }; + } /** * @brief Access elements by using "()" through pointer @@ -134,14 +147,14 @@ class IntArray assert( d1 < bound1 ); assert( d2 < bound2 ); return ptr[ d1 * bound2 + d2 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3) const { assert( d1 < bound1 ); assert( d2 < bound2 ); assert( d3 < 
bound3 ); return ptr[ (d1 * bound2 + d2) * bound3 + d3 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3, const int d4) const { assert( d1 < bound1 ); @@ -149,7 +162,7 @@ class IntArray assert( d3 < bound3 ); assert( d4 < bound4 ); return ptr[ ((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5) const { assert( d1 < bound1 ); @@ -158,7 +171,7 @@ class IntArray assert( d4 < bound4 ); assert( d5 < bound5 ); return ptr[ (((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5, const int d6) const { assert( d1 < bound1 ); @@ -168,7 +181,7 @@ class IntArray assert( d5 < bound5 ); assert( d6 < bound6 ); return ptr[ ((((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5) * bound6 + d6 ]; - }; + } /** * @brief Set all elements of an IntArray to zero @@ -209,16 +222,6 @@ class IntArray return bound6; } - /** - * @brief Get the Array Count object - * - * @return int - */ - static int getArrayCount(void) - { - return arrayCount; - } - private: int size=0; int dim=0; @@ -228,7 +231,6 @@ class IntArray int bound4=0; int bound5=0; int bound6=0; - static int arrayCount; void freemem(); }; } // namespace ModuleBase diff --git a/source/source_base/test/intarray_test.cpp b/source/source_base/test/intarray_test.cpp index 6ccfb24452..7372b4e115 100644 --- a/source/source_base/test/intarray_test.cpp +++ b/source/source_base/test/intarray_test.cpp @@ -12,8 +12,6 @@ * - construct an int array (2 to 6 dimensions) * - Creat * - create an int array (2 to 6 dimensions) - * - GetArrayCount - * - get the total number of int array created * - GetSize * - get the total size of an int array * - GetDim @@ -51,14 +49,6 @@ class IntArrayTest : public testing::Test const int zero = 0; }; -TEST_F(IntArrayTest,GetArrayCount) -{ - count0 = 
ModuleBase::IntArray::getArrayCount(); - ModuleBase::IntArray c3, c4; - count1 = ModuleBase::IntArray::getArrayCount(); - EXPECT_EQ((count1-count0),2); -} - TEST_F(IntArrayTest,Construct) { ModuleBase::IntArray x2(1,5); diff --git a/source/source_basis/module_ao/ORB_atomic.cpp b/source/source_basis/module_ao/ORB_atomic.cpp index 9b40d923ef..99f5953bda 100644 --- a/source/source_basis/module_ao/ORB_atomic.cpp +++ b/source/source_basis/module_ao/ORB_atomic.cpp @@ -7,8 +7,6 @@ Numerical_Orbital::Numerical_Orbital() { // make std::pair of new and delete // question remains - this->nchi = nullptr; - this->phiLN = new Numerical_Orbital_Lm[1]; this->rcut = 0.0; this->max_nchi = 0; this->type = 0; @@ -16,8 +14,6 @@ Numerical_Orbital::Numerical_Orbital() Numerical_Orbital::~Numerical_Orbital() { - delete[] nchi; - delete[] phiLN; } void Numerical_Orbital::set_orbital_info(const int& type_in, @@ -34,8 +30,7 @@ void Numerical_Orbital::set_orbital_info(const int& type_in, this->lmax = lmax_in; // (2) set nchi and total nchi. 
- delete[] this->nchi; - this->nchi = new int[this->lmax + 1]; + this->nchi.resize(this->lmax + 1); for (int i = 0; i < this->lmax + 1; i++) { this->nchi[i] = nchi_in[i]; diff --git a/source/source_basis/module_ao/ORB_atomic.h b/source/source_basis/module_ao/ORB_atomic.h index 71212f8b28..e71c0958d3 100644 --- a/source/source_basis/module_ao/ORB_atomic.h +++ b/source/source_basis/module_ao/ORB_atomic.h @@ -66,7 +66,6 @@ class Numerical_Orbital const inline Numerical_Orbital_Lm& PhiLN( const int &L, const int &N)const { - assert(this->phiLN != nullptr); return this->phiLN[ this->find_chi(L, N) ]; } @@ -98,7 +97,7 @@ class Numerical_Orbital NOAR.set_position(R1_in, R2_in); } - Numerical_Orbital_Lm*& chi() { return this->phiLN; } + std::vector& chi() { return this->phiLN; } private: @@ -115,13 +114,13 @@ class Numerical_Orbital int type; int lmax; - int* nchi; + std::vector nchi; int total_nchi; int max_nchi; ModuleBase::IntArray find_chi; double rcut; - Numerical_Orbital_Lm* phiLN;// length: total_nchi (only store radial function ) + std::vector phiLN;// length: total_nchi (only store radial function ) //========================================================== // Keep the old interface diff --git a/source/source_basis/module_ao/ORB_read.cpp b/source/source_basis/module_ao/ORB_read.cpp index 36d5d55f35..8cb0e4075f 100644 --- a/source/source_basis/module_ao/ORB_read.cpp +++ b/source/source_basis/module_ao/ORB_read.cpp @@ -419,8 +419,7 @@ void LCAO_Orbitals::read_orb_file(std::ofstream& ofs_in, // GlobalV::ofs_running } // OUT(GlobalV::ofs_running,"Total number of chi(l,n)",total_nchi); - delete[] ao[it].phiLN; - ao[it].phiLN = new Numerical_Orbital_Lm[total_nchi]; + ao[it].phiLN.resize(total_nchi); int meshr = 0; // number of mesh points int meshr_read = 0; diff --git a/source/source_basis/module_nao/radial_set.cpp b/source/source_basis/module_nao/radial_set.cpp index 9c83590926..2570e99806 100644 --- a/source/source_basis/module_nao/radial_set.cpp +++ 
b/source/source_basis/module_nao/radial_set.cpp @@ -87,9 +87,8 @@ RadialSet& RadialSet::operator=(const RadialSet& rhs) void RadialSet::to_numerical_orbital(Numerical_Orbital& no, const int nk_legacy, const double lcao_dk) const { - delete[] no.chi(); - - no.chi() = new Numerical_Orbital_Lm[nchi_]; + no.chi().clear(); + no.chi().resize(nchi_); for (int i = 0; i < nchi_; i++) { chi_[i].to_numerical_orbital_lm(no.chi()[i], nk_legacy, lcao_dk); diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp index 483ae23d5a..a45f12a127 100644 --- a/source/source_esolver/esolver_ks_lcao.cpp +++ b/source/source_esolver/esolver_ks_lcao.cpp @@ -61,6 +61,8 @@ // test RDMFT #include "module_rdmft/rdmft.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_info.h" + #include namespace ModuleESolver @@ -830,7 +832,6 @@ void ESolver_KS_LCAO::iter_finish(UnitCell& ucell, const int istep, int& this->p_chgmix->mix_dmr(dm); } } - // 6) save charge density // Peize Lin add 2020.04.04 if (GlobalC::restart.info_save.save_charge) diff --git a/source/source_esolver/esolver_ks_lcao.h b/source/source_esolver/esolver_ks_lcao.h index 43a180104f..59e427f013 100644 --- a/source/source_esolver/esolver_ks_lcao.h +++ b/source/source_esolver/esolver_ks_lcao.h @@ -95,6 +95,11 @@ class ESolver_KS_LCAO : public ESolver_KS //! Grid integration: used to store some basic information Grid_Technique GridT; +#ifndef __OLD_GINT + //! GintInfo: used to store some basic infomation about module_gint + std::unique_ptr gint_info_; +#endif + //! NAO orbitals: two-center integrations TwoCenterBundle two_center_bundle_; diff --git a/source/source_esolver/lcao_before_scf.cpp b/source/source_esolver/lcao_before_scf.cpp index 0e7142d4a9..7bcd63a5f1 100644 --- a/source/source_esolver/lcao_before_scf.cpp +++ b/source/source_esolver/lcao_before_scf.cpp @@ -60,6 +60,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) PARAM.inp.test_atom_input); //! 
4) initialize NAO basis set +#ifdef __OLD_GINT double dr_uniform = 0.001; std::vector rcuts; std::vector> psi_u; @@ -92,10 +93,19 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) dpsi_u, d2psi_u, PARAM.inp.nstream); + + psi_u.clear(); + psi_u.shrink_to_fit(); + dpsi_u.clear(); + dpsi_u.shrink_to_fit(); + d2psi_u.clear(); + d2psi_u.shrink_to_fit(); + LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); //! 6) prepare grid integral -#ifdef __NEW_GINT - auto gint_info = std::make_shared( +#else + gint_info_.reset( + new ModuleGint::GintInfo( this->pw_big->nbx, this->pw_big->nby, this->pw_big->nbz, @@ -110,26 +120,16 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) this->pw_big->nbzp, orb_.Phi, ucell, - this->gd); - ModuleGint::Gint::init_gint_info(gint_info); + this->gd)); + ModuleGint::Gint::set_gint_info(gint_info_.get()); #endif - psi_u.clear(); - psi_u.shrink_to_fit(); - dpsi_u.clear(); - dpsi_u.shrink_to_fit(); - d2psi_u.clear(); - d2psi_u.shrink_to_fit(); - // 7) For each atom, calculate the adjacent atoms in different cells // and allocate the space for H(R) and S(R). // If k point is used here, allocate HlocR after atom_arrange. 
this->RA.for_2d(ucell, this->gd, this->pv, PARAM.globalv.gamma_only_local, orb_.cutoffs()); - // 8) after ions move, prepare grid in Gint - LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); - - // 9) initialize the Hamiltonian operators + // 8) initialize the Hamiltonian operators // if atom moves, then delete old pointer and add a new one if (this->p_hamilt != nullptr) { @@ -169,7 +169,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) #ifdef __MLALGO - // 10) for each ionic step, the overlap must be rebuilt + // 9) for each ionic step, the overlap must be rebuilt // since it depends on ionic positions if (PARAM.globalv.deepks_setorb) { @@ -198,7 +198,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) } #endif - // 11) prepare sc calculation + // 10) prepare sc calculation if (PARAM.inp.sc_mag_switch) { spinconstrain::SpinConstrain& sc = spinconstrain::SpinConstrain::getScInstance(); @@ -217,7 +217,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) this->pelec); } - // 12) set xc type before the first cal of xc in pelec->init_scf + // 11) set xc type before the first cal of xc in pelec->init_scf // Peize Lin add 2016-12-03 #ifdef __EXX if (PARAM.inp.calculation != "nscf") @@ -233,10 +233,10 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) } #endif - // 13) init_scf, should be before_scf? mohan add 2025-03-10 + // 12) init_scf, should be before_scf? mohan add 2025-03-10 this->pelec->init_scf(istep, ucell, this->Pgrid, this->sf.strucFac, this->locpp.numeric, ucell.symm); - // 14) initalize DMR + // 13) initalize DMR // DMR should be same size with Hamiltonian(R) dynamic_cast*>(this->pelec) ->get_DM() @@ -247,7 +247,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) this->ld.init_DMR(ucell, orb_, this->pv, this->gd); #endif - // 15) two cases are considered: + // 14) two cases are considered: // 1. 
DMK in DensityMatrix is not empty (istep > 0), then DMR is initialized by DMK // 2. DMK in DensityMatrix is empty (istep == 0), then DMR is initialized by zeros if (istep > 0) @@ -255,7 +255,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) dynamic_cast*>(this->pelec)->get_DM()->cal_DMR(); } - // 16) the electron charge density should be symmetrized, + // 15) the electron charge density should be symmetrized, // here is the initialization Symmetry_rho srho; for (int is = 0; is < PARAM.inp.nspin; is++) @@ -263,10 +263,10 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) srho.begin(is, this->chr, this->pw_rho, ucell.symm); } - // 17) why we need to set this sentence? mohan add 2025-03-10 + // 16) why we need to set this sentence? mohan add 2025-03-10 this->p_hamilt->non_first_scf = istep; - // 18) update of RDMFT, added by jghan + // 17) update of RDMFT, added by jghan if (PARAM.inp.rdmft == true) { // necessary operation of these parameters have be done with p_esolver->Init() in source/source_main/driver_run.cpp diff --git a/source/source_esolver/lcao_others.cpp b/source/source_esolver/lcao_others.cpp index 702c0ab49b..6d9c3a98a0 100644 --- a/source/source_esolver/lcao_others.cpp +++ b/source/source_esolver/lcao_others.cpp @@ -91,6 +91,7 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) PARAM.inp.test_atom_input); // (3) Periodic condition search for each grid. 
+#ifdef __OLD_GINT double dr_uniform = 0.001; std::vector rcuts; std::vector> psi_u; @@ -98,7 +99,6 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) std::vector> d2psi_u; Gint_Tools::init_orb(dr_uniform, rcuts, ucell, orb_, psi_u, dpsi_u, d2psi_u); - this->GridT.set_pbc_grid(this->pw_rho->nx, this->pw_rho->ny, this->pw_rho->nz, @@ -122,12 +122,35 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) dpsi_u, d2psi_u, PARAM.inp.nstream); + psi_u.clear(); psi_u.shrink_to_fit(); dpsi_u.clear(); dpsi_u.shrink_to_fit(); d2psi_u.clear(); d2psi_u.shrink_to_fit(); + // prepare grid in Gint + LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); +#else + gint_info_.reset( + new ModuleGint::GintInfo( + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbz, + this->pw_rho->nx, + this->pw_rho->ny, + this->pw_rho->nz, + 0, + 0, + this->pw_big->nbzp_start, + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbzp, + orb_.Phi, + ucell, + this->gd)); + ModuleGint::Gint::set_gint_info(gint_info_.get()); +#endif // (2)For each atom, calculate the adjacent atoms in different cells // and allocate the space for H(R) and S(R). 
@@ -184,9 +207,6 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) } } - // prepare grid in Gint - LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); - // init Hamiltonian if (this->p_hamilt != nullptr) { diff --git a/source/source_estate/elecstate_lcao.cpp b/source/source_estate/elecstate_lcao.cpp index f18e87ac51..38e854a4eb 100644 --- a/source/source_estate/elecstate_lcao.cpp +++ b/source/source_estate/elecstate_lcao.cpp @@ -34,7 +34,7 @@ void ElecStateLCAO>::psiToRho(const psi::Psigint_k->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer DM2D to DM_grid in gint Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); this->gint_k->cal_gint(&inout); @@ -71,7 +71,7 @@ void ElecStateLCAO::psiToRho(const psi::Psi& psi) //------------------------------------------------------------ ModuleBase::GlobalFunc::NOTE("Calculate the charge on real space grid!"); -#ifndef __NEW_GINT +#ifdef __OLD_GINT this->gint_gamma->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer DM2D to DM_grid in gint Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); this->gint_gamma->cal_gint(&inout); @@ -139,7 +139,7 @@ void ElecStateLCAO::dmToRho(std::vector pexsi_DM, std::vectorgint_gamma->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer DM2D to DM_grid in gint Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); this->gint_gamma->cal_gint(&inout); @@ -152,7 +152,7 @@ void ElecStateLCAO::dmToRho(std::vector pexsi_DM, std::vectorcharge->kin_r[0], this->charge->nrxx); } -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau); this->gint_gamma->cal_gint(&inout1); #else diff --git a/source/source_estate/elecstate_lcao_cal_tau.cpp b/source/source_estate/elecstate_lcao_cal_tau.cpp index d07aeba678..2b611f4c17 100644 --- a/source/source_estate/elecstate_lcao_cal_tau.cpp +++ 
b/source/source_estate/elecstate_lcao_cal_tau.cpp @@ -16,7 +16,7 @@ void ElecStateLCAO>::cal_tau(const psi::Psicharge->kin_r[is], this->charge->nrxx); } -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau, PARAM.inp.nspin); this->gint_k->cal_gint(&inout1); #else @@ -36,7 +36,7 @@ void ElecStateLCAO::cal_tau(const psi::Psi& psi) { ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[is], this->charge->nrxx); } -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau, PARAM.inp.nspin); this->gint_gamma->cal_gint(&inout1); #else diff --git a/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref b/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref index 4fb5aa8716..540c8789a6 100644 --- a/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref +++ b/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref @@ -1,8 +1,8 @@ etotref -74.3929556166736603 etotperatomref -14.8785911233 -totalforceref 778.174241 -totalstressref 1272.711589 +totalforceref 1495.625575 +totalstressref 574.321174 totaldosref 12 deepks_desc 8.045214 -deepks_dm_eig 29.53046025202608 +deepks_dm_eig 29.530460252025964 totaltimeref 1.12 diff --git a/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref b/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref index 8d613e5354..123d104b33 100644 --- a/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref +++ b/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref @@ -1,7 +1,7 @@ etotref -469.5735907784966230 etotperatomref -156.5245302595 -totalforceref 330.972666 -totalstressref 24771.556634 +totalforceref 10.194156 +totalstressref 510.485544 totaldosref 28 deepks_desc 2.126589 deepks_dm_eig 10.532812121143177