From a8740c12c9ba463d423ac03b549a46e69c4c870e Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Wed, 16 Apr 2025 15:32:57 +0800 Subject: [PATCH 01/63] version 0.0 --- .../module_gint/kernels/cuda/cuda_tools.cu | 1 + .../module_gint/temp_gint/batch_biggrid.cpp | 10 ++ .../module_gint/temp_gint/batch_biggrid.h | 44 +++++++ .../module_gint/temp_gint/big_grid.cpp | 14 ++- .../module_gint/temp_gint/big_grid.h | 9 +- .../module_gint/temp_gint/gint.h | 2 +- .../module_gint/temp_gint/gint_atom.h | 2 - .../module_gint/temp_gint/gint_info.cpp | 20 +++ .../module_gint/temp_gint/gint_info.h | 16 ++- .../temp_gint/kernel/gint_gpu_vars.cpp | 117 ++++++++++++++++++ .../temp_gint/kernel/gint_gpu_vars.h | 40 ++++++ .../temp_gint/kernel/phi_operator_gpu.cpp | 19 +++ .../temp_gint/kernel/phi_operator_gpu.h | 48 +++++++ .../temp_gint/kernel/set_const_mem.cu | 11 ++ .../temp_gint/kernel/set_const_mem.cuh | 9 ++ .../module_gint/temp_gint/phi_operator.cpp | 2 +- .../module_gint/temp_gint/phi_operator.h | 4 +- source/source_esolver/lcao_before_scf.cpp | 13 +- 18 files changed, 366 insertions(+), 15 deletions(-) create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu index 32dfe42b24..09bab1586e 
100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu @@ -305,6 +305,7 @@ inline T* Cuda_Mem_Wrapper::get_host_pointer(const int stream_id) return this->host_pointer + stream_id * this->one_stream_size_aligned; } template class Cuda_Mem_Wrapper; +template class Cuda_Mem_Wrapper; template class Cuda_Mem_Wrapper; template class Cuda_Mem_Wrapper; template class Cuda_Mem_Wrapper; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp new file mode 100644 index 0000000000..080a06d174 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp @@ -0,0 +1,10 @@ +#include "batch_biggrid.h" + +namespace ModuleGint +{ + +int BatchBigGrid::max_batch_size_ = 0; +int BatchBigGrid::max_atoms_num_ = 0; +int BatchBigGrid::max_phi_len_ = 0; + +} diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h new file mode 100644 index 0000000000..0528f7a8ef --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h @@ -0,0 +1,44 @@ +#pragma once +#include +#include +#include "big_grid.h" + +namespace ModuleGint +{ + +class BatchBigGrid +{ + public: + BatchBigGrid(std::vector> biggrids) + { + biggrids_ = biggrids; + max_batch_size_ = std::max(max_batch_size_, biggrids_.size()); + int atoms_num = 0; + int phi_len = 0; + for(const auto& biggrid : biggrids_) + { + atoms_num += biggrid->get_atoms_num(); + phi_len += biggrid->get_phi_len(); + } + max_atoms_num_ = std::max(max_atoms_num_, atoms_num); + max_phi_len_ = std::max(max_phi_len_, phi_len); + }; + + const std::vector>& get_bgrids() { return biggrids_; } + + static int get_max_batch_size() { return max_batch_size_; } + static int get_max_atoms_num() { return max_atoms_num_; } + static int get_max_phi_len() { return 
max_phi_len_; } + + private: + std::vector> biggrids_; + + // the max number of biggrids of a batch_biggrid + static int max_batch_size_; + // the max number of total atoms of a batch_biggrid + static int max_atoms_num_; + // the max number of total wavefunctions of a batch_biggrid + static int max_phi_len_; +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp index d972cd90bb..af082f3b9e 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp @@ -13,7 +13,7 @@ void BigGrid::add_atom(const GintAtom* atom) atoms_.push_back(atom); } -int BigGrid::get_mgrid_phi_len() const +int BigGrid::get_phi_len() const { int len = 0; for(const auto& atom : atoms_) @@ -73,6 +73,11 @@ void BigGrid::set_mgrids_local_idx(std::vector& mgrids_idx) const } } +void BigGrid::set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const +{ + set_atom_relative_coords(atom->get_bgrid_idx(), atom->get_tau_in_bgrid(), atom_coord); +} + void BigGrid::set_atom_relative_coords(const Vec3i bgrid_idx, const Vec3d tau_in_bgrid, std::vector& atom_coord) const { Vec3i this_bgrid_idx = localcell_info_->get_bgrid_global_idx_3D(idx_); @@ -89,12 +94,13 @@ void BigGrid::set_atom_relative_coords(const Vec3i bgrid_idx, const Vec3d tau_in } } - -void BigGrid::set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const +Vec3d BigGrid::get_bgrid_atom_rcoord(const GintAtom* atom) const { - set_atom_relative_coords(atom->get_bgrid_idx(), atom->get_tau_in_bgrid(), atom_coord); + Vec3i this_bgrid_idx = localcell_info_->get_bgrid_global_idx_3D(idx_); + return unitcell_info_->get_relative_coord(bgrid_idx, this_bgrid_idx) + tau_in_bgrid;; } + bool BigGrid::is_atom_on_bgrid(const GintAtom* atom) const { std::vector coords; diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h index c1d5596e13..5c1b5c2c2c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h @@ -40,7 +40,7 @@ class BigGrid // get the total number of phi of a meshgrid // return: (\sum_{i=0}^{atoms_->size()} atoms_[i]->nw) - int get_mgrid_phi_len() const; + int get_phi_len() const; // set the start index of the phi of each atom // return: vector[i] = \sum_{j=0}^{i-1} atoms_[j]->nw @@ -55,6 +55,9 @@ class BigGrid // set the 1D index of the meshgrids in the local cell void set_mgrids_local_idx(std::vector& mgrids_idx) const; + // a wrapper function to get the relative coordinates of the atom and the meshgrids + void set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const; + /** * @brief Set the coordinates of the meshgrids of the big grid relative to an atom * @@ -64,8 +67,8 @@ class BigGrid */ void set_atom_relative_coords(const Vec3i bgrid_idx, const Vec3d tau_in_bgrid, std::vector& atom_coord) const; - // a wrapper function to get the relative coordinates of the atom and the meshgrids - void set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const; + // get the relative coords of the atom and the biggrid (used in gpu code) + Vec3d get_bgrid_atom_rcoord(const GintAtom* atom) const; // if the atom affects the big grid, return true, otherwise false // note when we say an atom affects a big grid, it does not mean that the atom affects all the meshgrid on the big grid, diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint.h index a14f014a6c..2d8a1c1cba 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint.h @@ -16,7 +16,7 @@ class Gint // note that gint_info_ is a static member variable // it 
is shared by all instances of Gint - static void init_gint_info(std::shared_ptr gint_info) + static void set_gint_info(std::shared_ptr gint_info) { gint_info_ = gint_info; } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h index b1da5d586a..b54f1feedd 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h @@ -109,8 +109,6 @@ class GintAtom Vec3d tau_in_biggrid_; // the numerical orbitals of this atom - // In fact, I think the Numerical_Orbital class - // should be a member of the Atom class, not the GintAtom class const Numerical_Orbital* orb_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp index 11f2a5d59e..a32ef1a0d3 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp @@ -47,6 +47,10 @@ GintInfo::GintInfo( // initialize the ijr_info // this step needs to be done after init_atoms_, because it requires the information of is_atom_on_bgrid init_ijr_info_(ucell, gd); + + #ifdef __CUDA + init_bgrid_batches(nbz_local); + #endif } template @@ -207,6 +211,22 @@ void GintInfo::init_ijr_info_(const UnitCell& ucell, Grid_Driver& gd) return; } +#ifdef __CUDA +void GintInfo::init_bgrid_batches_(int batch_size) +{ + for (int i = 0; i < biggrids_.size(); i+ = batch_size) + { + std::vector> bgrid_vec; + for(int j = i; j < i + batch_size && j < biggrids_.size(); j++) + { + bgrid_vec.push_back(biggrids_[j]); + } + auto bgrid_batch = std::make_shared(bgrid_vec); + bgrid_batches_.push_back(bgrid_batch); + } +} +#endif + template std::shared_ptr> GintInfo::get_hr(int npol) const; template std::shared_ptr>> GintInfo::get_hr>(int npol) const; } \ No newline at end of file diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h index c234ec165c..8e09e08e6f 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h @@ -13,6 +13,10 @@ #include "localcell_info.h" #include "divide_info.h" +#ifdef __CUDA +#include "batch_biggrid.h" +#endif + namespace ModuleGint { @@ -29,7 +33,8 @@ class GintInfo const UnitCell& ucell, Grid_Driver& gd); // getter functions - std::vector> get_biggrids() const { return biggrids_; }; + std::vector>& get_biggrids() const { return biggrids_; }; + std::shared_ptr get_bgrid_info() const { return biggrid_info_; }; double get_local_mgrid_num() const { return localcell_info_->get_mgrids_num(); }; double get_mgrid_volume() const { return meshgrid_info_->get_volume(); }; @@ -77,6 +82,15 @@ class GintInfo // format for storing atomic pair information in hcontainer, used for initializing hcontainer std::vector ijr_info_; + + #ifdef __CUDA + public: + std::vector>& get_bgrid_batches() const { return bgrid_batches_; }; + + private: + void init_bgrid_batches_(int batch_size); + std::vector> bgrid_batches_; + #endif }; } // namespace ModuleGint diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp new file mode 100644 index 0000000000..2dccc7982b --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp @@ -0,0 +1,117 @@ +#include "gint_gpu_vars.h" + +namespace ModuleGint +{ + +GintGpuVars::GintGpuVars(const std::shared_ptr gint_info, + const UnitCell& ucell, + Numerical_Orbital* Phi) +{ + #ifdef __MPI + // Set GPU for different MPI processes + dev_id_ = base_device::information::set_device_by_rank(); +#endif + std::vector ylmcoef_h(100); + for (int i = 0; i < 100; i++) + { + ylmcoef_h[i] = ModuleBase::Ylm::ylmcoef[i]; + } + 
set_ylmcoef_d(ylmcoef_h.data(), &ylmcoef_d); + + const int ntype = ucell.ntype; + std::vector atom_nw_h(ntype); + std::vector ucell_atom_nwl_h(ntype); + for (int i = 0; i < ntype; i++) + { + atom_nw_h[i] = ucell.atom[i].nw; + ucell_atom_nwl_h[i] = ucell.atom[i].nwl; + } + checkCuda(cudaMalloc((void**)&atom_nw_d, sizeof(int) * ntype)); + checkCuda(cudaMemcpy(atom_nw_d, atom_nw_h.data(), sizeof(int) * ntype, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&ucell_atom_nwl_d, sizeof(int) * ntype)); + checkCuda(cudaMemcpy(ucell_atom_nwl_d, ucell_atom_nwl_h.data(), sizeof(int) * ntype, cudaMemcpyHostToDevice)); + + const double dr_uniform = Phi[0].PhiLN(0, 0).dr_uniform; + const double max_rcut = 0; + for (int i = 0; i < ntype; i++) + { + const double rcut = Phi[i].getRcut(); + if (rcut > max_rcut) + { + max_rcut = rcut; + } + } + const double nr_max = static_cast(1 / dr_uniform * max_rcut) + 10; + + const int nwmax = ucell.nwmax; + std::vector psi_u_h(ntype * nwmax * nr_max); + std::vector dpsi_u_h(ntype * nwmax * nr_max); + std::vector d2psi_u_h(ntype * nwmax * nr_max); + std::vector atom_iw2_new_h(ntype * nwmax); + std::vector atom_iw2_ylm_h(ntype * nwmax); + std::vector atom_iw2_l(ntype * nwmax); + for (int i = 0; i < ntype; i++) + { + Atom* atomx = &ucell.atom[i]; + for (int j = 0; j < atomx->nw; j++) + { + atom_iw2_new[i * nwmax + j] = atomx->iw2_new[j]; + atom_iw2_ylm[i * nwmax + j] = atomx->iw2_ylm[j]; + atom_iw2_l[i * nwmax + j] = atomx->iw2_l[j]; + const auto psi_ptr = &Phi[i].PhiLN(atomx->iw2l[j], atomx->iw2n[j]); + const int psi_size = psi_ptr->psi_uniform.size(); + int idx = i * nwmax * nr_max + j * nr_max; + for (int k = 0; k < psi_size; k++) + { + psi_u_h[idx + k] = psi_ptr->psi_uniform[k]; + dpsi_u_h[idx + k] = psi_ptr->dpsi_uniform[k]; + d2psi_u_h[idx + k] = psi_ptr->d2psi_uniform[k]; + } + } + } + + checkCuda(cudaMalloc((void**)&atom_iw2_new_d, sizeof(bool) * ntype * nwmax)); + checkCuda(cudaMemcpy(atom_iw2_new_d, atom_iw2_new_h.data(), 
sizeof(bool) * ntype * nwmax, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&atom_iw2_ylm_d, sizeof(int) * ntype * nwmax)); + checkCuda(cudaMemcpy(atom_iw2_ylm_d, atom_iw2_ylm_h.data(), sizeof(int) * ntype * nwmax, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&atom_iw2_l_d, sizeof(int) * ntype * nwmax)); + checkCuda(cudaMemcpy(atom_iw2_l_d, atom_iw2_l_h.data(), sizeof(int) * ntype * nwmax, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&psi_u_d, sizeof(double) * ntype * nwmax * nr_max)); + checkCuda(cudaMemcpy(psi_u_d, psi_u_h.data(), sizeof(double) * ntype * nwmax * nr_max, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&dpsi_u_d, sizeof(double) * ntype * nwmax * nr_max)); + checkCuda(cudaMemcpy(dpsi_u_d, dpsi_u_h.data(), sizeof(double) * ntype * nwmax * nr_max, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&d2psi_u_d, sizeof(double) * ntype * nwmax * nr_max)); + checkCuda(cudaMemcpy(d2psi_u_d, d2psi_u_h.data(), sizeof(double) * ntype * nwmax * nr_max, cudaMemcpyHostToDevice)); + + const int mgrid_num = biggrid_info_->get_mgrid_num(); + std::vector mgrid_pos_h(mgrid_num); + for(int i = 0; i < mgrid_num; i++) + { + mgrid_pos_h[i].x = biggrid_info_->get_mgrid_coord(i).x; + mgrid_pos_h[i].y = biggrid_info_->get_mgrid_coord(i).y; + mgrid_pos_h[i].z = biggrid_info_->get_mgrid_coord(i).z; + } + checkCuda(cudaMalloc((void**)&mgrid_pos_d, sizeof(double3) * mgrid_num)); + checkCuda(cudaMemcpy(mgrid_pos_d, mgrid_pos_h.data(), sizeof(double3) * mgrid_num, cudaMemcpyHostToDevice)); + + checkCuda(cudaMalloc((void**)&iat2it_d, sizeof(int) * ucell.nat)); + checkCuda(cudaMemcpy(iat2it_d, ucell.iat2it, sizeof(int) * ucell.nat, cudaMemcpyHostToDevice)); + + gemm_algo_selector(gint_info->get_bgrid_info()->get_mgrids_num(), fastest_matrix_mul, *ucell); +} + +GintGpuVars::~GintGpuVars() +{ + checkCuda(cudaFree(atom_nw_d)); + checkCuda(cudaFree(ucell_atom_nwl_d)); + checkCuda(cudaFree(atom_iw2_new_d)); + 
checkCuda(cudaFree(atom_iw2_ylm_d)); + checkCuda(cudaFree(atom_iw2_l_d)); + checkCuda(cudaFree(psi_u_d)); + checkCuda(cudaFree(dpsi_u_d)); + checkCuda(cudaFree(d2psi_u_d)); + checkCuda(cudaFree(mgrid_pos_d)); + checkCuda(cudaFree(iat2it_d)); +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h new file mode 100644 index 0000000000..cffa5e36b9 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include "set_const_mem.cuh" +#include "module_base/ylm.h" +#include "module_cell/unitcell.h" +#include "module_cell/atom_spec.h" +#include "module_gint/kernels/cuda/gemm_selector.cuh" +#include "module_gint/temp_gint/gint_info.h" +#include "module_gint/kernels/cuda/cuda_tools.cuh" + +namespace ModuleGint +{ + +class GintGpuVars +{ + public: + GintGpuVars(const std::shared_ptr gint_info, + const UnitCell& ucell, + Numerical_Orbital* Phi); + ~GintGpuVars(); + + // ylmcoef_d is __constant__ memory + double* ylmcoef_d = nullptr; + int* atom_nw_d = nullptr; + int* ucell_atom_nwl_d = nullptr; + bool* atom_iw2_new_d = nullptr; + int* atom_iw2_ylm_d = nullptr; + int* atom_iw2_l_d = nullptr; + double* psi_u_d = nullptr; + double* dpsi_u_d = nullptr; + double* d2psi_u_d = nullptr; + double3* mgrid_pos_d = nullptr; + int* iat2it_d = nullptr; + int dev_id = 0; + matrix_multiple_func_type fastest_matrix_mul; + +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cpp new file mode 100644 index 0000000000..d9fbb1f0fb --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cpp @@ -0,0 +1,19 @@ +#include "phi_operator_gpu.h" + +namespace ModuleGint +{ + 
+PhiOperatorGpu::PhiOperatorGpu(cudaStream_t stream) +: stream_(stream) +{ + atoms_num_info_h_.reserve(BatchBigGrid::get_max_batch_size()); + atoms_num_info_h_.reserve(BatchBigGrid::get_max_batch_size()); + atoms_iat_h_.reserve(BatchBigGrid::get_max_atoms_num()); + atoms_iat_d_.reserve(BatchBigGrid::get_max_atoms_num()); + atoms_bgrids_rcoords_h_.reserve(BatchBigGrid::get_max_atoms_num()); + atoms_bgrids_rcoords_d_.reserve(BatchBigGrid::get_max_atoms_num()); +} + + + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h new file mode 100644 index 0000000000..e3cecc0c0f --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h @@ -0,0 +1,48 @@ +#pragma once +#include +#include + +#include "temp_gint/batch_biggrid.h" +#include "module_gint/kernels/cuda/cuda_tools.cuh" +#include "gint_gpu_vars.h" +#include +#include + +namespace ModuleGint +{ + +class PhiOperatorGpu +{ + +public: + PhiOperatorGpu(cudaStream_t stream); + + void set_bgrid_batch(std::shared_ptr bgrid_batch); + static void set_gint_gpu_vars(std::shared_ptr gint_gpu_vars) + { + gint_gpu_vars_ = gint_gpu_vars; + }; + + void + +private: + std::shared_ptr bgrid_batch_; + static std::shared_ptr gint_gpu_vars_; + cudaStream_t stream_ = nullptr; + + // The first number in every group of two represents the number of atoms on that bigcell. + // The second number represents the cumulative number of atoms up to that bigcell. 
+ thrust::host_vector atoms_num_info_h_; + thrust::device_vector atoms_num_info_d_; + + // the iat of each atom + thrust::host_vector atoms_iat_h_; + thrust::device_vector atoms_iat_d_; + + // atoms_bgrids_rcoords_ here represents the relative coordinates from the big grid to the atoms + thrust::host_vector atoms_bgrids_rcoords_h_; + thrust::device_vector atoms_bgrids_rcoords_d_; + +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu new file mode 100644 index 0000000000..b3b807a0b9 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu @@ -0,0 +1,11 @@ +#include "set_const_mem.cuh" +#include "module_gint/kernels/cuda/cuda_tools.cuh" + +namespace ModuleGint +{ + __host__ void set_ylmcoe_d(const double* ylmcoe_h, double** ylmcoe_d_addr) + { + checkCuda(cudaMemcpyToSymbol(ylmcoe_d, ylmcoe_h, sizeof(double) * 100)); + checkCuda(cudaGetSymbolAddress((void**)ylmcoe_d_addr, ylmcoe_d)); + } +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh new file mode 100644 index 0000000000..c063d622dd --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh @@ -0,0 +1,9 @@ +#pragma once +#include + +__constant__ double ylmcoe_d[100]; + +namespace ModuleGint +{ +__host__ void set_ylmcoe_d(const double* ylmcoe_h, double** ylmcoe_d_addr); +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp index d714546864..f65c3369d6 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp @@ -9,7 +9,7 @@ void 
PhiOperator::set_bgrid(std::shared_ptr biggrid) { biggrid_ = biggrid; rows_ = biggrid_->get_mgrids_num(); - cols_ = biggrid_->get_mgrid_phi_len(); + cols_ = biggrid_->get_phi_len(); biggrid_->set_atoms_startidx(atoms_startidx_); biggrid_->set_atoms_phi_len(atoms_phi_len_); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h index 5b5366e701..8fe9cbfc11 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" #include "big_grid.h" namespace ModuleGint @@ -110,7 +110,7 @@ class PhiOperator int rows_; // the column number of the phi matrix - // cols_ = biggrid_->get_mgrid_phi_len() + // cols_ = biggrid_->get_phi_len() int cols_; // the local index of the meshgrids diff --git a/source/source_esolver/lcao_before_scf.cpp b/source/source_esolver/lcao_before_scf.cpp index 418c09ce41..f921bab0a4 100644 --- a/source/source_esolver/lcao_before_scf.cpp +++ b/source/source_esolver/lcao_before_scf.cpp @@ -31,6 +31,10 @@ #ifdef __EXX #include "module_io/restart_exx_csr.h" #endif +#ifdef __CUDA +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h" +#endif namespace ModuleESolver { @@ -111,7 +115,14 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) orb_.Phi, ucell, this->gd); - ModuleGint::Gint::init_gint_info(gint_info); +#ifdef __CUDA + auto gint_gpu_vars = std::make_shared( + gint_info, + ucell, + orb_.Phi); + ModuleGint::PhiOperatorGpu::set_gint_gpu_vars(gint_gpu_vars); +#endif + ModuleGint::Gint::set_gint_info(gint_info); #endif psi_u.clear(); From 7f061140b954b9e5c412a7db7ed9c099d722f3cd Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 26 Apr 2025 19:38:40 +0800 
Subject: [PATCH 02/63] change gemm function --- .../module_gint/kernels/cuda/code_gen.cuh | 466 +++++++++--------- .../module_gint/kernels/cuda/code_gen_00.cu | 46 +- .../module_gint/kernels/cuda/code_gen_01.cu | 46 +- .../module_gint/kernels/cuda/code_gen_02.cu | 46 +- .../module_gint/kernels/cuda/code_gen_03.cu | 46 +- .../module_gint/kernels/cuda/code_gen_04.cu | 46 +- .../module_gint/kernels/cuda/code_gen_05.cu | 46 +- .../module_gint/kernels/cuda/code_gen_06.cu | 46 +- .../module_gint/kernels/cuda/code_gen_07.cu | 46 +- .../module_gint/kernels/cuda/code_gen_08.cu | 46 +- .../module_gint/kernels/cuda/code_gen_09.cu | 52 +- .../module_gint/kernels/cuda/gemm_selector.cu | 12 +- .../kernels/cuda/vbatch_matrix_mul.cuh | 98 ++-- 13 files changed, 521 insertions(+), 521 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cuh index a4b1a75916..b6e01e62d6 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cuh @@ -4,470 +4,470 @@ #include "gemm_selector.cuh" #include -extern template void gemm_time_measure(int, int, int*, int*, int*, double**, int*, double**, int*, double**, int*, int, cudaStream_t, float&, matrix_multiple_func_type&, double*, double*, double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); #endif \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_00.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_00.cu index a07c411485..194e7bb863 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_00.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_00.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int, int, int*, int*, int*, double**, int*, double**, int*, double**, int*, int, cudaStream_t, float&, matrix_multiple_func_type&, double*, double*, double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_01.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_01.cu index 9f725c23c6..9d5b2dc664 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_01.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_01.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_02.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_02.cu index 090eab0709..7d0996d49a 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_02.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_02.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const 
double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_03.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_03.cu index 046d0e5063..ab271c8783 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_03.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_03.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const 
double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_04.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_04.cu index f74209d829..22d2fa85a2 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_04.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_04.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const 
double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_05.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_05.cu index c9cb81bd7c..33ad2ac892 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_05.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_05.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const 
double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_06.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_06.cu index f5fac39df2..91933f8bf6 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_06.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_06.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const 
double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_07.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_07.cu index 971c6eb0c0..08a47c3a9b 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_07.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_07.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const 
double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_08.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_08.cu index 8643faae70..89834ab470 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_08.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_08.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const 
double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_09.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_09.cu index 8cf333bf6f..277e163712 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_09.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_09.cu @@ -1,53 +1,53 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const 
double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); \ No newline at end of file +template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cu index b8dda451f4..48b0a9bf7f 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cu @@ -59,8 +59,8 @@ void gemm_algo_selector(int matrix_k, matrix_multiple_func_type& fastest_algo,co n.get_host_pointer()[index] = ucell.atoms[l].nw; k.get_host_pointer()[index] = matrix_k; - lda.get_host_pointer()[index] = matrix_k; - ldb.get_host_pointer()[index] = matrix_k; + lda.get_host_pointer()[index] = ucell.atoms[j].nw; + ldb.get_host_pointer()[index] = ucell.atoms[l].nw; ldc.get_host_pointer()[index] = ucell.atoms[l].nw; A_array.get_host_pointer()[index] @@ -71,19 +71,19 @@ void gemm_algo_selector(int matrix_k, matrix_multiple_func_type& fastest_algo,co = &C.get_device_pointer()[index * max_n * max_m]; // test atom add BlasConnector::gemm( - 'N', 'T', + 'N', m.get_host_pointer()[index], n.get_host_pointer()[index], matrix_k, 1.0, &A.get_host_pointer()[index * max_m * matrix_k], - matrix_k, + lda.get_host_pointer()[index], &B.get_host_pointer()[index * max_n * matrix_k], - matrix_k, + ldb.get_host_pointer()[index], 1.0, &cpu_result[index * max_m * max_n], - n.get_host_pointer()[index]); + ldc.get_host_pointer()[index]); index++; } } diff 
--git a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh index 8a7263c9a5..39bc336e77 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh @@ -31,15 +31,15 @@ template shmem copy - T ra[BLK_M / DIM_YA][BLK_K / DIM_XA]; - T rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; + T ra[BLK_K / DIM_XA][BLK_M / DIM_YA]; + T rb[BLK_K / DIM_XB][BLK_N / DIM_YB]; // bound is the correction to offs_d in order to not get out of memory bound // so bound could be negative value since offs_d could be out of bound @@ -89,22 +89,22 @@ static __device__ void vbatched_gemm_device(int M, // Load A dev->shmem #pragma unroll - for (n = 0; n < BLK_M; n += DIM_YA) + for (n = 0; n < BLK_K; n += DIM_XA) { #pragma unroll - for (m = 0; m < BLK_K; m += DIM_XA) + for (m = 0; m < BLK_M; m += DIM_YA) { - sA(n + idyA, m + idxA) = fetch(A, m, n, boundA); + sA(m + idyA, n + idxA) = fetch(A, m, n, boundA); } } #pragma unroll - for (n = 0; n < BLK_N; n += DIM_YB) + for (n = 0; n < BLK_K; n += DIM_XB) { #pragma unroll - for (m = 0; m < BLK_K; m += DIM_XB) + for (m = 0; m < BLK_N; m += DIM_YB) { - sB(m + idxB, n + idyB) = fetch(B, m, n, boundB); + sB(n + idxB, m + idyB) = fetch(B, m, n, boundB); } } @@ -112,31 +112,31 @@ static __device__ void vbatched_gemm_device(int M, for (kk = 0; kk < K - BLK_K; kk += BLK_K) { - offs_dA += BLK_K; - boundA -= BLK_K; + offs_dA += BLK_K * LDA; + boundA -= BLK_K * LDA; - offs_dB += BLK_K; - boundB -= BLK_K; + offs_dB += BLK_K * LDB; + boundB -= BLK_K * LDB; // Load A dev->regs #pragma unroll - for (n = 0; n < BLK_M / DIM_YA; n++) + for (n = 0; n < BLK_K / DIM_XA; n++) { #pragma unroll - for (m = 0; m < BLK_K / DIM_XA; m++) + for (m = 0; m < BLK_M / DIM_YA; m++) { - ra[n][m] = fetch(A, m * DIM_XA, n * DIM_YA, boundA); + ra[n][m] = fetch(A, m * DIM_YA, n * DIM_XA, boundA); } } // 
Load B dev->regs #pragma unroll - for (n = 0; n < BLK_N / DIM_YB; n++) + for (n = 0; n < BLK_K / DIM_XB; n++) { #pragma unroll - for (m = 0; m < BLK_K / DIM_XB; m++) + for (m = 0; m < BLK_N / DIM_YB; m++) { - rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB); + rb[n][m] = fetch(B, m * DIM_YB, n * DIM_XB, boundB); } } @@ -174,23 +174,23 @@ static __device__ void vbatched_gemm_device(int M, // Load A regs->shmem #pragma unroll - for (n = 0; n < BLK_M / DIM_YA; n++) + for (n = 0; n < BLK_K / DIM_XA; n++) { #pragma unroll - for (m = 0; m < BLK_K / DIM_XA; m++) + for (m = 0; m < BLK_M / DIM_YA; m++) { - sA(n * DIM_YA + idyA, m * DIM_XA + idxA) = ra[n][m]; + sA(m * DIM_YA + idyA, n * DIM_XA + idxA) = ra[n][m]; } } // Load B regs->shmem #pragma unroll - for (n = 0; n < BLK_N / DIM_YB; n++) + for (n = 0; n < BLK_K / DIM_XB; n++) { #pragma unroll - for (m = 0; m < BLK_K / DIM_XB; m++) + for (m = 0; m < BLK_N / DIM_YB; m++) { - sB(m * DIM_XB + idxB, n * DIM_YB + idyB) = rb[n][m]; + sB(n * DIM_XB + idxB, m * DIM_YB + idyB) = rb[n][m]; } } __syncthreads(); @@ -260,16 +260,16 @@ template -static __global__ void vbatched_gemm_kernel(int* M, - int* N, - int* K, - T** global_A_array, - int* global_lda, - T** global_B_array, - int* global_ldb, +static __global__ void vbatched_gemm_kernel(const int* M, + const int* N, + const int* K, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, T** global_C_array, - int* global_ldc, - T* alpha) + const int* global_ldc, + const T* alpha) { extern __shared__ __align__(sizeof(T)) unsigned char smem[]; T* shared_mem = reinterpret_cast(smem); @@ -381,7 +381,7 @@ static inline int ceildiv(int x, int y) * especially the fact that we can relatively easily control the arrangement of * the matrix elements, we have only implemented one type of requirement for * matrix transposition. 
That is, we have implemented the operation C = alpha * - * trans(A) * B + C under the constraint of column-major order. + * A * trans(B) + C under the constraint of column-major order. * * Finally, we would like to thank Magma for its contributions to the field of * scientific computing. @@ -415,8 +415,8 @@ void vbatched_gemm_impl(int max_m, // The positions of A and B have been swapped here. // This is because the original code is for column-major matrices. // We use row-major matrices, so we need to swap A and B. - // The vbatched_gemm_impl is for C = trans(A) * B + C, but we need trans(C). - // Which means: trans(C) = trans(trans(A)*B + C) = trans(B) * A + trans(C) + // The vbatched_gemm_impl is for C = A * trans(B) + C, but we need trans(C). + // Which means: trans(C) = trans(A * trans(B) + C) = B * trans(A) + trans(C) // Then, ldc should be N, lda and ldb should be K size_t shared_mem_size = 0; @@ -452,7 +452,7 @@ void vbatched_gemm_impl(int max_m, if (remain_num > 0) { dim3 dimGrid(ceildiv(max_n, BLK_M), ceildiv(max_m, BLK_N), remain_num); - T* alpha_tmp = nullptr; + const T* alpha_tmp = nullptr; if (alpha != nullptr) { alpha_tmp = alpha + loop_num * max_batch_count; @@ -484,15 +484,15 @@ template void gemm_time_measure(int max_m, int max_n, - int* m, - int* n, - int* k, - T** global_A_array, - int* global_lda, - T** global_B_array, - int* global_ldb, + const int* m, + const int* n, + const int* k, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, T** global_C_array, - int* global_ldc, + const int* global_ldc, int batchCount, cudaStream_t stream, float& fast_time, From 1ef0450b2c2570b001cd5525a84513c459f2902e Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Wed, 30 Apr 2025 17:26:19 +0800 Subject: [PATCH 03/63] enable gint_vl_gpu --- .../module_gint/CMakeLists.txt | 10 + .../module_gint/grid_technique.cpp | 2 +- .../module_gint/kernels/cuda/cuda_tools.cu | 2 +- 
.../module_gint/kernels/cuda/cuda_tools.cuh | 10 +- .../module_gint/temp_gint/batch_biggrid.cpp | 23 + .../module_gint/temp_gint/batch_biggrid.h | 46 +- .../module_gint/temp_gint/big_grid.cpp | 6 +- .../module_gint/temp_gint/big_grid.h | 6 +- .../module_gint/temp_gint/gint_info.cpp | 8 +- .../module_gint/temp_gint/gint_info.h | 10 +- .../module_gint/temp_gint/gint_interface.cpp | 14 +- .../module_gint/temp_gint/gint_vl_gpu.cpp | 70 ++++ .../module_gint/temp_gint/gint_vl_gpu.h | 51 +++ .../temp_gint/kernel/cuda_mem_wrapper.h | 183 ++++++++ .../temp_gint/kernel/gint_gpu_vars.cpp | 69 +-- .../temp_gint/kernel/gint_gpu_vars.h | 26 +- .../temp_gint/kernel/phi_operator_gpu.cpp | 19 - .../temp_gint/kernel/phi_operator_gpu.cu | 231 +++++++++++ .../temp_gint/kernel/phi_operator_gpu.h | 70 +++- .../temp_gint/kernel/phi_operator_kernel.cu | 105 +++++ .../temp_gint/kernel/phi_operator_kernel.cuh | 40 ++ .../temp_gint/kernel/set_const_mem.cu | 4 +- .../temp_gint/kernel/set_const_mem.cuh | 2 - .../module_gint/temp_gint/kernel/sph.cuh | 392 ++++++++++++++++++ source/source_esolver/esolver_ks_lcao.cpp | 6 + source/source_esolver/lcao_before_scf.cpp | 11 - 26 files changed, 1282 insertions(+), 134 deletions(-) create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h delete mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh diff --git 
a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 7b43114adb..78be519ab1 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -52,6 +52,16 @@ if(NEW_GINT) temp_gint/gint_common.cpp temp_gint/gint_interface.cpp ) + if(USE_CUDA) + list(APPEND objects + temp_gint/kernel/gint_gpu_vars.cpp + temp_gint/kernel/phi_operator_gpu.cu + temp_gint/kernel/phi_operator_kernel.cu + temp_gint/kernel/set_const_mem.cu + temp_gint/batch_biggrid + temp_gint/gint_vl_gpu.cpp + ) + endif() endif() if(USE_CUDA) diff --git a/source/module_hamilt_lcao/module_gint/grid_technique.cpp b/source/module_hamilt_lcao/module_gint/grid_technique.cpp index b343d5ea2c..50dd92da6c 100644 --- a/source/module_hamilt_lcao/module_gint/grid_technique.cpp +++ b/source/module_hamilt_lcao/module_gint/grid_technique.cpp @@ -122,7 +122,7 @@ void Grid_Technique::set_pbc_grid(const int& ncx_in, this->cal_trace_lo(ucell); #if ((defined __CUDA) /* || (defined __ROCM) */) if (PARAM.inp.device == "gpu") { - this->init_gpu_gint_variables(ucell, num_stream); + // this->init_gpu_gint_variables(ucell, num_stream); } #endif diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu index 09bab1586e..fd0e6039b0 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu @@ -24,7 +24,7 @@ cudaError_t __checkCudaLastError(const char *file, const int line) return result; } -void dump_cuda_array_to_file(double* cuda_array, +void dump_cuda_array_to_file(const double* cuda_array, int width, int hight, const std::string& filename) diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh index 803e76ff22..4a820b9ce7 100644 --- 
a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh @@ -15,15 +15,15 @@ cudaError_t check(cudaError_t result, const char *const func, const char *const file, const int line); cudaError_t __checkCudaLastError(const char *file, const int line); -void dump_cuda_array_to_file(double* cuda_array, +void dump_cuda_array_to_file(const double* cuda_array, int width, int hight, const std::string& filename); -inline int ceil_div(int a, int b) -{ - return (a + b - 1) / b; -} +// inline int ceil_div(int a, int b) +// { +// return (a + b - 1) / b; +// } /* * @brief: A simple wrapper for cudaMalloc and cudaFree, sync and async CUDA diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp index 080a06d174..5f514b669d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp @@ -6,5 +6,28 @@ namespace ModuleGint int BatchBigGrid::max_batch_size_ = 0; int BatchBigGrid::max_atoms_num_ = 0; int BatchBigGrid::max_phi_len_ = 0; +int BatchBigGrid::max_atom_pairs_num_ = 0; + +BatchBigGrid::BatchBigGrid(std::vector> biggrids) +{ + biggrids_ = biggrids; + max_batch_size_ = std::max(max_batch_size_, (int)biggrids_.size()); + int atom_pairs_num = 0; + for(const auto& biggrid : biggrids_) + { + for(const auto& atom: biggrid->get_atoms()) + { + max_nw_ = std::max(max_nw_, atom->get_nw()); + } + atoms_num_ += biggrid->get_atoms_num(); + atom_pairs_num += std::pow(biggrid->get_atoms_num(), 2); + phi_len_ += biggrid->get_phi_len() * biggrid->get_mgrids_num(); + } + max_atoms_num_ = std::max(max_atoms_num_, atoms_num_); + max_phi_len_ = std::max(max_phi_len_, phi_len_); + max_atom_pairs_num_ = std::max(max_atom_pairs_num_, atom_pairs_num); +} + + } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h 
b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h index 0528f7a8ef..401e1c38e9 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h @@ -9,36 +9,38 @@ namespace ModuleGint class BatchBigGrid { public: - BatchBigGrid(std::vector> biggrids) - { - biggrids_ = biggrids; - max_batch_size_ = std::max(max_batch_size_, biggrids_.size()); - int atoms_num = 0; - int phi_len = 0; - for(const auto& biggrid : biggrids_) - { - atoms_num += biggrid->get_atoms_num(); - phi_len += biggrid->get_phi_len(); - } - max_atoms_num_ = std::max(max_atoms_num_, atoms_num); - max_phi_len_ = std::max(max_phi_len_, phi_len); - }; + BatchBigGrid(std::vector> biggrids); - const std::vector>& get_bgrids() { return biggrids_; } + const std::vector>& get_bgrids() { return biggrids_; }; - static int get_max_batch_size() { return max_batch_size_; } - static int get_max_atoms_num() { return max_atoms_num_; } - static int get_max_phi_len() { return max_phi_len_; } + int get_batch_size() const { return biggrids_.size(); }; + int get_atoms_num() const { return atoms_num_; }; + int get_phi_len() const { return phi_len_;} + bool empty() {return atoms_num_ == 0; }; + static int get_max_batch_size() { return max_batch_size_; }; + static int get_max_atoms_num() { return max_atoms_num_; }; + static int get_max_phi_len() { return max_phi_len_; }; + static int get_max_atom_pairs_num() { return max_atom_pairs_num_; }; + static std::shared_ptr get_bgrid_info() { return BigGrid::get_bgrid_info(); }; private: std::vector> biggrids_; - // the max number of biggrids of a batch_biggrid + // the max nw of an atom + int max_nw_ = 0; + + int phi_len_ = 0; + // number of atoms in the batch + int atoms_num_ = 0; + + // the max number of biggrids of a biggrids batch static int max_batch_size_; - // the max number of total atoms of a batch_biggrid + // the max number of total atoms of a biggrids batch static int 
max_atoms_num_; - // the max number of total wavefunctions of a batch_biggrid + // the max number of total wavefunctions of a biggrids batch static int max_phi_len_; -} + // the max number of atom pairs of a biggrids batch + static int max_atom_pairs_num_; +}; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp index af082f3b9e..e20a0fb50a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp @@ -89,15 +89,15 @@ void BigGrid::set_atom_relative_coords(const Vec3i bgrid_idx, const Vec3d tau_in atom_coord.resize(biggrid_info_->get_mgrids_num()); for(int im = 0; im < biggrid_info_->get_mgrids_num(); ++im) { - const Vec3d& mcell_coord = biggrid_info_->get_mgrid_coord(im); - atom_coord[im] = mcell_coord - bgrid_relative_coord; + const Vec3d& mgrid_coord = biggrid_info_->get_mgrid_coord(im); + atom_coord[im] = mgrid_coord - bgrid_relative_coord; } } Vec3d BigGrid::get_bgrid_atom_rcoord(const GintAtom* atom) const { Vec3i this_bgrid_idx = localcell_info_->get_bgrid_global_idx_3D(idx_); - return unitcell_info_->get_relative_coord(bgrid_idx, this_bgrid_idx) + tau_in_bgrid;; + return unitcell_info_->get_relative_coord(atom->get_bgrid_idx(), this_bgrid_idx) + atom->get_tau_in_bgrid(); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h index 5c1b5c2c2c..b8ea90eeeb 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h @@ -23,9 +23,9 @@ class BigGrid // getter functions int get_idx() const { return idx_; }; - std::shared_ptr get_localcell_info() const { return localcell_info_; }; - std::shared_ptr get_unitcell_info() const {return unitcell_info_; }; - std::shared_ptr get_bgrid_info() const { return biggrid_info_; }; + 
static std::shared_ptr get_localcell_info() { return localcell_info_; }; + static std::shared_ptr get_unitcell_info() { return unitcell_info_; }; + static std::shared_ptr get_bgrid_info() { return biggrid_info_; }; const std::vector& get_atoms() const { return atoms_; }; const GintAtom* get_atom(int i) const { return atoms_[i]; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp index a32ef1a0d3..1a56e8a017 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp @@ -49,7 +49,11 @@ GintInfo::GintInfo( init_ijr_info_(ucell, gd); #ifdef __CUDA - init_bgrid_batches(nbz_local); + if(PARAM.inp.device == "gpu") + { + init_bgrid_batches_(nbz_local); + gpu_vars_ = std::make_shared(biggrid_info_, ucell, Phi); + } #endif } @@ -214,7 +218,7 @@ void GintInfo::init_ijr_info_(const UnitCell& ucell, Grid_Driver& gd) #ifdef __CUDA void GintInfo::init_bgrid_batches_(int batch_size) { - for (int i = 0; i < biggrids_.size(); i+ = batch_size) + for (int i = 0; i < biggrids_.size(); i += batch_size) { std::vector> bgrid_vec; for(int j = i; j < i + batch_size && j < biggrids_.size(); j++) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h index 8e09e08e6f..d79043fb2e 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h @@ -15,6 +15,7 @@ #ifdef __CUDA #include "batch_biggrid.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h" #endif namespace ModuleGint @@ -33,7 +34,7 @@ class GintInfo const UnitCell& ucell, Grid_Driver& gd); // getter functions - std::vector>& get_biggrids() const { return biggrids_; }; + std::vector>& get_biggrids() { return biggrids_; }; std::shared_ptr get_bgrid_info() const { return biggrid_info_; }; 
double get_local_mgrid_num() const { return localcell_info_->get_mgrids_num(); }; double get_mgrid_volume() const { return meshgrid_info_->get_volume(); }; @@ -85,11 +86,14 @@ class GintInfo #ifdef __CUDA public: - std::vector>& get_bgrid_batches() const { return bgrid_batches_; }; - + std::vector>& get_bgrid_batches() { return bgrid_batches_; }; + std::shared_ptr get_gpu_vars() const { return gpu_vars_; }; + int get_dev_id() const { return gpu_vars_->dev_id_; }; + private: void init_bgrid_batches_(int batch_size); std::vector> bgrid_batches_; + std::shared_ptr gpu_vars_; #endif }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index bd945b8b19..c5e800f020 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -1,6 +1,7 @@ #include "gint_interface.h" #include "source_base/timer.h" #include "gint_vl.h" +#include "gint_vl_gpu.h" #include "gint_vl_metagga.h" #include "gint_vl_nspin4.h" #include "gint_vl_metagga_nspin4.h" @@ -17,8 +18,17 @@ void cal_gint_vl( HContainer* hR) { ModuleBase::timer::tick("Gint", "cal_gint_vl"); - Gint_vl gint_vl(vr_eff, hR); - gint_vl.cal_gint(); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_gpu gint_vl_gpu(vr_eff, hR); + gint_vl_gpu.cal_gint(); + } else +#endif + { + Gint_vl gint_vl(vr_eff, hR); + gint_vl.cal_gint(); + } ModuleBase::timer::tick("Gint", "cal_gint_vl"); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp new file mode 100644 index 0000000000..8d0120cbc6 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp @@ -0,0 +1,70 @@ +#include "gint_vl_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ 
+ +void Gint_vl_gpu::cal_gint() +{ + init_hr_gint_(); + transfer_cpu_to_gpu_(); + cal_hr_gint_(); + transfer_gpu_to_cpu_(); + compose_hr_gint(hr_gint_); + transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_), hR_); +} + +void Gint_vl_gpu::init_hr_gint_() +{ + hr_gint_ = gint_info_->get_hr(); +} + +void Gint_vl_gpu::transfer_cpu_to_gpu_() +{ + hr_gint_d_ = CudaMemWrapper(hr_gint_->get_nnr(), 0, false); + vr_eff_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_.get_device_ptr(), vr_eff_, + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); +} + +void Gint_vl_gpu::transfer_gpu_to_cpu_() +{ + checkCuda(cudaMemcpy(hr_gint_->get_wrapper(), hr_gint_d_.get_device_ptr(), + hr_gint_->get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void Gint_vl_gpu::cal_hr_gint_() +{ +#pragma omp parallel + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi(phi.get_device_ptr()); + phi_op.phi_mul_vldr3(vr_eff_d_.get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_phi_vldr3(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h new file mode 100644 index 0000000000..f609a3f053 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_gpu : public Gint +{ + public: + Gint_vl_gpu( + const double* vr_eff, + HContainer* hR) + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + + void cal_gint() override; + + private: + + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_hr_gint_(); + + // input + const double* vr_eff_; + + + // output + HContainer* hR_; + + //======================== + // Intermediate variables + //======================== + double dr3_; + + std::shared_ptr> hr_gint_; + + CudaMemWrapper hr_gint_d_; + CudaMemWrapper vr_eff_d_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h new file mode 100644 index 0000000000..ee2f2e10e2 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h @@ -0,0 +1,183 @@ +#pragma once +#include +#include "module_base/tool_quit.h" +#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" + +template +class CudaMemWrapper +{ + public: + + CudaMemWrapper() = default; + CudaMemWrapper(const CudaMemWrapper& other) = delete; + CudaMemWrapper& operator=(const CudaMemWrapper& other) = delete; + CudaMemWrapper(CudaMemWrapper&& other) noexcept + { + this->device_ptr_ = other.device_ptr_; + this->host_ptr_ = other.host_ptr_; + this->size_ = other.size_; + this->capacity_ = other.capacity_; + this->malloc_host_ = other.malloc_host_; + 
this->stream_ = other.stream_; + + other.device_ptr_ = nullptr; + other.host_ptr_ = nullptr; + other.size_ = 0; + other.capacity_ = 0; + other.malloc_host_ = false; + other.stream_ = 0; + }; + + CudaMemWrapper& operator=(CudaMemWrapper&& other) noexcept + { + if (this != &other) + { + this->device_ptr_ = other.device_ptr_; + this->host_ptr_ = other.host_ptr_; + this->size_ = other.size_; + this->capacity_ = other.capacity_; + this->malloc_host_ = other.malloc_host_; + this->stream_ = other.stream_; + + other.device_ptr_ = nullptr; + other.host_ptr_ = nullptr; + other.size_ = 0; + other.capacity_ = 0; + other.malloc_host_ = false; + other.stream_ = 0; + } + return *this; + }; + + CudaMemWrapper(int capacity, + cudaStream_t stream = 0, + bool malloc_host = true) + { + capacity_ = capacity; + size_ = capacity; + malloc_host_ = malloc_host; + stream_ = stream; + + if (malloc_host) + { checkCuda(cudaMallocHost((void**)&host_ptr_, capacity * sizeof(T))); } + else + { host_ptr_ = nullptr; } + + checkCuda(cudaMalloc((void**)&device_ptr_, capacity * sizeof(T))); + }; + + ~CudaMemWrapper() + { + free(); + }; + + void copy_host_to_device_sync(int size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to device."); } + checkCuda(cudaMemcpy(device_ptr_, host_ptr_, size * sizeof(T), cudaMemcpyHostToDevice)); + }; + + void copy_host_to_device_sync() + { + copy_host_to_device_sync(size_); + }; + + void copy_host_to_device_async(int size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to device."); } + checkCuda(cudaMemcpyAsync(device_ptr_, host_ptr_, size * sizeof(T), cudaMemcpyHostToDevice, stream_)); + }; + + void copy_host_to_device_async() + { + copy_host_to_device_async(size_); + }; + + void copy_device_to_host_sync(int size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to 
host."); } + checkCuda(cudaMemcpy(host_ptr_, device_ptr_, size * sizeof(T), cudaMemcpyDeviceToHost)); + }; + + void copy_device_to_host_sync() + { + copy_device_to_host_sync(size_); + }; + + void copy_device_to_host_async(int size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to host."); } + checkCuda(cudaMemcpyAsync(host_ptr_, device_ptr_, size * sizeof(T), cudaMemcpyDeviceToHost, stream_)); + }; + + void copy_device_to_host_async() + { + copy_device_to_host_async(size_); + }; + + void memset_device_sync(const int size, const int value = 0) + { + checkCuda(cudaMemset(device_ptr_, value, size * sizeof(T))); + }; + + void memset_device_sync(const int value = 0) + { + memset_device_sync(size_, value); + }; + + void memset_device_async(const int size, const int value = 0) + { + checkCuda(cudaMemsetAsync(device_ptr_, value, size * sizeof(T), stream_)); + }; + + void memset_device_async(const int value = 0) + { + memset_device_async(size_, value); + }; + + void memset_host(const int size, const int value = 0) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot memset host."); } + checkCuda(cudaMemset(host_ptr_, value, size * sizeof(T))); + }; + + void memset_host(const int value = 0) + { + memset_host(size_, value); + }; + + void free() + { + checkCuda(cudaFree(device_ptr_)); + checkCuda(cudaFreeHost(host_ptr_)); + } + + T* get_device_ptr() { return device_ptr_; }; + T* get_host_ptr() { return host_ptr_; }; + + // Only supports setting size to a value less than or equal to capacity + void set_size(int new_size) + { + if (new_size > capacity_) + { + ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "New size exceeds capacity, cannot resize."); + } + size_ = new_size; + }; + + int get_size() { return size_; }; + int get_capacity() { return capacity_; }; + + private: + T* device_ptr_ = nullptr; + T* host_ptr_ = nullptr; + int size_ = 0; + int 
capacity_ = 0; + bool malloc_host_ = false; + cudaStream_t stream_ = 0; +}; \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp index 2dccc7982b..9d75a3fed4 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp @@ -1,14 +1,15 @@ #include "gint_gpu_vars.h" +#include "module_base/module_device/device.h" namespace ModuleGint { -GintGpuVars::GintGpuVars(const std::shared_ptr gint_info, +GintGpuVars::GintGpuVars(std::shared_ptr biggrid_info, const UnitCell& ucell, - Numerical_Orbital* Phi) + const Numerical_Orbital* Phi) { - #ifdef __MPI - // Set GPU for different MPI processes +// set device +#ifdef __MPI dev_id_ = base_device::information::set_device_by_rank(); #endif std::vector ylmcoef_h(100); @@ -16,48 +17,52 @@ GintGpuVars::GintGpuVars(const std::shared_ptr gint_info, { ylmcoef_h[i] = ModuleBase::Ylm::ylmcoef[i]; } - set_ylmcoef_d(ylmcoef_h.data(), &ylmcoef_d); + set_ylmcoe_d(ylmcoef_h.data(), &ylmcoef_d); const int ntype = ucell.ntype; std::vector atom_nw_h(ntype); std::vector ucell_atom_nwl_h(ntype); for (int i = 0; i < ntype; i++) { - atom_nw_h[i] = ucell.atom[i].nw; - ucell_atom_nwl_h[i] = ucell.atom[i].nwl; + atom_nw_h[i] = ucell.atoms[i].nw; + ucell_atom_nwl_h[i] = ucell.atoms[i].nwl; } checkCuda(cudaMalloc((void**)&atom_nw_d, sizeof(int) * ntype)); checkCuda(cudaMemcpy(atom_nw_d, atom_nw_h.data(), sizeof(int) * ntype, cudaMemcpyHostToDevice)); checkCuda(cudaMalloc((void**)&ucell_atom_nwl_d, sizeof(int) * ntype)); checkCuda(cudaMemcpy(ucell_atom_nwl_d, ucell_atom_nwl_h.data(), sizeof(int) * ntype, cudaMemcpyHostToDevice)); - const double dr_uniform = Phi[0].PhiLN(0, 0).dr_uniform; - const double max_rcut = 0; + dr_uniform = Phi[0].PhiLN(0, 0).dr_uniform; + double max_rcut = 0; + std::vector rcut_h(ntype); for (int 
i = 0; i < ntype; i++) { - const double rcut = Phi[i].getRcut(); - if (rcut > max_rcut) + rcut_h[i] = Phi[i].getRcut(); + if (rcut_h[i] > max_rcut) { - max_rcut = rcut; + max_rcut = rcut_h[i]; } } - const double nr_max = static_cast(1 / dr_uniform * max_rcut) + 10; + checkCuda(cudaMalloc((void**)&rcut_d, sizeof(double) * ntype)); + checkCuda(cudaMemcpy(rcut_d, rcut_h.data(), sizeof(double) * ntype, cudaMemcpyHostToDevice)); + nr_max = static_cast(1 / dr_uniform * max_rcut) + 10; - const int nwmax = ucell.nwmax; + nwmax = ucell.nwmax; std::vector psi_u_h(ntype * nwmax * nr_max); std::vector dpsi_u_h(ntype * nwmax * nr_max); std::vector d2psi_u_h(ntype * nwmax * nr_max); - std::vector atom_iw2_new_h(ntype * nwmax); + // std::vector cannot use data(), so std::vector is used instead + std::vector atom_iw2_new_h(ntype * nwmax); std::vector atom_iw2_ylm_h(ntype * nwmax); - std::vector atom_iw2_l(ntype * nwmax); + std::vector atom_iw2_l_h(ntype * nwmax); for (int i = 0; i < ntype; i++) { - Atom* atomx = &ucell.atom[i]; + Atom* atomx = &ucell.atoms[i]; for (int j = 0; j < atomx->nw; j++) { - atom_iw2_new[i * nwmax + j] = atomx->iw2_new[j]; - atom_iw2_ylm[i * nwmax + j] = atomx->iw2_ylm[j]; - atom_iw2_l[i * nwmax + j] = atomx->iw2_l[j]; + atom_iw2_new_h[i * nwmax + j] = atomx->iw2_new[j]; + atom_iw2_ylm_h[i * nwmax + j] = atomx->iw2_ylm[j]; + atom_iw2_l_h[i * nwmax + j] = atomx->iw2l[j]; const auto psi_ptr = &Phi[i].PhiLN(atomx->iw2l[j], atomx->iw2n[j]); const int psi_size = psi_ptr->psi_uniform.size(); int idx = i * nwmax * nr_max + j * nr_max; @@ -65,7 +70,7 @@ GintGpuVars::GintGpuVars(const std::shared_ptr gint_info, { psi_u_h[idx + k] = psi_ptr->psi_uniform[k]; dpsi_u_h[idx + k] = psi_ptr->dpsi_uniform[k]; - d2psi_u_h[idx + k] = psi_ptr->d2psi_uniform[k]; + d2psi_u_h[idx + k] = psi_ptr->ddpsi_uniform[k]; } } } @@ -83,25 +88,29 @@ GintGpuVars::GintGpuVars(const std::shared_ptr gint_info, checkCuda(cudaMalloc((void**)&d2psi_u_d, sizeof(double) * ntype * nwmax * nr_max)); 
checkCuda(cudaMemcpy(d2psi_u_d, d2psi_u_h.data(), sizeof(double) * ntype * nwmax * nr_max, cudaMemcpyHostToDevice)); - const int mgrid_num = biggrid_info_->get_mgrid_num(); - std::vector mgrid_pos_h(mgrid_num); + const int mgrid_num = biggrid_info->get_mgrids_num(); + std::vector mgrids_pos_h(mgrid_num); for(int i = 0; i < mgrid_num; i++) { - mgrid_pos_h[i].x = biggrid_info_->get_mgrid_coord(i).x; - mgrid_pos_h[i].y = biggrid_info_->get_mgrid_coord(i).y; - mgrid_pos_h[i].z = biggrid_info_->get_mgrid_coord(i).z; + mgrids_pos_h[i].x = biggrid_info->get_mgrid_coord(i).x; + mgrids_pos_h[i].y = biggrid_info->get_mgrid_coord(i).y; + mgrids_pos_h[i].z = biggrid_info->get_mgrid_coord(i).z; } - checkCuda(cudaMalloc((void**)&mgrid_pos_d, sizeof(double3) * mgrid_num)); - checkCuda(cudaMemcpy(mgrid_pos_d, mgrid_pos_h.data(), sizeof(double3) * mgrid_num, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&mgrids_pos_d, sizeof(double3) * mgrid_num)); + checkCuda(cudaMemcpy(mgrids_pos_d, mgrids_pos_h.data(), sizeof(double3) * mgrid_num, cudaMemcpyHostToDevice)); checkCuda(cudaMalloc((void**)&iat2it_d, sizeof(int) * ucell.nat)); checkCuda(cudaMemcpy(iat2it_d, ucell.iat2it, sizeof(int) * ucell.nat, cudaMemcpyHostToDevice)); - gemm_algo_selector(gint_info->get_bgrid_info()->get_mgrids_num(), fastest_matrix_mul, *ucell); + gemm_algo_selector(mgrid_num, fastest_matrix_mul, ucell); } GintGpuVars::~GintGpuVars() { +#ifdef __MPI + checkCuda(cudaSetDevice(dev_id_)); +#endif + checkCuda(cudaFree(rcut_d)); checkCuda(cudaFree(atom_nw_d)); checkCuda(cudaFree(ucell_atom_nwl_d)); checkCuda(cudaFree(atom_iw2_new_d)); @@ -110,7 +119,7 @@ GintGpuVars::~GintGpuVars() checkCuda(cudaFree(psi_u_d)); checkCuda(cudaFree(dpsi_u_d)); checkCuda(cudaFree(d2psi_u_d)); - checkCuda(cudaFree(mgrid_pos_d)); + checkCuda(cudaFree(mgrids_pos_d)); checkCuda(cudaFree(iat2it_d)); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h 
b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h index cffa5e36b9..68b591822e 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h @@ -5,9 +5,9 @@ #include "module_base/ylm.h" #include "module_cell/unitcell.h" #include "module_cell/atom_spec.h" -#include "module_gint/kernels/cuda/gemm_selector.cuh" -#include "module_gint/temp_gint/gint_info.h" -#include "module_gint/kernels/cuda/cuda_tools.cuh" +#include "module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h" +#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" +#include "module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cuh" namespace ModuleGint { @@ -15,13 +15,17 @@ namespace ModuleGint class GintGpuVars { public: - GintGpuVars(const std::shared_ptr gint_info, + GintGpuVars(std::shared_ptr bgrid_info, const UnitCell& ucell, - Numerical_Orbital* Phi); + const Numerical_Orbital* Phi); ~GintGpuVars(); - - // ylmcoef_d is __constant__ memory + + int nwmax; + double dr_uniform; + double nr_max; + // ylmcoef_d is __constant__ memory, no need to cudaFree double* ylmcoef_d = nullptr; + double* rcut_d = nullptr; int* atom_nw_d = nullptr; int* ucell_atom_nwl_d = nullptr; bool* atom_iw2_new_d = nullptr; @@ -30,11 +34,13 @@ class GintGpuVars double* psi_u_d = nullptr; double* dpsi_u_d = nullptr; double* d2psi_u_d = nullptr; - double3* mgrid_pos_d = nullptr; + double3* mgrids_pos_d = nullptr; int* iat2it_d = nullptr; - int dev_id = 0; + + // the index of gpu device + int dev_id_ = 0; matrix_multiple_func_type fastest_matrix_mul; -} +}; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cpp deleted file mode 100644 index d9fbb1f0fb..0000000000 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cpp +++ 
/dev/null @@ -1,19 +0,0 @@ -#include "phi_operator_gpu.h" - -namespace ModuleGint -{ - -PhiOperatorGpu::PhiOperatorGpu(cudaStream_t stream) -: stream_(stream) -{ - atoms_num_info_h_.reserve(BatchBigGrid::get_max_batch_size()); - atoms_num_info_h_.reserve(BatchBigGrid::get_max_batch_size()); - atoms_iat_h_.reserve(BatchBigGrid::get_max_atoms_num()); - atoms_iat_d_.reserve(BatchBigGrid::get_max_atoms_num()); - atoms_bgrids_rcoords_h_.reserve(BatchBigGrid::get_max_atoms_num()); - atoms_bgrids_rcoords_d_.reserve(BatchBigGrid::get_max_atoms_num()); -} - - - -} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu new file mode 100644 index 0000000000..3c9b6e723f --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu @@ -0,0 +1,231 @@ +#include "phi_operator_gpu.h" +#include "phi_operator_kernel.cuh" +#include + +namespace ModuleGint +{ +PhiOperatorGpu::PhiOperatorGpu(std::shared_ptr gint_gpu_vars, cudaStream_t stream) +:gint_gpu_vars_(gint_gpu_vars), stream_(stream), +mgrids_num_(BatchBigGrid::get_bgrid_info()->get_mgrids_num()), +atoms_num_info_(BatchBigGrid::get_max_batch_size(), stream_, true), +bgrids_phi_len_(BatchBigGrid::get_max_batch_size(), stream_, true), +bgrids_phi_start_(BatchBigGrid::get_max_batch_size(), stream_, true), +atoms_iat_(BatchBigGrid::get_max_atoms_num(), stream_, true), +atoms_bgrids_rcoords_(BatchBigGrid::get_max_atoms_num(), stream_, true), +atoms_phi_start_(BatchBigGrid::get_max_atoms_num(), stream_, true), +mgrids_local_idx_batch_(BatchBigGrid::get_max_batch_size() + * BatchBigGrid::get_bgrid_info()->get_mgrids_num(), stream_, true), +gemm_m_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_n_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_k_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), 
+gemm_lda_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_ldb_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_ldc_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_A_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_B_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_C_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_alpha_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true) +{ + checkCuda(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); +} + +PhiOperatorGpu::~PhiOperatorGpu() +{ + checkCuda(cudaEventDestroy(event_)); +} + +void PhiOperatorGpu::set_bgrid_batch(std::shared_ptr bgrid_batch) +{ + bgrid_batch_ = bgrid_batch; + // The set_size here is to determine how many bytes to transfer in the subsequent copy_host_to_device + atoms_num_info_.set_size(bgrid_batch->get_batch_size()); + bgrids_phi_len_.set_size(bgrid_batch->get_batch_size()); + bgrids_phi_start_.set_size(bgrid_batch->get_batch_size()); + atoms_iat_.set_size(bgrid_batch->get_atoms_num()); + atoms_bgrids_rcoords_.set_size(bgrid_batch->get_atoms_num()); + atoms_phi_start_.set_size(bgrid_batch->get_atoms_num()); + mgrids_local_idx_batch_.set_size(bgrid_batch->get_batch_size() * mgrids_num_); + auto atoms_num_info_h = atoms_num_info_.get_host_ptr(); + auto bgrids_phi_len_h = bgrids_phi_len_.get_host_ptr(); + auto bgrids_phi_start_h = bgrids_phi_start_.get_host_ptr(); + auto atoms_iat_h = atoms_iat_.get_host_ptr(); + auto atoms_bgrids_rcoords_h = atoms_bgrids_rcoords_.get_host_ptr(); + auto atoms_phi_start_h = atoms_phi_start_.get_host_ptr(); + auto mgrids_local_idx_batch_h = mgrids_local_idx_batch_.get_host_ptr(); + int i = 0; + int j = 0; + int atoms_accum = 0; + phi_len_ = 0; + int phi_start = 0; + std::vector mgrids_local_idx; + checkCuda(cudaEventSynchronize(event_)); + for (const auto& bgrid : bgrid_batch->get_bgrids()) + { + atoms_num_info_h[i] = make_int2(bgrid->get_atoms_num(), atoms_accum); + 
atoms_accum += bgrid->get_atoms_num(); + bgrids_phi_start_h[i] = phi_start; + bgrid->set_mgrids_local_idx(mgrids_local_idx); + std::copy(mgrids_local_idx.begin(), mgrids_local_idx.end(), + mgrids_local_idx_batch_h + i * mgrids_num_); + int phi_len_bgrid = 0; + for (const auto& atom : bgrid->get_atoms()) + { + atoms_iat_h[j] = atom->get_iat(); + Vec3d rcoord = bgrid->get_bgrid_atom_rcoord(atom); + atoms_bgrids_rcoords_h[j] = make_double3(rcoord.x, rcoord.y, rcoord.z); + atoms_phi_start_h[j] = phi_len_ + phi_len_bgrid; + phi_len_bgrid += atom->get_nw(); + j++; + } + bgrids_phi_len_h[i] = phi_len_bgrid; + phi_len_ += phi_len_bgrid * bgrid->get_mgrids_num(); + phi_start += phi_len_bgrid * bgrid->get_mgrids_num(); + i++; + } + + atoms_num_info_.copy_host_to_device_async(); + bgrids_phi_len_.copy_host_to_device_async(); + bgrids_phi_start_.copy_host_to_device_async(); + atoms_iat_.copy_host_to_device_async(); + atoms_bgrids_rcoords_.copy_host_to_device_async(); + atoms_phi_start_.copy_host_to_device_async(); + mgrids_local_idx_batch_.copy_host_to_device_async(); + checkCuda(cudaEventRecord(event_, stream_)); +} + +void PhiOperatorGpu::set_phi(double* phi_d) +{ + checkCuda(cudaMemsetAsync(phi_d, 0, phi_len_ * sizeof(double), stream_)); + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + set_phi_kernel<<>>( + gint_gpu_vars_->nwmax, + mgrids_num_, + gint_gpu_vars_->nr_max, + gint_gpu_vars_->dr_uniform, + gint_gpu_vars_->ylmcoef_d, + gint_gpu_vars_->ucell_atom_nwl_d, + gint_gpu_vars_->atom_iw2_new_d, + gint_gpu_vars_->atom_iw2_ylm_d, + gint_gpu_vars_->atom_nw_d, + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->rcut_d, + gint_gpu_vars_->psi_u_d, + gint_gpu_vars_->dpsi_u_d, + gint_gpu_vars_->mgrids_pos_d, + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + phi_d); +} + +void PhiOperatorGpu::phi_mul_vldr3( + 
const double* vl_d, + const double dr3, + const double* phi_d, + double* result_d) +{ + checkCuda(cudaMemsetAsync(result_d, 0, phi_len_ * sizeof(double), stream_)); + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + phi_mul_vldr3_kernel<<>>( + vl_d, + dr3, + phi_d, + mgrids_num_, + mgrids_local_idx_batch_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + bgrids_phi_start_.get_device_ptr(), + result_d); +} + +void PhiOperatorGpu::phi_mul_phi_vldr3( + const double* phi_d, + const double* phi_vldr3_d, + std::shared_ptr> hRGint, + double* hr_d) +{ + // ap_num means number of atom pairs + int ap_num = 0; + int max_m = 0; + int max_n = 0; + checkCuda(cudaEventSynchronize(event_)); + for (int i = 0; i < bgrid_batch_->get_batch_size(); i++) + { + auto bgrid = bgrid_batch_->get_bgrids()[i]; + // the length of phi on a mesh grid + const int phi_len_mgrid = bgrid->get_phi_len(); + const int pre_atoms = atoms_num_info_.get_host_ptr()[i].y; + for (int j = 0; j < bgrid->get_atoms_num(); j++) + { + auto atom_1 = bgrid->get_atoms()[j]; + const int iat_1 = atom_1->get_iat(); + const auto& r_1 = atom_1->get_R(); + const int m = atom_1->get_nw(); + const int phi_1_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + j]; + + for (int k = 0; k < bgrid->get_atoms_num(); k++) + { + auto atom_2 = bgrid->get_atoms()[k]; + const int iat_2 = atom_2->get_iat(); + const auto& r_2 = atom_2->get_R(); + const int n = atom_2->get_nw(); + + if(iat_1 > iat_2) + { continue; } + + int hr_offset = hRGint->find_matrix_offset(iat_1, iat_2, r_1 - r_2); + if (hr_offset == -1) + { continue; } + + const int phi_2_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + k]; + + + gemm_A_.get_host_ptr()[ap_num] = phi_d + phi_1_offset; + gemm_B_.get_host_ptr()[ap_num] = phi_vldr3_d + phi_2_offset; + gemm_C_.get_host_ptr()[ap_num] = hr_d + hr_offset; + gemm_lda_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_ldb_.get_host_ptr()[ap_num] = phi_len_mgrid; + 
gemm_ldc_.get_host_ptr()[ap_num] = n; + gemm_m_.get_host_ptr()[ap_num] = m; + gemm_n_.get_host_ptr()[ap_num] = n; + gemm_k_.get_host_ptr()[ap_num] = bgrid->get_mgrids_num(); + ap_num++; + + max_m = std::max(max_m, m); + max_n = std::max(max_n, n); + } + } + } + + gemm_A_.copy_host_to_device_async(ap_num); + gemm_B_.copy_host_to_device_async(ap_num); + gemm_C_.copy_host_to_device_async(ap_num); + gemm_lda_.copy_host_to_device_async(ap_num); + gemm_ldb_.copy_host_to_device_async(ap_num); + gemm_ldc_.copy_host_to_device_async(ap_num); + gemm_m_.copy_host_to_device_async(ap_num); + gemm_n_.copy_host_to_device_async(ap_num); + gemm_k_.copy_host_to_device_async(ap_num); + checkCuda(cudaEventRecord(event_, stream_)); + + gint_gpu_vars_->fastest_matrix_mul(max_m, + max_n, + gemm_m_.get_device_ptr(), + gemm_n_.get_device_ptr(), + gemm_k_.get_device_ptr(), + gemm_A_.get_device_ptr(), + gemm_lda_.get_device_ptr(), + gemm_B_.get_device_ptr(), + gemm_ldb_.get_device_ptr(), + gemm_C_.get_device_ptr(), + gemm_ldc_.get_device_ptr(), + ap_num, + stream_, + nullptr); + checkCudaLastError(); +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h index e3cecc0c0f..4469457741 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h @@ -2,11 +2,10 @@ #include #include -#include "temp_gint/batch_biggrid.h" -#include "module_gint/kernels/cuda/cuda_tools.cuh" +#include "module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h" +#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" #include "gint_gpu_vars.h" -#include -#include +#include "cuda_mem_wrapper.h" namespace ModuleGint { @@ -15,34 +14,67 @@ class PhiOperatorGpu { public: - PhiOperatorGpu(cudaStream_t stream); + PhiOperatorGpu(std::shared_ptr gint_gpu_vars, 
cudaStream_t stream = 0); + ~PhiOperatorGpu(); void set_bgrid_batch(std::shared_ptr bgrid_batch); - static void set_gint_gpu_vars(std::shared_ptr gint_gpu_vars) - { - gint_gpu_vars_ = gint_gpu_vars; - }; - void + void set_phi(double* phi_d); + + void phi_mul_vldr3( + const double* vl_d, + const double dr3, + const double* phi_d, + double* result_d); + + void phi_mul_phi_vldr3( + const double* phi_d, + const double* phi_vldr3_d, + std::shared_ptr> hRGint, + double* hr_d); + private: std::shared_ptr bgrid_batch_; - static std::shared_ptr gint_gpu_vars_; - cudaStream_t stream_ = nullptr; + std::shared_ptr gint_gpu_vars_; + + // the number of meshgrids on a biggrid + int mgrids_num_; + + int phi_len_; + + cudaStream_t stream_ = 0; + cudaEvent_t event_; // The first number in every group of two represents the number of atoms on that bigcell. // The second number represents the cumulative number of atoms up to that bigcell. - thrust::host_vector atoms_num_info_h_; - thrust::device_vector atoms_num_info_d_; + CudaMemWrapper atoms_num_info_; // the iat of each atom - thrust::host_vector atoms_iat_h_; - thrust::device_vector atoms_iat_d_; + CudaMemWrapper atoms_iat_; // atoms_bgrids_rcoords_ here represents the relative coordinates from the big grid to the atoms - thrust::host_vector atoms_bgrids_rcoords_h_; - thrust::device_vector atoms_bgrids_rcoords_d_; - + CudaMemWrapper atoms_bgrids_rcoords_; + + // the start index of the phi array for each atom + CudaMemWrapper atoms_phi_start_; + // The length of phi for a single meshgrid on each big grid. + CudaMemWrapper bgrids_phi_len_; + // The start index of the phi array for each big grid. 
+ CudaMemWrapper bgrids_phi_start_; + // Mapping of the index of meshgrid in the batch of biggrids to the index of meshgrid in the local cell + CudaMemWrapper mgrids_local_idx_batch_; + + CudaMemWrapper gemm_m_; + CudaMemWrapper gemm_n_; + CudaMemWrapper gemm_k_; + CudaMemWrapper gemm_lda_; + CudaMemWrapper gemm_ldb_; + CudaMemWrapper gemm_ldc_; + CudaMemWrapper gemm_A_; + CudaMemWrapper gemm_B_; + CudaMemWrapper gemm_C_; + CudaMemWrapper gemm_alpha_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu new file mode 100644 index 0000000000..30cdab0bef --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu @@ -0,0 +1,105 @@ +#include "phi_operator_kernel.cuh" +#include "sph.cuh" + +namespace ModuleGint +{ + +__global__ void set_phi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + + for (int atom_id = threadIdx.x; atom_id < atoms_num; atom_id += blockDim.x) + { + const int atom_type = 
iat2it[atoms_iat[atom_id + pre_atoms_num]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; + const double3 coord = make_double3(mgrid_pos.x-rcoord.x, + mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z); + double dist = sqrt(coord.x * coord.x + coord.y * coord.y + coord.z * coord.z); + if (dist < rcut[atom_type]) + { + if (dist < 1.0E-9) + { dist += 1.0E-9; } + double dr[3] = { coord.x / dist, coord.y / dist, coord.z / dist }; + // since nwl is less or equal than 5, the size of ylma is (5+1)^2 + double ylma[36]; + const int nwl = ucell_atom_nwl[atom_type]; + sph_harm(nwl, ylmcoef, dr, ylma); + + const double pos = dist / dr_uniform; + const int ip = static_cast(pos); + const double dx = pos - ip; + const double dx2 = dx * dx; + const double dx3 = dx2 * dx; + + const double c3 = 3.0 * dx2 - 2.0 * dx3; + const double c1 = 1.0 - c3; + const double c2 = (dx - 2.0 * dx2 + dx3) * dr_uniform; + const double c4 = (dx3 - dx2) * dr_uniform; + + double psi = 0; + const int it_nw = atom_type * nwmax; + int iw_nr = it_nw * nrmax + ip; + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + + for (int iw = 0; iw < atom_nw[atom_type]; iw++) + { + if (atom_iw2_new[it_nw + iw]) + { + psi = c1 * psi_u[iw_nr] + c2 * dpsi_u[iw_nr] + + c3 * psi_u[iw_nr + 1] + c4 * dpsi_u[iw_nr + 1]; + } + phi[phi_idx + iw] = psi * ylma[atom_iw2_ylm[it_nw + iw]]; + iw_nr += nrmax; + } + } + } +} + +__global__ void phi_mul_vldr3_kernel( + const double* __restrict__ vl, + const double dr3, + const double* __restrict__ phi, + const int mgrids_per_bgrid, + const int* __restrict__ mgrids_local_idx, + const int* __restrict__ bgrids_phi_len, + const int* __restrict__ bgrids_phi_start, + double* __restrict__ result) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int phi_len = bgrids_phi_len[bgrid_id]; + const int phi_start = bgrids_phi_start[bgrid_id] + mgrid_id * phi_len; + const int mgrid_id_in_batch = bgrid_id * 
mgrids_per_bgrid + mgrid_id; + const double vldr3 = vl[mgrids_local_idx[mgrid_id_in_batch]] * dr3; + for(int i = threadIdx.x; i < phi_len; i += blockDim.x) + { + result[phi_start + i] = phi[phi_start + i] * vldr3; + } +} +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh new file mode 100644 index 0000000000..1cc049f308 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh @@ -0,0 +1,40 @@ +#pragma once + +#include + +namespace ModuleGint +{ + +__global__ void set_phi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi); + +__global__ void phi_mul_vldr3_kernel( + const double* __restrict__ vl, + const double dr3, + const double* __restrict__ phi, + const int mgrids_per_bgrid, + const int* __restrict__ mgrids_local_idx, + const int* __restrict__ bgrids_phi_len, + const int* __restrict__ bgrids_phi_start, + double* __restrict__ result); + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu index b3b807a0b9..314020fd45 100644 --- 
a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu @@ -1,5 +1,7 @@ #include "set_const_mem.cuh" -#include "module_gint/kernels/cuda/cuda_tools.cuh" +#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" + +__constant__ double ylmcoe_d[100]; namespace ModuleGint { diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh index c063d622dd..715fa98cde 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh @@ -1,8 +1,6 @@ #pragma once #include -__constant__ double ylmcoe_d[100]; - namespace ModuleGint { __host__ void set_ylmcoe_d(const double* ylmcoe_h, double** ylmcoe_d_addr); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh new file mode 100644 index 0000000000..94d4fcdaa4 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh @@ -0,0 +1,392 @@ +#pragma once + +namespace ModuleGint +{ + +static __device__ void sph_harm( + const int nwl, + const double* __restrict__ ylmcoef, + const double* __restrict__ dr, + double* __restrict__ ylma +) +{ + /*************************** + L = 0 + ***************************/ + ylma[0] = ylmcoef[0]; // l=0, m=0 + double tmp0; + if (nwl == 0) + return; + + /*************************** + L = 1 + ***************************/ + ylma[1] = ylmcoef[1] * dr[2]; // l=1, m=0 + ylma[2] = -ylmcoef[1] * dr[0]; // l=1, m=1 + ylma[3] = -ylmcoef[1] * dr[1]; // l=1, m=-1 + if (nwl == 1) + return; + + /*************************** + L = 2 + ***************************/ + tmp0=ylmcoef[3] * ylma[0]; + ylma[4] = ylmcoef[2] * dr[2] * ylma[1] - tmp0 ; // l=2, m=0 + tmp0 = ylmcoef[4] * dr[2]; + ylma[5] = tmp0 * 
ylma[2]; // l=2,m=1 + ylma[6] = tmp0 * ylma[3]; // l=2,m=-1 + + tmp0 = ylmcoef[4] * dr[0]; + ylma[7] = ylmcoef[5] * ylma[4] - ylmcoef[6] * ylma[0] + - tmp0 * ylma[2]; // l=2,m=2 + ylma[8] = -tmp0 * ylma[3]; + if (nwl == 2) + return; + + /*************************** + L = 3 + ***************************/ + tmp0=ylmcoef[8] * ylma[1]; + ylma[9] = ylmcoef[7] * dr[2] * ylma[4] - tmp0; // l=3, m=0 + + tmp0 = ylmcoef[9] * dr[2]; + ylma[10] = tmp0 * ylma[5] - ylmcoef[10] * ylma[2]; // l=3,m=1 + ylma[11] = tmp0 * ylma[6] - ylmcoef[10] * ylma[3]; // l=3,m=-1 + + tmp0 = ylmcoef[11] * dr[2]; + ylma[12] = tmp0 * ylma[7]; // l=3,m=2 + ylma[13] = tmp0 * ylma[8]; // l=3,m=-2 + + tmp0 = ylmcoef[14] * dr[0]; + ylma[14] = ylmcoef[12] * ylma[10] - ylmcoef[13] * ylma[2] + - tmp0 * ylma[7]; // l=3,m=3 + ylma[15] = ylmcoef[12] * ylma[11] - ylmcoef[13] * ylma[3] + - tmp0 * ylma[8]; // l=3,m=-3 + if (nwl == 3) + return; + + /*************************** + L = 4 + ***************************/ + tmp0=ylmcoef[16] * ylma[4]; + ylma[16] = ylmcoef[15] * dr[2] * ylma[9] - tmp0; // l=4,m=0 + + tmp0 = ylmcoef[17] * dr[2]; + ylma[17] = tmp0 * ylma[10] - ylmcoef[18] * ylma[5]; // l=4,m=1 + ylma[18] = tmp0 * ylma[11] - ylmcoef[18] * ylma[6]; // l=4,m=-1 + + tmp0 = ylmcoef[19] * dr[2]; + ylma[19] = tmp0 * ylma[12] - ylmcoef[20] * ylma[7]; // l=4,m=2 + ylma[20] = tmp0 * ylma[13] - ylmcoef[20] * ylma[8]; // l=4,m=-2 + + tmp0 = 3.0 * dr[2]; + ylma[21] = tmp0 * ylma[14]; // l=4,m=3 + ylma[22] = tmp0 * ylma[15]; // l=4,m=-3 + + tmp0 = ylmcoef[23] * dr[0]; + ylma[23] = ylmcoef[21] * ylma[19] - ylmcoef[22] * ylma[7] + - tmp0 * ylma[14]; // l=4,m=4 + ylma[24] = ylmcoef[21] * ylma[20] - ylmcoef[22] * ylma[8] + - tmp0 * ylma[15]; // l=4,m=-4 + if (nwl == 4) + return; + + /*************************** + L = 5 + ***************************/ + tmp0=ylmcoef[25] * ylma[9]; + ylma[25] + = ylmcoef[24] * dr[2] * ylma[16] - tmp0; // l=5,m=0 + + tmp0 = ylmcoef[26] * dr[2]; + ylma[26] = tmp0 * ylma[17] - ylmcoef[27] * 
ylma[10]; // l=5,m=1 + ylma[27] = tmp0 * ylma[18] - ylmcoef[27] * ylma[11]; // l=5,m=-1 + + tmp0 = ylmcoef[28] * dr[2]; + ylma[28] = tmp0 * ylma[19] - ylmcoef[29] * ylma[12]; // l=5,m=2 + ylma[29] = tmp0 * ylma[20] - ylmcoef[29] * ylma[13]; // l=5,m=-2 + + tmp0 = ylmcoef[30] * dr[2]; + ylma[30] = tmp0 * ylma[21] - ylmcoef[31] * ylma[14]; // l=5,m=3 + ylma[31] = tmp0 * ylma[22] - ylmcoef[31] * ylma[15]; // l=5,m=-3 + + tmp0 = ylmcoef[32] * dr[2]; + ylma[32] = tmp0 * ylma[23]; // l=5,m=4 + ylma[33] = tmp0 * ylma[24]; // l=5,m=-4 + + tmp0 = ylmcoef[35] * dr[0]; + ylma[34] = ylmcoef[33] * ylma[30] - ylmcoef[34] * ylma[14] + - tmp0 * ylma[23]; // l=5,m=5 + ylma[35] = ylmcoef[33] * ylma[31] - ylmcoef[34] * ylma[15] + - tmp0 * ylma[24]; // l=5,m=-5 + if (nwl == 5) + return; + /* + // if nwl > 5 + for (int il = 6; il <= nwl; il++) + { + int istart = il * il; + int istart1 = (il - 1) * (il - 1); + int istart2 = (il - 2) * (il - 2); + + double fac2 = sqrt(4.0 * istart - 1.0); + double fac4 = sqrt(4.0 * istart1 - 1.0); + + for (int im = 0; im < 2 * il - 1; im++) + { + int imm = (im + 1) / 2; + ylma[istart + im] = fac2 / sqrt((double)istart - imm * imm) * (dr[2] + * ylma[istart1 + im] - sqrt((double)istart1 - imm * imm) / fac4 * + ylma[istart2 + im]); + } + + double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0)); + double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0)); + double bl3 = sqrt(2.0) / fac2; + + ylma[istart + 2 * il - 1] = (bl3 * ylma[istart + 2 * il - 5] - bl2 * + ylma[istart2 + 2 * il - 5] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 3]) / + bl1; ylma[istart + 2 * il] = (bl3 * ylma[istart + 2 * il - 4] - bl2 * + ylma[istart2 + 2 * il - 4] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 2]) / + bl1; + }*/ +} + +static __device__ void grad_rl_sph_harm( + const int nwl, + const double* __restrict__ ylmcoef, + const double* __restrict__ dr, + double* __restrict__ rly, + double* __restrict__ grly +) +{ + double r2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + double tx = dr[0] * 2; + 
double ty = dr[1] * 2; + double tz = dr[2] * 2; + + //begin calculation + /*************************** + L = 0 + ***************************/ + rly[0] = ylmcoef[0]; //l=0, m=0 + grly[0] = grly[1] = grly[2] = 0.0; + if (nwl == 0) return; + + /*************************** + L = 1 + ***************************/ + rly[1] = ylmcoef[1]*dr[2]; //l=1, m=0 + grly[3] = grly[4] = 0.0; + grly[5] = ylmcoef[1]; + + rly[2] = -ylmcoef[1]*dr[0]; //l=1, m=1 + grly[7] = grly[8] = 0.0; + grly[6] = -ylmcoef[1]; + + rly[3] = -ylmcoef[1]*dr[1]; //l=1, m=-1 + grly[9] = grly[11] = 0.0; + grly[10] = -ylmcoef[1]; + + if (nwl == 1) return; + + /*************************** + L = 2 + ***************************/ + rly[4] = ylmcoef[2]*dr[2]*rly[1]-ylmcoef[3]*rly[0]*r2;//l=2, m=0 + grly[12] = ylmcoef[2]*dr[2]*grly[3]-ylmcoef[3]*(grly[0]*r2+rly[0]*tx);//l=2, m=0 + grly[13] = ylmcoef[2]*dr[2]*grly[4]-ylmcoef[3]*(grly[1]*r2+rly[0]*ty);//l=2, m=0 + grly[14] = ylmcoef[2]*(dr[2]*grly[5]+rly[1])-ylmcoef[3]*(grly[2]*r2+rly[0]*tz);//l=2, m=0 + + + double tmp0 = ylmcoef[4]*dr[2]; + rly[5] = tmp0*rly[2];//l=2,m=1 + grly[15] = tmp0*grly[6]; + grly[16] = tmp0*grly[7]; + grly[17] = ylmcoef[4]*(rly[2]+dr[2]*grly[8]); + + rly[6] = tmp0*rly[3];//l=2,m=-1 + grly[18] = tmp0*grly[9]; + grly[19] = tmp0*grly[10]; + grly[20] = ylmcoef[4]*(rly[3]+dr[2]*grly[11]); + + double tmp2 = ylmcoef[4]*dr[0]; + rly[7]= ylmcoef[5]*rly[4]-ylmcoef[6]*rly[0]*r2 - tmp2*rly[2];//l=2,m=2 + grly[21] = ylmcoef[5]*grly[12]-ylmcoef[6]*(rly[0]*tx+grly[0]*r2)-ylmcoef[4]*(dr[0]*grly[6]+rly[2]); + +// std::cout << "\np1 = "<< ylmcoef[5]*grly[12] << " p2 = " << -ylmcoef[6]*rly[0]*tx +// << " p3 = " << -ylmcoef[4]*dr[0]*grly[6] << " p4 = " << -ylmcoef[4]*rly[2] << std::endl; + + grly[22] = ylmcoef[5]*grly[13]-ylmcoef[6]*(rly[0]*ty+grly[1]*r2)-tmp2*grly[7]; + grly[23] = ylmcoef[5]*grly[14]-ylmcoef[6]*(rly[0]*tz+grly[2]*r2)-tmp2*grly[8]; + + rly[8] = -tmp2*rly[3]; + grly[24] = -ylmcoef[4]*(rly[3]+dr[0]*grly[9]); + grly[25] = -tmp2*grly[10]; + 
grly[26] = -tmp2*grly[11]; +// rly[8] = tmp1+tmp2*rly[3];//l=2,m=-2 + if (nwl == 2) return; + + /*************************** + L = 3 + ***************************/ + rly[9] = ylmcoef[7]*dr[2]*rly[4]-ylmcoef[8]*rly[1]*r2; //l=3, m=0 + grly[27] = ylmcoef[7]*dr[2]*grly[12]-ylmcoef[8]*(rly[1]*tx+grly[3]*r2); + grly[28] = ylmcoef[7]*dr[2]*grly[13]-ylmcoef[8]*(rly[1]*ty+grly[4]*r2); + grly[29] = ylmcoef[7]*(rly[4]+dr[2]*grly[14])-ylmcoef[8]*(rly[1]*tz+grly[5]*r2); + + double tmp3 = ylmcoef[9]*dr[2]; + rly[10] = tmp3*rly[5]-ylmcoef[10]*rly[2]*r2;//l=3,m=1 + grly[30] = tmp3*grly[15]-ylmcoef[10]*(grly[6]*r2+rly[2]*tx); + grly[31] = tmp3*grly[16]-ylmcoef[10]*(grly[7]*r2+rly[2]*ty); + grly[32] = ylmcoef[9]*(dr[2]*grly[17]+rly[5])-ylmcoef[10]*(grly[8]*r2+rly[2]*tz); + + rly[11] = tmp3*rly[6]-ylmcoef[10]*rly[3]*r2;//l=3,m=-1 + grly[33] = tmp3*grly[18]-ylmcoef[10]*(grly[9]*r2+rly[3]*tx); + grly[34] = tmp3*grly[19]-ylmcoef[10]*(grly[10]*r2+rly[3]*ty); + grly[35] = ylmcoef[9]*(dr[2]*grly[20]+rly[6])-ylmcoef[10]*(grly[11]*r2+rly[3]*tz); + + double tmp4 = ylmcoef[11]*dr[2]; + rly[12] = tmp4*rly[7];//l=3,m=2 + grly[36] = tmp4*grly[21]; + grly[37] = tmp4*grly[22]; + grly[38] = ylmcoef[11]*(dr[2]*grly[23]+rly[7]); + + rly[13] = tmp4*rly[8];//l=3,m=-2 + grly[39] = tmp4*grly[24]; + grly[40] = tmp4*grly[25]; + grly[41] = ylmcoef[11]*(dr[2]*grly[26]+rly[8]); + + double tmp5 = ylmcoef[14]*dr[0]; + rly[14] = ylmcoef[12]*rly[10]-ylmcoef[13]*rly[2]*r2-tmp5*rly[7];//l=3,m=3 + grly[42] = ylmcoef[12]*grly[30]-ylmcoef[13]*(rly[2]*tx+grly[6]*r2)-ylmcoef[14]*(rly[7]+dr[0]*grly[21]); + grly[43] = ylmcoef[12]*grly[31]-ylmcoef[13]*(rly[2]*ty+grly[7]*r2)-tmp5*grly[22]; + grly[44] = ylmcoef[12]*grly[32]-ylmcoef[13]*(rly[2]*tz+grly[8]*r2)-tmp5*grly[23]; + + rly[15] = ylmcoef[12]*rly[11]-ylmcoef[13]*rly[3]*r2-tmp5*rly[8];//l=3,m=-3 + grly[45] = ylmcoef[12]*grly[33]-ylmcoef[13]*(rly[3]*tx+grly[9]*r2)-ylmcoef[14]*(rly[8]+dr[0]*grly[24]); + grly[46] = 
ylmcoef[12]*grly[34]-ylmcoef[13]*(rly[3]*ty+grly[10]*r2)-tmp5*grly[25]; + grly[47] = ylmcoef[12]*grly[35]-ylmcoef[13]*(rly[3]*tz+grly[11]*r2)-tmp5*grly[26]; + if (nwl == 3) return; + + /*************************** + L = 4 + ***************************/ + rly[16] = ylmcoef[15]*dr[2]*rly[9]-ylmcoef[16]*rly[4]*r2;//l=4,m=0 + grly[48] = ylmcoef[15]*dr[2]*grly[27]-ylmcoef[16]*(rly[4]*tx+grly[12]*r2); + grly[49] = ylmcoef[15]*dr[2]*grly[28]-ylmcoef[16]*(rly[4]*ty+grly[13]*r2); + grly[50] = ylmcoef[15]*(dr[2]*grly[29]+rly[9])-ylmcoef[16]*(rly[4]*tz+grly[14]*r2); + + double tmp6 = ylmcoef[17]*dr[2]; + rly[17] = tmp6*rly[10]-ylmcoef[18]*rly[5]*r2;//l=4,m=1 + grly[51] = tmp6*grly[30]-ylmcoef[18]*(rly[5]*tx+grly[15]*r2); + grly[52] = tmp6*grly[31]-ylmcoef[18]*(rly[5]*ty+grly[16]*r2); + grly[53] = ylmcoef[17]*(dr[2]*grly[32]+rly[10])-ylmcoef[18]*(rly[5]*tz+grly[17]*r2); + + rly[18] = tmp6*rly[11]-ylmcoef[18]*rly[6]*r2;//l=4,m=-1 + grly[54] = tmp6*grly[33]-ylmcoef[18]*(rly[6]*tx+grly[18]*r2); + grly[55] = tmp6*grly[34]-ylmcoef[18]*(rly[6]*ty+grly[19]*r2); + grly[56] = ylmcoef[17]*(dr[2]*grly[35]+rly[11])-ylmcoef[18]*(rly[6]*tz+grly[20]*r2); + + double tmp7 = ylmcoef[19]*dr[2]; + rly[19] = tmp7*rly[12]-ylmcoef[20]*rly[7]*r2;//l=4,m=2 + grly[57] = tmp7*grly[36]-ylmcoef[20]*(rly[7]*tx+grly[21]*r2); + grly[58] = tmp7*grly[37]-ylmcoef[20]*(rly[7]*ty+grly[22]*r2); + grly[59] = ylmcoef[19]*(dr[2]*grly[38]+rly[12])-ylmcoef[20]*(rly[7]*tz+grly[23]*r2); + + rly[20] = tmp7*rly[13]-ylmcoef[20]*rly[8]*r2;//l=4,m=-2 + grly[60] = tmp7*grly[39]-ylmcoef[20]*(rly[8]*tx+grly[24]*r2); + grly[61] = tmp7*grly[40]-ylmcoef[20]*(rly[8]*ty+grly[25]*r2); + grly[62] = ylmcoef[19]*(dr[2]*grly[41]+rly[13])-ylmcoef[20]*(rly[8]*tz+grly[26]*r2); + + double tmp8 = 3.0*dr[2]; + rly[21] = tmp8*rly[14];//l=4,m=3 + grly[63] = tmp8*grly[42]; + grly[64] = tmp8*grly[43]; + grly[65] = 3.0*(dr[2]*grly[44]+rly[14]); + + + rly[22] = tmp8*rly[15];//l=4,m=-3 + grly[66] = tmp8*grly[45]; + grly[67] = tmp8*grly[46]; + grly[68] 
= 3.0*(dr[2]*grly[47]+rly[15]); + + double tmp9 = ylmcoef[23]*dr[0]; + rly[23] = ylmcoef[21]*rly[19]-ylmcoef[22]*rly[7]*r2-tmp9*rly[14];//l=4,m=4 + grly[69] = ylmcoef[21]*grly[57]-ylmcoef[22]*(rly[7]*tx+grly[21]*r2)-ylmcoef[23]*(dr[0]*grly[42]+rly[14]); + grly[70] = ylmcoef[21]*grly[58]-ylmcoef[22]*(rly[7]*ty+grly[22]*r2)-tmp9*grly[43]; + grly[71] = ylmcoef[21]*grly[59]-ylmcoef[22]*(rly[7]*tz+grly[23]*r2)-tmp9*grly[44]; + + rly[24] = ylmcoef[21]*rly[20]-ylmcoef[22]*rly[8]*r2-tmp9*rly[15];//l=4,m=-4 + grly[72] = ylmcoef[21]*grly[60]-ylmcoef[22]*(rly[8]*tx+grly[24]*r2)-ylmcoef[23]*(dr[0]*grly[45]+rly[15]); + grly[73] = ylmcoef[21]*grly[61]-ylmcoef[22]*(rly[8]*ty+grly[25]*r2)-tmp9*grly[46]; + grly[74] = ylmcoef[21]*grly[62]-ylmcoef[22]*(rly[8]*tz+grly[26]*r2)-tmp9*grly[47]; + + if (nwl == 4) return; + + /*************************** + L = 5 + ***************************/ + rly[25] = ylmcoef[24]*dr[2]*rly[16]-ylmcoef[25]*rly[9]*r2;//l=5,m=0 + grly[75] = ylmcoef[24]*dr[2]*grly[48]-ylmcoef[25]*(rly[9]*tx+grly[27]*r2); + grly[76] = ylmcoef[24]*dr[2]*grly[49]-ylmcoef[25]*(rly[9]*ty+grly[28]*r2); + grly[77] = ylmcoef[24]*(dr[2]*grly[50]+rly[16])-ylmcoef[25]*(rly[9]*tz+grly[29]*r2); + + double tmp10 = ylmcoef[26]*dr[2]; + rly[26] = tmp10*rly[17]-ylmcoef[27]*rly[10]*r2;//l=5,m=1 + grly[78] = tmp10*grly[51]-ylmcoef[27]*(rly[10]*tx+grly[30]*r2); + grly[79] = tmp10*grly[52]-ylmcoef[27]*(rly[10]*ty+grly[31]*r2); + grly[80] = ylmcoef[26]*(dr[2]*grly[53]+rly[17])-ylmcoef[27]*(rly[10]*tz+grly[32]*r2); + + rly[27] = tmp10*rly[18]-ylmcoef[27]*rly[11]*r2;//l=5,m=-1 + grly[81] = tmp10*grly[54]-ylmcoef[27]*(rly[11]*tx+grly[33]*r2); + grly[82] = tmp10*grly[55]-ylmcoef[27]*(rly[11]*ty+grly[34]*r2); + grly[83] = ylmcoef[26]*(dr[2]*grly[56]+rly[18])-ylmcoef[27]*(rly[11]*tz+grly[35]*r2); + + double tmp11 = ylmcoef[28]*dr[2]; + rly[28] = tmp11*rly[19]-ylmcoef[29]*rly[12]*r2;//l=5,m=2 + grly[84] = tmp11*grly[57]-ylmcoef[29]*(rly[12]*tx+grly[36]*r2); + grly[85] = 
tmp11*grly[58]-ylmcoef[29]*(rly[12]*ty+grly[37]*r2); + grly[86] = ylmcoef[28]*(dr[2]*grly[59]+rly[19])-ylmcoef[29]*(rly[12]*tz+grly[38]*r2); + + rly[29] = tmp11*rly[20]-ylmcoef[29]*rly[13]*r2;//l=5,m=-2 + grly[87] = tmp11*grly[60]-ylmcoef[29]*(rly[13]*tx+grly[39]*r2); + grly[88] = tmp11*grly[61]-ylmcoef[29]*(rly[13]*ty+grly[40]*r2); + grly[89] = ylmcoef[28]*(dr[2]*grly[62]+rly[20])-ylmcoef[29]*(rly[13]*tz+grly[41]*r2); + + double tmp12 = ylmcoef[30]*dr[2]; + rly[30] = tmp12*rly[21]-ylmcoef[31]*rly[14]*r2;//l=5,m=3 + grly[90] = tmp12*grly[63]-ylmcoef[31]*(grly[42]*r2+rly[14]*tx); + grly[91] = tmp12*grly[64]-ylmcoef[31]*(grly[43]*r2+rly[14]*ty); + grly[92] = ylmcoef[30]*(dr[2]*grly[65]+rly[21])-ylmcoef[31]*(grly[44]*r2+rly[14]*tz); + + rly[31] = tmp12*rly[22]-ylmcoef[31]*rly[15]*r2;//l=5,m=-3 + grly[93] = tmp12*grly[66]-ylmcoef[31]*(grly[45]*r2+rly[15]*tx); + grly[94] = tmp12*grly[67]-ylmcoef[31]*(grly[46]*r2+rly[15]*ty); + grly[95] = ylmcoef[30]*(dr[2]*grly[68]+rly[22])-ylmcoef[31]*(grly[47]*r2+rly[15]*tz); + + double tmp13 = ylmcoef[32]*dr[2]; + rly[32] = tmp13*rly[23];//l=5,m=4 + grly[96] = tmp13*grly[69]; + grly[97] = tmp13*grly[70]; + grly[98] = ylmcoef[32]*(rly[23]+dr[2]*grly[71]); + + rly[33] = tmp13*rly[24];//l=5,m=-4 + grly[99] = tmp13*grly[72]; + grly[100] = tmp13*grly[73]; + grly[101] = ylmcoef[32]*(rly[24]+dr[2]*grly[74]); + + double tmp14 = ylmcoef[35]*dr[0]; + rly[34] = ylmcoef[33]*rly[30]-ylmcoef[34]*rly[14]*r2-tmp14*rly[23];//l=5,m=5 + grly[102] = ylmcoef[33]*grly[90]-ylmcoef[34]*(rly[14]*tx+grly[42]*r2)-ylmcoef[35]*(dr[0]*grly[69]+rly[23]); + grly[103] = ylmcoef[33]*grly[91]-ylmcoef[34]*(rly[14]*ty+grly[43]*r2)-tmp14*grly[70]; + grly[104] = ylmcoef[33]*grly[92]-ylmcoef[34]*(rly[14]*tz+grly[44]*r2)-tmp14*grly[71]; + + rly[35] = ylmcoef[33]*rly[31]-ylmcoef[34]*rly[15]*r2-tmp14*rly[24];//l=5,m=-5 + grly[105] = ylmcoef[33]*grly[93]-ylmcoef[34]*(rly[15]*tx+grly[45]*r2)-ylmcoef[35]*(dr[0]*grly[72]+rly[24]); + grly[106] = 
ylmcoef[33]*grly[94]-ylmcoef[34]*(rly[15]*ty+grly[46]*r2)-tmp14*grly[73]; + grly[107] = ylmcoef[33]*grly[95]-ylmcoef[34]*(rly[15]*tz+grly[47]*r2)-tmp14*grly[74]; + + if (nwl == 5) return; +} +} \ No newline at end of file diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp index e0791577a9..15022436ea 100644 --- a/source/source_esolver/esolver_ks_lcao.cpp +++ b/source/source_esolver/esolver_ks_lcao.cpp @@ -61,6 +61,8 @@ // test RDMFT #include "module_rdmft/rdmft.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_info.h" + #include namespace ModuleESolver @@ -92,6 +94,10 @@ ESolver_KS_LCAO::ESolver_KS_LCAO() template ESolver_KS_LCAO::~ESolver_KS_LCAO() { +#ifdef __NEW_GINT + // release gint_info + ModuleGint::Gint::set_gint_info(nullptr); +#endif } template diff --git a/source/source_esolver/lcao_before_scf.cpp b/source/source_esolver/lcao_before_scf.cpp index f921bab0a4..7a9ddc7a49 100644 --- a/source/source_esolver/lcao_before_scf.cpp +++ b/source/source_esolver/lcao_before_scf.cpp @@ -31,10 +31,6 @@ #ifdef __EXX #include "module_io/restart_exx_csr.h" #endif -#ifdef __CUDA -#include "module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h" -#include "module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h" -#endif namespace ModuleESolver { @@ -115,13 +111,6 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) orb_.Phi, ucell, this->gd); -#ifdef __CUDA - auto gint_gpu_vars = std::make_shared( - gint_info, - ucell, - orb_.Phi); - ModuleGint::PhiOperatorGpu::set_gint_gpu_vars(gint_gpu_vars); -#endif ModuleGint::Gint::set_gint_info(gint_info); #endif From cb05e0360b6ac8162cd5ef5d0144dae5150de2b3 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 8 May 2025 17:19:55 +0800 Subject: [PATCH 04/63] add new gemm function --- .../temp_gint/kernel/dgemm_vbatch.cu | 88 ++++ .../temp_gint/kernel/dgemm_vbatch.h | 23 + .../temp_gint/kernel/gemm_nn_vbatch.cuh | 452 ++++++++++++++++++ 
.../temp_gint/kernel/gemm_tn_vbatch.cuh | 452 ++++++++++++++++++ 4 files changed, 1015 insertions(+) create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu new file mode 100644 index 0000000000..cfca916e6f --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu @@ -0,0 +1,88 @@ +#include "gemm_tn_vbatch.cuh" +#include "gemm_nn_vbatch.cuh" +#include "dgemm_vbatch.h" + +// The template parameter settings for the function are based on the MAGMA source code settings. +// Specifically, they refer to the settings for the "tt" shape in dgemm_vbatched_core. +void dgemm_nn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha) +{ + if (max_k < 128) + { + vbatched_gemm_nn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); + } + else + { + if (max_n < 256) + { + vbatched_gemm_nn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); + } + else + { + vbatched_gemm_nn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); + } + } +} + +// the template parameters refer to the settings for the "nt" shape in dgemm_vbatched_core. 
+void dgemm_tn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha) +{ + if (max_k < 128) + { + vbatched_gemm_tn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); + } + else + { + if (max_n < 256) + { + vbatched_gemm_tn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); + } + else + { + vbatched_gemm_tn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); + } + } +} diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h new file mode 100644 index 0000000000..3cd878b950 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +// C(batch_id) = alpha * A(batch_id) * B(batch_id) + C(batch_id) +void dgemm_nn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha = nullptr); + +// C(batch_id) = alpha * A(batch_id)^T * B(batch_id) + C(batch_id) +void dgemm_tn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha); \ No newline at end of file diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh new file mode 100644 index 0000000000..857d9d9081 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh @@ -0,0 +1,452 @@ +#ifndef GEMM_NN_VBATCH_CUH +#define GEMM_NN_VBATCH_CUH +#include // for assert +#include +#include // for CUDA_VERSION +#include +#include // for fprintf and stderr + +#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" +#include + + +#define sA(i, j) sA[(j)*slda + (i)] +#define sB(i, j) sB[(j)*sldb + (i)] +#define fetch(A, m, n, bound) offs_d##A[min(n * LD##A + m, bound)] + +template +static __device__ void vbatched_gemm_tt_device(int M, + int N, + int K, + const T* __restrict__ A, + int LDA, + const T* __restrict__ B, + int LDB, + T* __restrict__ C, + int LDC, + T* sA, + int slda, + T* sB, + int sldb, + T alpha) +{ + int idx = threadIdx.x; // thread's m dimension + int idy = threadIdx.y; // thread's n dimension + + int idt = DIM_X * idy + idx; // thread's global number + + int idxA = idt % DIM_XA; // idx within A + int idyA = idt / DIM_XA; // idy within A + + int idxB = idt % DIM_XB; // idx within B + int idyB = idt / DIM_XB; // idy within B + + int blx = blockIdx.x; // block's m dimension + int bly = blockIdx.y; // block's n dimension + + // Registers for the innermost loop + T rC[THR_N][THR_M]; + T rA[THR_M]; + T rB[THR_N]; + + // Registers for the dev->shmem copy + T ra[BLK_M / DIM_YA][BLK_K / DIM_XA]; + T rb[BLK_K / DIM_YB][BLK_N / DIM_XB]; + + // bound is the correction to offs_d in order to not get out of memory bound + // so bound could be negative value since offs_d could be out of bound + const T* offs_dA = A + blx * BLK_M * LDA + idyA * LDA + idxA; + int boundA + = (LDA * (M - 1) + K) - (blx * BLK_M * LDA + idyA * LDA + idxA) - 1; + + const T* offs_dB = B + bly * BLK_N + idyB * LDB + idxB; + int boundB + = (LDB * (K - 1) + N) 
- (bly * BLK_N + idyB * LDB + idxB) - 1; + + int m, n, k, kk; + +// Zero C +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] = 0.0; + } + } + +// Load A dev->shmem +#pragma unroll + for (n = 0; n < BLK_M; n += DIM_YA) + { +#pragma unroll + for (m = 0; m < BLK_K; m += DIM_XA) + { + sA(n + idyA, m + idxA) = fetch(A, m, n, boundA); + } + } + +#pragma unroll + for (n = 0; n < BLK_K; n += DIM_YB) + { +#pragma unroll + for (m = 0; m < BLK_N; m += DIM_XB) + { + sB(n + idyB, m + idxB) = fetch(B, m, n, boundB); + } + } + + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; kk += BLK_K) + { + offs_dA += BLK_K; + boundA -= BLK_K; + + offs_dB += BLK_K * LDB; + boundB -= BLK_K * LDB; + +// Load A dev->regs +#pragma unroll + for (n = 0; n < BLK_M / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_K / DIM_XA; m++) + { + ra[n][m] = fetch(A, m * DIM_XA, n * DIM_YA, boundA); + } + } + +// Load B dev->regs +#pragma unroll + for (n = 0; n < BLK_K / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_N / DIM_XB; m++) + { + rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB); + } + } + +// Multiply +#pragma unroll + for (k = 0; k < BLK_K; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + + __syncthreads(); + +// Load A regs->shmem +#pragma unroll + for (n = 0; n < BLK_M / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_K / DIM_XA; m++) + { + sA(n * DIM_YA + idyA, m * DIM_XA + idxA) = ra[n][m]; + } + } + +// Load B regs->shmem +#pragma unroll + for (n = 0; n < BLK_K / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_N / DIM_XB; m++) + { + sB(n * DIM_YB + idyB, m * DIM_XB + 
idxB) = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of + // columns of op(A) and rows of op(B). + // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. + kk = K - kk; +#pragma unroll + for (k = 0; k < kk; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + +// Store C regs->dev +#pragma unroll + for (n = 0; n < THR_N; n++) + { + int coord_dCn = bly * BLK_N + n * DIM_Y + idy; +#pragma unroll + for (m = 0; m < THR_M; m++) + { + int coord_dCm = blx * BLK_M + m * DIM_X + idx; + if (coord_dCm < M && coord_dCn < N) + { + int offsC = coord_dCn * LDC + coord_dCm; + + atomicAdd(C + offsC, rC[n][m] * alpha); + } + } + } +} + +/******************************************************************************/ +template +static __global__ void vbatched_gemm_tt_kernel(const int* M, + const int* N, + const int* K, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + const T* alpha) +{ + extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + T* shared_mem = reinterpret_cast(smem); + + int batchid = blockIdx.z; + int local_M = (int)M[batchid]; + int local_N = (int)N[batchid]; + int local_K = (int)K[batchid]; + + if (blockIdx.x >= (local_M + BLK_M - 1) / BLK_M) + return; + if (blockIdx.y >= (local_N + BLK_N - 1) / BLK_N) + return; + + int shared_lda = BLK_M + 1; + int shared_ldb = BLK_K + 1; + T* shared_A = (T*)shared_mem; + T* shared_B = shared_A + shared_lda * BLK_K; + double alpha_tmp = 1.0; + 
if (alpha != nullptr) + { + alpha_tmp = alpha[batchid]; + } + vbatched_gemm_tt_device(local_M, + local_N, + local_K, + global_A_array[batchid], + (int)global_lda[batchid], + global_B_array[batchid], + (int)global_ldb[batchid], + global_C_array[batchid], + (int)global_ldc[batchid], + shared_A, + shared_lda, + shared_B, + shared_ldb, + alpha_tmp); +} + +/** + * Performs a batched matrix multiplication using the vbatched_gemm_impl + * function. + * + * C = alpha * A * B + C + * @tparam T The data type of the matrices. + * @tparam DIM_X The number of threads in the x-dimension of each block. + * @tparam DIM_Y The number of threads in the y-dimension of each block. + * @tparam BLK_M The number of rows processed by each thread block. + * @tparam BLK_N The number of columns processed by each thread block. + * @tparam BLK_K The number of elements processed by each thread block along the + * K dimension. + * @tparam DIM_XA The number of threads in the x-dimension used for loading + * matrix A. + * @tparam DIM_YA The number of threads in the y-dimension used for loading + * matrix A. + * @tparam DIM_XB The number of threads in the x-dimension used for loading + * matrix B. + * @tparam DIM_YB The number of threads in the y-dimension used for loading + * matrix B. + * @param max_m The maximum number of rows in the matrices. + * @param max_n The maximum number of columns in the matrices. + * @param m An array of batch sizes for the number of rows in each matrix. + * @param n An array of batch sizes for the number of columns in each matrix. + * @param k An array of batch sizes for the number of elements in each matrix + * along the K dimension. + * @param global_A_array An array of pointers to the input matrices A. + * @param global_lda An array of leading dimensions for the input matrices A. + * @param global_B_array An array of pointers to the input matrices B. + * @param global_ldb An array of leading dimensions for the input matrices B. 
+ * @param global_C_array An array of pointers to the output matrices C. + * @param global_ldc An array of leading dimensions for the output matrices C. + * @param batchCount The number of matrices in the batch. + * @param stream The CUDA stream to use for the computation. + * @param alpha An array of per-matrix scalar factors, one per batch entry + * (optional; nullptr means 1.0). Generated by Copilot. + */ + +/* + * Why do we need to implement our own matrix multiplication based on the MAGMA + * code? There are two main reasons. First, when we do batched matrix + * multiplication, since we need to accumulate the results of the + * multiplications, it is necessary to pass the same memory address of matrix C + * to different multiplications. This way, the accumulation can be done directly + * through atomic operations during the matrix multiplication, avoiding the + * reduction operations after the multiplication. Secondly, when calculating the + * charge density, where C = alpha * A * B + C, the value of alpha might be + * different for the same batch of matrices. Using the standard matrix + * multiplication interface would require breaking down the batch matrix + * multiplication into smaller batches. In practice, it is difficult to gather + * enough matrices that share one alpha to form such a batch. + * + * Moreover, taking into account the specific requirements of our application, + * especially the fact that we can relatively easily control the arrangement of + * the matrix elements, we have only implemented one type of requirement for + * matrix transposition. That is, we have implemented the operation C = alpha * + * A * trans(B) + C under the constraint of column-major order. + * + * Finally, we would like to thank MAGMA for its contributions to the field of + * scientific computing.
+ */ + +template +void vbatched_gemm_nn_impl(int max_m, + int max_n, + const int* m, + const int* n, + const int* k, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + int batchCount, + cudaStream_t stream, + const T* alpha = nullptr) +{ + // The positions of A and B have been swapped here. + // This is because vbatch_gemm__tt_kernel is column major, + // but vatched_gemm_nn_impl is designed to be row major, + + size_t shared_mem_size = 0; + shared_mem_size += (BLK_M + 1) * BLK_K * sizeof(T); + shared_mem_size += (BLK_K + 1) * BLK_N * sizeof(T); + dim3 dimBlock(DIM_X, DIM_Y); + const int max_batch_count = 32768; + + for (int i = 0; i < batchCount; i += max_batch_count) + { + const int ibatch = min(max_batch_count, batchCount - i); + dim3 dimGrid(ceildiv(max_n, BLK_M), + ceildiv(max_m, BLK_N), + ibatch); + const T* alpha_tmp = nullptr; + if (alpha != nullptr) + { + alpha_tmp = alpha + i; + } + + vbatched_gemm_tt_kernel + <<>>( + n + i, m + i, k + i, + global_B_array + i, global_ldb + i, + global_A_array + i, global_lda + i, + global_C_array + i, global_ldc + i, + alpha_tmp); + checkCudaLastError(); + } +} + +#endif // GEMM_VBATCH_CUH \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh new file mode 100644 index 0000000000..a91c406326 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh @@ -0,0 +1,452 @@ +#ifndef GEMM_TN_VBATCH_CUH +#define GEMM_TN_VBATCH_CUH +#include // for assert +#include +#include // for CUDA_VERSION +#include +#include // for fprintf and stderr + +#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" +#include + + +#define sA(i, j) sA[(j)*slda + (i)] +#define sB(i, j) sB[(j)*sldb + (i)] +#define fetch(A, m, n, bound) offs_d##A[min(n * 
LD##A + m, bound)] + +template +static __device__ void vbatched_gemm_nt_device(int M, + int N, + int K, + const T* __restrict__ A, + int LDA, + const T* __restrict__ B, + int LDB, + T* __restrict__ C, + int LDC, + T* sA, + int slda, + T* sB, + int sldb, + T alpha) +{ + int idx = threadIdx.x; // thread's m dimension + int idy = threadIdx.y; // thread's n dimension + + int idt = DIM_X * idy + idx; // thread's global number + + int idxA = idt % DIM_XA; // idx within A + int idyA = idt / DIM_XA; // idy within A + + int idxB = idt % DIM_XB; // idx within B + int idyB = idt / DIM_XB; // idy within B + + int blx = blockIdx.x; // block's m dimension + int bly = blockIdx.y; // block's n dimension + + // Registers for the innermost loop + T rC[THR_N][THR_M]; + T rA[THR_M]; + T rB[THR_N]; + + // Registers for the dev->shmem copy + T ra[BLK_K / DIM_YA][BLK_M / DIM_XA]; + T rb[BLK_K / DIM_YB][BLK_N / DIM_XB]; + + // bound is the correction to offs_d in order to not get out of memory bound + // so bound could be negative value since offs_d could be out of bound + const T* offs_dA = A + blx * BLK_M + idyA * LDA + idxA; + int boundA + = (LDA * (K - 1) + M) - (blx * BLK_M + idyA * LDA + idxA) - 1; + + const T* offs_dB = B + bly * BLK_N + idyB * LDB + idxB; + int boundB + = (LDB * (K - 1) + N) - (bly * BLK_N + idyB * LDB + idxB) - 1; + + int m, n, k, kk; + +// Zero C +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] = 0.0; + } + } + +// Load A dev->shmem +#pragma unroll + for (n = 0; n < BLK_K; n += DIM_YA) + { +#pragma unroll + for (m = 0; m < BLK_M; m += DIM_XA) + { + sA(m + idxA, n + idyA) = fetch(A, m, n, boundA); + } + } + +#pragma unroll + for (n = 0; n < BLK_K; n += DIM_YB) + { +#pragma unroll + for (m = 0; m < BLK_N; m += DIM_XB) + { + sB(n + idyB, m + idxB) = fetch(B, m, n, boundB); + } + } + + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; kk += BLK_K) + { + offs_dA += BLK_K * LDA; + boundA -= BLK_K * LDA; + + 
offs_dB += BLK_K * LDB; + boundB -= BLK_K * LDB; + +// Load A dev->regs +#pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + ra[n][m] = fetch(A, m * DIM_XA, n * DIM_YA, boundA); + } + } + +// Load B dev->regs +#pragma unroll + for (n = 0; n < BLK_K / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_N / DIM_XB; m++) + { + rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB); + } + } + +// Multiply +#pragma unroll + for (k = 0; k < BLK_K; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + + __syncthreads(); + +// Load A regs->shmem +#pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + sA(m * DIM_XA + idxA, n * DIM_YA + idyA) = ra[n][m]; + } + } + +// Load B regs->shmem +#pragma unroll + for (n = 0; n < BLK_K / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_N / DIM_XB; m++) + { + sB(n * DIM_YB + idyB, m * DIM_XB + idxB) = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of + // columns of op(A) and rows of op(B). + // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. 
+ kk = K - kk; +#pragma unroll + for (k = 0; k < kk; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + +// Store C regs->dev +#pragma unroll + for (n = 0; n < THR_N; n++) + { + int coord_dCn = bly * BLK_N + n * DIM_Y + idy; +#pragma unroll + for (m = 0; m < THR_M; m++) + { + int coord_dCm = blx * BLK_M + m * DIM_X + idx; + if (coord_dCm < M && coord_dCn < N) + { + int offsC = coord_dCn * LDC + coord_dCm; + + atomicAdd(C + offsC, rC[n][m] * alpha); + } + } + } +} + +/******************************************************************************/ +template +static __global__ void vbatched_gemm_nt_kernel(const int* M, + const int* N, + const int* K, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + const T* alpha) +{ + extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + T* shared_mem = reinterpret_cast(smem); + + int batchid = blockIdx.z; + int local_M = (int)M[batchid]; + int local_N = (int)N[batchid]; + int local_K = (int)K[batchid]; + + if (blockIdx.x >= (local_M + BLK_M - 1) / BLK_M) + return; + if (blockIdx.y >= (local_N + BLK_N - 1) / BLK_N) + return; + + int shared_lda = BLK_M + 1; + int shared_ldb = BLK_K + 1; + T* shared_A = (T*)shared_mem; + T* shared_B = shared_A + shared_lda * BLK_K; + double alpha_tmp = 1.0; + if (alpha != nullptr) + { + alpha_tmp = alpha[batchid]; + } + vbatched_gemm_nt_device(local_M, + local_N, + local_K, + global_A_array[batchid], + (int)global_lda[batchid], + global_B_array[batchid], + (int)global_ldb[batchid], + global_C_array[batchid], + (int)global_ldc[batchid], + 
shared_A, + shared_lda, + shared_B, + shared_ldb, + alpha_tmp); +} + +/** + * Performs a batched matrix multiplication using the vbatched_gemm_impl + * function. + * + * C = alpha * A * B + C + * @tparam T The data type of the matrices. + * @tparam DIM_X The number of threads in the x-dimension of each block. + * @tparam DIM_Y The number of threads in the y-dimension of each block. + * @tparam BLK_M The number of rows processed by each thread block. + * @tparam BLK_N The number of columns processed by each thread block. + * @tparam BLK_K The number of elements processed by each thread block along the + * K dimension. + * @tparam DIM_XA The number of threads in the x-dimension used for loading + * matrix A. + * @tparam DIM_YA The number of threads in the y-dimension used for loading + * matrix A. + * @tparam DIM_XB The number of threads in the x-dimension used for loading + * matrix B. + * @tparam DIM_YB The number of threads in the y-dimension used for loading + * matrix B. + * @param max_m The maximum number of rows in the matrices. + * @param max_n The maximum number of columns in the matrices. + * @param m An array of batch sizes for the number of rows in each matrix. + * @param n An array of batch sizes for the number of columns in each matrix. + * @param k An array of batch sizes for the number of elements in each matrix + * along the K dimension. + * @param global_A_array An array of pointers to the input matrices A. + * @param global_lda An array of leading dimensions for the input matrices A. + * @param global_B_array An array of pointers to the input matrices B. + * @param global_ldb An array of leading dimensions for the input matrices B. + * @param global_C_array An array of pointers to the output matrices C. + * @param global_ldc An array of leading dimensions for the output matrices C. + * @param batchCount The number of matrices in the batch. + * @param stream The CUDA stream to use for the computation. 
+ * @param alpha An array of per-matrix scalar factors, one per batch entry + * (optional; nullptr means 1.0). Generated by Copilot. + */ + +/* + * Why do we need to implement our own matrix multiplication based on the MAGMA + * code? There are two main reasons. First, when we do batched matrix + * multiplication, since we need to accumulate the results of the + * multiplications, it is necessary to pass the same memory address of matrix C + * to different multiplications. This way, the accumulation can be done directly + * through atomic operations during the matrix multiplication, avoiding the + * reduction operations after the multiplication. Secondly, when calculating the + * charge density, where C = alpha * A * B + C, the value of alpha might be + * different for the same batch of matrices. Using the standard matrix + * multiplication interface would require breaking down the batch matrix + * multiplication into smaller batches. In practice, it is difficult to gather + * enough matrices that share one alpha to form such a batch. + * + * Moreover, taking into account the specific requirements of our application, + * especially the fact that we can relatively easily control the arrangement of + * the matrix elements, we have only implemented one type of requirement for + * matrix transposition. That is, we have implemented the operation C = alpha * + * A * trans(B) + C under the constraint of column-major order. + * + * Finally, we would like to thank MAGMA for its contributions to the field of + * scientific computing. + */ + +template +void vbatched_gemm_tn_impl(int max_m, + int max_n, + const int* m, + const int* n, + const int* k, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + int batchCount, + cudaStream_t stream, + const T* alpha = nullptr) +{ + // The positions of A and B have been swapped here.
+ // This is because vbatch_gemm__tn_kernel is column major, + // but vatched_gemm_nt_impl is designed to be row major, + + size_t shared_mem_size = 0; + shared_mem_size += (BLK_M + 1) * BLK_K * sizeof(T); + shared_mem_size += (BLK_K + 1) * BLK_N * sizeof(T); + dim3 dimBlock(DIM_X, DIM_Y); + const int max_batch_count = 32768; + + for (int i = 0; i < batchCount; i += max_batch_count) + { + const int ibatch = min(max_batch_count, batchCount - i); + dim3 dimGrid(ceildiv(max_n, BLK_M), + ceildiv(max_m, BLK_N), + ibatch); + const T* alpha_tmp = nullptr; + if (alpha != nullptr) + { + alpha_tmp = alpha + i; + } + + vbatched_gemm_nt_kernel + <<>>( + n + i, m + i, k + i, + global_B_array + i, global_ldb + i, + global_A_array + i, global_lda + i, + global_C_array + i, global_ldc + i, + alpha_tmp); + checkCudaLastError(); + } +} + +#endif // GEMM_TN_VBATCH_CUH \ No newline at end of file From 7dc0cdec138f2563941f14127b3b60b5187de5b2 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 8 May 2025 17:20:18 +0800 Subject: [PATCH 05/63] support rho calculation --- .../module_gint/CMakeLists.txt | 2 + .../module_gint/kernels/cuda/cuda_tools.cuh | 5 + .../kernels/cuda/vbatch_matrix_mul.cuh | 5 - .../module_gint/temp_gint/gint_interface.cpp | 14 +- .../module_gint/temp_gint/gint_rho_gpu.cpp | 84 +++++++ .../module_gint/temp_gint/gint_rho_gpu.h | 49 ++++ .../temp_gint/kernel/cuda_mem_wrapper.h | 14 +- .../temp_gint/kernel/dgemm_vbatch.cu | 43 +++- .../temp_gint/kernel/dgemm_vbatch.h | 2 +- .../temp_gint/kernel/gemm_nn_vbatch.cuh | 87 +++----- .../temp_gint/kernel/gemm_tn_vbatch.cuh | 2 +- .../temp_gint/kernel/gint_helper.cuh | 38 ++++ .../temp_gint/kernel/phi_operator_gpu.cu | 210 +++++++++++++++--- .../temp_gint/kernel/phi_operator_gpu.h | 39 ++-- .../temp_gint/kernel/phi_operator_kernel.cu | 175 ++++++++++++++- .../temp_gint/kernel/phi_operator_kernel.cuh | 36 +++ .../module_gint/temp_gint/kernel/sph.cuh | 188 ++++++++-------- .../module_gint/temp_gint/phi_operator.hpp | 2 +- 18 
files changed, 774 insertions(+), 221 deletions(-) create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 78be519ab1..c284188c3c 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -60,6 +60,8 @@ if(NEW_GINT) temp_gint/kernel/set_const_mem.cu temp_gint/batch_biggrid temp_gint/gint_vl_gpu.cpp + temp_gint/gint_rho_gpu.cpp + temp_gint/kernel/dgemm_vbatch.cu ) endif() endif() diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh index 4a820b9ce7..02f9a1d4ca 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh @@ -15,6 +15,11 @@ cudaError_t check(cudaError_t result, const char *const func, const char *const file, const int line); cudaError_t __checkCudaLastError(const char *file, const int line); +static inline int ceildiv(int x, int y) +{ + return (x + y - 1) / y; +} + void dump_cuda_array_to_file(const double* cuda_array, int width, int hight, diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh index 39bc336e77..fbe12b318e 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh @@ -320,11 +320,6 @@ static __global__ void vbatched_gemm_kernel(const int* M, alpha_tmp); } -static inline int ceildiv(int x, int y) -{ - return (x + y - 1) / y; -} - /** * Performs a batched 
matrix multiplication using the vbatched_gemm_impl * function. diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index c5e800f020..fda3879780 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -8,6 +8,7 @@ #include "gint_fvl.h" #include "gint_fvl_meta.h" #include "gint_rho.h" +#include "gint_rho_gpu.h" #include "gint_tau.h" namespace ModuleGint @@ -70,8 +71,17 @@ void cal_gint_rho( double **rho) { ModuleBase::timer::tick("Gint", "cal_gint_rho"); - Gint_rho gint_rho(dm_vec, nspin, rho); - gint_rho.cal_gint(); + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_rho_gpu gint_rho_gpu(dm_vec, nspin, rho); + gint_rho_gpu.cal_gint(); + } else + #endif + { + Gint_rho gint_rho(dm_vec, nspin, rho); + gint_rho.cal_gint(); + } ModuleBase::timer::tick("Gint", "cal_gint_rho"); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp new file mode 100644 index 0000000000..a4d80d3abe --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp @@ -0,0 +1,84 @@ +#include "gint_rho_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_rho_gpu::cal_gint() +{ + init_dm_gint_(); + transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_cpu_to_gpu_(); + cal_rho_(); + transfer_gpu_to_cpu_(); +} + +void Gint_rho_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void Gint_rho_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + rho_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = 
CudaMemWrapper(dm_gint_vec_[is]->get_nnr(), 0, false); + rho_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is]->get_wrapper(), + dm_gint_vec_[is]->get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_rho_gpu::transfer_gpu_to_cpu_() +{ + for (int is = 0; is < nspin_; is++) + { + checkCuda(cudaMemcpy(rho_[is], rho_d_vec_[is].get_device_ptr(), + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyDeviceToHost)); + } +} + +void Gint_rho_gpu::cal_rho_() +{ +#pragma omp parallel + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_dm(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi(phi.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + const bool is_symm = true; + phi_op.phi_mul_dm(phi.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), *dm_gint_vec_[is], + is_symm, phi_dm.get_device_ptr()); + phi_op.phi_dot_phi(phi.get_device_ptr(), phi_dm.get_device_ptr(), rho_d_vec_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h new file mode 100644 index 0000000000..9026bfb05e --- /dev/null +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_rho_gpu: public Gint +{ + public: + Gint_rho_gpu( + const std::vector*>& dm_vec, + const int nspin, + double **rho) + : dm_vec_(dm_vec), nspin_(nspin), rho_(rho) {}; + + void cal_gint() override; + + private: + void init_dm_gint_(); + + void cal_rho_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + // input + const std::vector*> dm_vec_; + const int nspin_; + + // output + double **rho_; + + //======================== + // Intermediate variables + //======================== + std::vector>> dm_gint_vec_; + + std::vector> dm_gint_d_vec_; + std::vector> rho_d_vec_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h index ee2f2e10e2..c38ac39030 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h @@ -59,11 +59,15 @@ class CudaMemWrapper stream_ = stream; if (malloc_host) - { checkCuda(cudaMallocHost((void**)&host_ptr_, capacity * sizeof(T))); } + { + checkCuda(cudaMallocHost((void**)&host_ptr_, capacity * sizeof(T))); + memset(host_ptr_, 0, capacity * sizeof(T)); + } else - { host_ptr_ = nullptr; } + { host_ptr_ = nullptr; } checkCuda(cudaMalloc((void**)&device_ptr_, capacity * sizeof(T))); + checkCuda(cudaMemset(device_ptr_, 0, capacity_ * sizeof(T))); }; ~CudaMemWrapper() @@ -159,6 +163,8 @@ class CudaMemWrapper T* get_device_ptr() { return device_ptr_; }; T* get_host_ptr() { return host_ptr_; }; + const T* get_device_ptr() const { return 
device_ptr_; }; + const T* get_host_ptr() const { return host_ptr_; }; // Only supports setting size to a value less than or equal to capacity void set_size(int new_size) @@ -170,8 +176,8 @@ class CudaMemWrapper size_ = new_size; }; - int get_size() { return size_; }; - int get_capacity() { return capacity_; }; + int get_size() const { return size_; }; + int get_capacity() const { return capacity_; }; private: T* device_ptr_ = nullptr; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu index cfca916e6f..b09abd3fd5 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu @@ -3,7 +3,7 @@ #include "dgemm_vbatch.h" // The template parameter settings for the function are based on the MAGMA source code settings. -// Specifically, they refer to the settings for the "tt" shape in dgemm_vbatched_core. +// Specifically, they refer to the settings for the "nn" shape in dgemm_vbatched_core. 
void dgemm_nn_vbatch( int max_m, int max_n, int max_k, const int* m_d, const int* n_d, const int* k_d, @@ -13,20 +13,41 @@ void dgemm_nn_vbatch( int batchCount, cudaStream_t stream, const double* alpha) { - if (max_k < 128) + if (max_k < 32) { - vbatched_gemm_nn_impl - (max_m, max_n, m_d, n_d, k_d, - A_array_d, lda_d, - B_array_d, ldb_d, - C_array_d, ldc_d, - batchCount, stream, alpha); + if(max_k == 8 && max_m ==24) + { + vbatched_gemm_nn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); + } + else if (max_m < 32) + { + vbatched_gemm_nn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); + } + else + { + vbatched_gemm_nn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); + } } else { - if (max_n < 256) + if (max_n < 80) { - vbatched_gemm_nn_impl + vbatched_gemm_nn_impl (max_m, max_n, m_d, n_d, k_d, A_array_d, lda_d, B_array_d, ldb_d, @@ -35,7 +56,7 @@ void dgemm_nn_vbatch( } else { - vbatched_gemm_nn_impl + vbatched_gemm_nn_impl (max_m, max_n, m_d, n_d, k_d, A_array_d, lda_d, B_array_d, ldb_d, diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h index 3cd878b950..8589bcf62e 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h @@ -20,4 +20,4 @@ void dgemm_tn_vbatch( const double* const* B_array_d, const int* ldb_d, double** C_array_d, const int* ldc_d, int batchCount, cudaStream_t stream, - const double* alpha); \ No newline at end of file + const double* alpha = nullptr); \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh 
b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh index 857d9d9081..2ada532854 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh @@ -26,7 +26,7 @@ template -static __device__ void vbatched_gemm_tt_device(int M, +static __device__ void vbatched_gemm_nn_device(int M, int N, int K, const T* __restrict__ A, @@ -61,18 +61,18 @@ static __device__ void vbatched_gemm_tt_device(int M, T rB[THR_N]; // Registers for the dev->shmem copy - T ra[BLK_M / DIM_YA][BLK_K / DIM_XA]; - T rb[BLK_K / DIM_YB][BLK_N / DIM_XB]; + T ra[BLK_K / DIM_YA][BLK_M / DIM_XA]; + T rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; // bound is the correction to offs_d in order to not get out of memory bound // so bound could be negative value since offs_d could be out of bound - const T* offs_dA = A + blx * BLK_M * LDA + idyA * LDA + idxA; + const T* offs_dA = A + blx * BLK_M + idyA * LDA + idxA; int boundA - = (LDA * (M - 1) + K) - (blx * BLK_M * LDA + idyA * LDA + idxA) - 1; + = (LDA * (K - 1) + M) - (blx * BLK_M + idyA * LDA + idxA) - 1; - const T* offs_dB = B + bly * BLK_N + idyB * LDB + idxB; + const T* offs_dB = B + bly * BLK_N * LDB + idyB * LDB + idxB; int boundB - = (LDB * (K - 1) + N) - (bly * BLK_N + idyB * LDB + idxB) - 1; + = (LDB * (N - 1) + K) - (bly * BLK_N * LDB + idyB * LDB + idxB) - 1; int m, n, k, kk; @@ -89,22 +89,22 @@ static __device__ void vbatched_gemm_tt_device(int M, // Load A dev->shmem #pragma unroll - for (n = 0; n < BLK_M; n += DIM_YA) + for (n = 0; n < BLK_K; n += DIM_YA) { #pragma unroll - for (m = 0; m < BLK_K; m += DIM_XA) + for (m = 0; m < BLK_M; m += DIM_XA) { - sA(n + idyA, m + idxA) = fetch(A, m, n, boundA); + sA(m + idxA, n + idyA) = fetch(A, m, n, boundA); } } #pragma unroll - for (n = 0; n < BLK_K; n += DIM_YB) + for (n = 0; n < BLK_N; n += DIM_YB) { #pragma unroll - for (m = 0; m < BLK_N; m += DIM_XB) + for (m = 0; m < BLK_K; m += 
DIM_XB) { - sB(n + idyB, m + idxB) = fetch(B, m, n, boundB); + sB(m + idxB, n + idyB) = fetch(B, m, n, boundB); } } @@ -112,18 +112,18 @@ static __device__ void vbatched_gemm_tt_device(int M, for (kk = 0; kk < K - BLK_K; kk += BLK_K) { - offs_dA += BLK_K; - boundA -= BLK_K; + offs_dA += BLK_K * LDA; + boundA -= BLK_K * LDA; - offs_dB += BLK_K * LDB; - boundB -= BLK_K * LDB; + offs_dB += BLK_K; + boundB -= BLK_K; // Load A dev->regs #pragma unroll - for (n = 0; n < BLK_M / DIM_YA; n++) + for (n = 0; n < BLK_K / DIM_YA; n++) { #pragma unroll - for (m = 0; m < BLK_K / DIM_XA; m++) + for (m = 0; m < BLK_M / DIM_XA; m++) { ra[n][m] = fetch(A, m * DIM_XA, n * DIM_YA, boundA); } @@ -131,10 +131,10 @@ static __device__ void vbatched_gemm_tt_device(int M, // Load B dev->regs #pragma unroll - for (n = 0; n < BLK_K / DIM_YB; n++) + for (n = 0; n < BLK_N / DIM_YB; n++) { #pragma unroll - for (m = 0; m < BLK_N / DIM_XB; m++) + for (m = 0; m < BLK_K / DIM_XB; m++) { rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB); } @@ -174,23 +174,23 @@ static __device__ void vbatched_gemm_tt_device(int M, // Load A regs->shmem #pragma unroll - for (n = 0; n < BLK_M / DIM_YA; n++) + for (n = 0; n < BLK_K / DIM_YA; n++) { #pragma unroll - for (m = 0; m < BLK_K / DIM_XA; m++) + for (m = 0; m < BLK_M / DIM_XA; m++) { - sA(n * DIM_YA + idyA, m * DIM_XA + idxA) = ra[n][m]; + sA(m * DIM_XA + idxA, n * DIM_YA + idyA) = ra[n][m]; } } // Load B regs->shmem #pragma unroll - for (n = 0; n < BLK_K / DIM_YB; n++) + for (n = 0; n < BLK_N / DIM_YB; n++) { #pragma unroll - for (m = 0; m < BLK_N / DIM_XB; m++) + for (m = 0; m < BLK_K / DIM_XB; m++) { - sB(n * DIM_YB + idyB, m * DIM_XB + idxB) = rb[n][m]; + sB(m * DIM_XB + idxB, n * DIM_YB + idyB) = rb[n][m]; } } __syncthreads(); @@ -260,7 +260,7 @@ template -static __global__ void vbatched_gemm_tt_kernel(const int* M, +static __global__ void vbatched_gemm_nn_kernel(const int* M, const int* N, const int* K, const T* const* global_A_array, @@ -293,7 +293,7 @@ 
static __global__ void vbatched_gemm_tt_kernel(const int* M, { alpha_tmp = alpha[batchid]; } - vbatched_gemm_tt_device +__forceinline__ __device__ T pow_int(const T base, const int exp) +{ + switch (exp) + { + case 0: + return 1.0; + case 1: + return base; + case 2: + return base * base; + case 3: + return base * base * base; + case 4: + return base * base * base * base; + case 5: + return base * base * base * base * base; + default: + double result = std::pow(base, exp); + return result; + } +}; + +template +__forceinline__ __device__ T warpReduceSum(T val) +{ + val += __shfl_xor_sync(0xffffffff, val, 16, 32); + val += __shfl_xor_sync(0xffffffff, val, 8, 32); + val += __shfl_xor_sync(0xffffffff, val, 4, 32); + val += __shfl_xor_sync(0xffffffff, val, 2, 32); + val += __shfl_xor_sync(0xffffffff, val, 1, 32); + return val; +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu index 3c9b6e723f..d7a6e3a4d8 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu @@ -1,5 +1,6 @@ #include "phi_operator_gpu.h" #include "phi_operator_kernel.cuh" +#include "dgemm_vbatch.h" #include namespace ModuleGint @@ -93,9 +94,9 @@ void PhiOperatorGpu::set_bgrid_batch(std::shared_ptr bgrid_batch) checkCuda(cudaEventRecord(event_, stream_)); } -void PhiOperatorGpu::set_phi(double* phi_d) +void PhiOperatorGpu::set_phi(double* phi_d) const { - checkCuda(cudaMemsetAsync(phi_d, 0, phi_len_ * sizeof(double), stream_)); + // checkCuda(cudaMemsetAsync(phi_d, 0, phi_len_ * sizeof(double), stream_)); dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); dim3 threads_per_block(64); set_phi_kernel<<>>( @@ -121,13 +122,43 @@ void PhiOperatorGpu::set_phi(double* phi_d) phi_d); } +void PhiOperatorGpu::set_phi_dphi(double* phi_d, double* 
dphi_x_d, double* dphi_y_d, double* dphi_z_d) +{ + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + set_phi_dphi_kernel<<>>( + gint_gpu_vars_->nwmax, + mgrids_num_, + gint_gpu_vars_->nr_max, + gint_gpu_vars_->dr_uniform, + gint_gpu_vars_->ylmcoef_d, + gint_gpu_vars_->ucell_atom_nwl_d, + gint_gpu_vars_->atom_iw2_new_d, + gint_gpu_vars_->atom_iw2_ylm_d, + gint_gpu_vars_->atom_iw2_l_d, + gint_gpu_vars_->atom_nw_d, + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->rcut_d, + gint_gpu_vars_->psi_u_d, + gint_gpu_vars_->dpsi_u_d, + gint_gpu_vars_->mgrids_pos_d, + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + phi_d, + dphi_x_d, + dphi_y_d, + dphi_z_d); +} + void PhiOperatorGpu::phi_mul_vldr3( const double* vl_d, const double dr3, const double* phi_d, - double* result_d) + double* result_d) const { - checkCuda(cudaMemsetAsync(result_d, 0, phi_len_ * sizeof(double), stream_)); dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); dim3 threads_per_block(64); phi_mul_vldr3_kernel<<>>( @@ -145,12 +176,13 @@ void PhiOperatorGpu::phi_mul_phi_vldr3( const double* phi_d, const double* phi_vldr3_d, std::shared_ptr> hRGint, - double* hr_d) + double* hr_d) const { // ap_num means number of atom pairs int ap_num = 0; int max_m = 0; int max_n = 0; + int max_k = mgrids_num_; checkCuda(cudaEventSynchronize(event_)); for (int i = 0; i < bgrid_batch_->get_batch_size(); i++) { @@ -158,20 +190,20 @@ void PhiOperatorGpu::phi_mul_phi_vldr3( // the length of phi on a mesh grid const int phi_len_mgrid = bgrid->get_phi_len(); const int pre_atoms = atoms_num_info_.get_host_ptr()[i].y; - for (int j = 0; j < bgrid->get_atoms_num(); j++) + for (int ia_1 = 0; ia_1 < bgrid->get_atoms_num(); ia_1++) { - auto atom_1 = bgrid->get_atoms()[j]; + auto atom_1 = bgrid->get_atoms()[ia_1]; const int iat_1 = atom_1->get_iat(); const 
auto& r_1 = atom_1->get_R(); - const int m = atom_1->get_nw(); - const int phi_1_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + j]; + const int nw1 = atom_1->get_nw(); + const int phi_1_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_1]; - for (int k = 0; k < bgrid->get_atoms_num(); k++) + for (int ia_2 = 0; ia_2 < bgrid->get_atoms_num(); ia_2++) { - auto atom_2 = bgrid->get_atoms()[k]; + auto atom_2 = bgrid->get_atoms()[ia_2]; const int iat_2 = atom_2->get_iat(); const auto& r_2 = atom_2->get_R(); - const int n = atom_2->get_nw(); + const int nw2 = atom_2->get_nw(); if(iat_1 > iat_2) { continue; } @@ -180,22 +212,21 @@ void PhiOperatorGpu::phi_mul_phi_vldr3( if (hr_offset == -1) { continue; } - const int phi_2_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + k]; - + const int phi_2_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_2]; gemm_A_.get_host_ptr()[ap_num] = phi_d + phi_1_offset; gemm_B_.get_host_ptr()[ap_num] = phi_vldr3_d + phi_2_offset; gemm_C_.get_host_ptr()[ap_num] = hr_d + hr_offset; gemm_lda_.get_host_ptr()[ap_num] = phi_len_mgrid; gemm_ldb_.get_host_ptr()[ap_num] = phi_len_mgrid; - gemm_ldc_.get_host_ptr()[ap_num] = n; - gemm_m_.get_host_ptr()[ap_num] = m; - gemm_n_.get_host_ptr()[ap_num] = n; + gemm_ldc_.get_host_ptr()[ap_num] = nw2; + gemm_m_.get_host_ptr()[ap_num] = nw1; + gemm_n_.get_host_ptr()[ap_num] = nw2; gemm_k_.get_host_ptr()[ap_num] = bgrid->get_mgrids_num(); ap_num++; - max_m = std::max(max_m, m); - max_n = std::max(max_n, n); + max_m = std::max(max_m, nw1); + max_n = std::max(max_n, nw2); } } } @@ -211,21 +242,134 @@ void PhiOperatorGpu::phi_mul_phi_vldr3( gemm_k_.copy_host_to_device_async(ap_num); checkCuda(cudaEventRecord(event_, stream_)); - gint_gpu_vars_->fastest_matrix_mul(max_m, - max_n, - gemm_m_.get_device_ptr(), - gemm_n_.get_device_ptr(), - gemm_k_.get_device_ptr(), - gemm_A_.get_device_ptr(), - gemm_lda_.get_device_ptr(), - gemm_B_.get_device_ptr(), - gemm_ldb_.get_device_ptr(), - 
gemm_C_.get_device_ptr(), - gemm_ldc_.get_device_ptr(), - ap_num, - stream_, - nullptr); + dgemm_tn_vbatch(max_m, + max_n, + max_k, + gemm_m_.get_device_ptr(), + gemm_n_.get_device_ptr(), + gemm_k_.get_device_ptr(), + gemm_A_.get_device_ptr(), + gemm_lda_.get_device_ptr(), + gemm_B_.get_device_ptr(), + gemm_ldb_.get_device_ptr(), + gemm_C_.get_device_ptr(), + gemm_ldc_.get_device_ptr(), + ap_num, + stream_, + nullptr); checkCudaLastError(); } +void PhiOperatorGpu::phi_mul_dm( + const double* phi_d, + const double* dm_d, + const HContainer& dm, + const bool is_symm, + double* phi_dm_d) +{ + checkCuda(cudaMemsetAsync(phi_dm_d, 0, phi_len_ * sizeof(double), stream_)); + // ap_num means number of atom pairs + int ap_num = 0; + int max_m = mgrids_num_; + int max_n = 0; + int max_k = 0; + checkCuda(cudaEventSynchronize(event_)); + for (int i = 0; i < bgrid_batch_->get_batch_size(); i++) + { + auto bgrid = bgrid_batch_->get_bgrids()[i]; + // the length of phi on a mesh grid + const int phi_len_mgrid = bgrid->get_phi_len(); + const int pre_atoms = atoms_num_info_.get_host_ptr()[i].y; + for (int ia_1 = 0; ia_1 < bgrid->get_atoms_num(); ia_1++) + { + auto atom_1 = bgrid->get_atoms()[ia_1]; + const int iat_1 = atom_1->get_iat(); + const auto& r_1 = atom_1->get_R(); + const int nw1 = atom_1->get_nw(); + const int phi_1_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_1]; + int ia_2 = is_symm ? 
ia_1 : 0; + for (; ia_2 < bgrid->get_atoms_num(); ia_2++) + { + auto atom_2 = bgrid->get_atoms()[ia_2]; + const int iat_2 = atom_2->get_iat(); + const auto& r_2 = atom_2->get_R(); + const int nw2 = atom_2->get_nw(); + + int dm_offset = dm.find_matrix_offset(iat_1, iat_2, r_1-r_2); + if (dm_offset == -1) + { continue; } + + const int phi_dm_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_2]; + + gemm_A_.get_host_ptr()[ap_num] = phi_d + phi_1_offset; + gemm_B_.get_host_ptr()[ap_num] = dm_d + dm_offset; + gemm_C_.get_host_ptr()[ap_num] = phi_dm_d + phi_dm_offset; + gemm_lda_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_ldb_.get_host_ptr()[ap_num] = nw2; + gemm_ldc_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_m_.get_host_ptr()[ap_num] = mgrids_num_; + gemm_n_.get_host_ptr()[ap_num] = nw2; + gemm_k_.get_host_ptr()[ap_num] = nw1; + gemm_alpha_.get_host_ptr()[ap_num] = ia_1 == ia_2 ? 1.0 : 2.0; + ap_num++; + + max_n = std::max(max_n, nw2); + max_k = std::max(max_k, nw1); + } + } + } + + gemm_A_.copy_host_to_device_async(ap_num); + gemm_B_.copy_host_to_device_async(ap_num); + gemm_C_.copy_host_to_device_async(ap_num); + gemm_lda_.copy_host_to_device_async(ap_num); + gemm_ldb_.copy_host_to_device_async(ap_num); + gemm_ldc_.copy_host_to_device_async(ap_num); + gemm_m_.copy_host_to_device_async(ap_num); + gemm_n_.copy_host_to_device_async(ap_num); + gemm_k_.copy_host_to_device_async(ap_num); + if(is_symm) + { + // if is_symm == false, gemm_alpha_ always equals 1.0, + // so we don't need to copy it to device + gemm_alpha_.copy_host_to_device_async(ap_num); + } + checkCuda(cudaEventRecord(event_, stream_)); + + auto alpha_ptr = is_symm ? 
gemm_alpha_.get_device_ptr() : nullptr; + dgemm_nn_vbatch(max_m, + max_n, + max_k, + gemm_m_.get_device_ptr(), + gemm_n_.get_device_ptr(), + gemm_k_.get_device_ptr(), + gemm_A_.get_device_ptr(), + gemm_lda_.get_device_ptr(), + gemm_B_.get_device_ptr(), + gemm_ldb_.get_device_ptr(), + gemm_C_.get_device_ptr(), + gemm_ldc_.get_device_ptr(), + ap_num, + stream_, + alpha_ptr); + checkCudaLastError(); +} + +void PhiOperatorGpu::phi_dot_phi( + const double* phi_i_d, + const double* phi_j_d, + double* rho_d) const +{ + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + phi_dot_phi_kernel<<>>( + phi_i_d, + phi_j_d, + mgrids_num_, + mgrids_local_idx_batch_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + bgrids_phi_start_.get_device_ptr(), + rho_d); +} + } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h index 4469457741..ae9ca61ad0 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h @@ -19,20 +19,33 @@ class PhiOperatorGpu void set_bgrid_batch(std::shared_ptr bgrid_batch); - void set_phi(double* phi_d); + void set_phi(double* phi_d) const; + + void set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_y_d, double* dphi_z_d); void phi_mul_vldr3( const double* vl_d, const double dr3, const double* phi_d, - double* result_d); + double* result_d) const; void phi_mul_phi_vldr3( const double* phi_d, const double* phi_vldr3_d, std::shared_ptr> hRGint, - double* hr_d); + double* hr_d) const; + void phi_mul_dm( + const double* phi_d, + const double* dm_d, + const HContainer& dm, + const bool is_symm, + double* phi_dm_d); + + void phi_dot_phi( + const double* phi_i, + const double* phi_j, + double* rho) const; private: std::shared_ptr bgrid_batch_; @@ -65,16 +78,16 @@ class 
PhiOperatorGpu // Mapping of the index of meshgrid in the batch of biggrids to the index of meshgrid in the local cell CudaMemWrapper mgrids_local_idx_batch_; - CudaMemWrapper gemm_m_; - CudaMemWrapper gemm_n_; - CudaMemWrapper gemm_k_; - CudaMemWrapper gemm_lda_; - CudaMemWrapper gemm_ldb_; - CudaMemWrapper gemm_ldc_; - CudaMemWrapper gemm_A_; - CudaMemWrapper gemm_B_; - CudaMemWrapper gemm_C_; - CudaMemWrapper gemm_alpha_; + mutable CudaMemWrapper gemm_m_; + mutable CudaMemWrapper gemm_n_; + mutable CudaMemWrapper gemm_k_; + mutable CudaMemWrapper gemm_lda_; + mutable CudaMemWrapper gemm_ldb_; + mutable CudaMemWrapper gemm_ldc_; + mutable CudaMemWrapper gemm_A_; + mutable CudaMemWrapper gemm_B_; + mutable CudaMemWrapper gemm_C_; + mutable CudaMemWrapper gemm_alpha_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu index 30cdab0bef..5dcdabd2ff 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu @@ -1,4 +1,5 @@ #include "phi_operator_kernel.cuh" +#include "gint_helper.cuh" #include "sph.cuh" namespace ModuleGint @@ -44,11 +45,10 @@ __global__ void set_phi_kernel( { if (dist < 1.0E-9) { dist += 1.0E-9; } - double dr[3] = { coord.x / dist, coord.y / dist, coord.z / dist }; // since nwl is less or equal than 5, the size of ylma is (5+1)^2 double ylma[36]; const int nwl = ucell_atom_nwl[atom_type]; - sph_harm(nwl, ylmcoef, dr, ylma); + sph_harm(nwl, ylmcoef, coord.x/dist, coord.y/dist, coord.z/dist, ylma); const double pos = dist / dr_uniform; const int ip = static_cast(pos); @@ -78,6 +78,126 @@ __global__ void set_phi_kernel( iw_nr += nrmax; } } + else + { + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for (int iw = 0; iw < atom_nw[atom_type]; 
iw++) + { + phi[phi_idx + iw] = 0.0; + } + } + } +} + +__global__ void set_phi_dphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi, + double* __restrict__ dphi_x, + double* __restrict__ dphi_y, + double* __restrict__ dphi_z) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + + for (int atom_id = threadIdx.x; atom_id < atoms_num; atom_id += blockDim.x) + { + const int atom_type = iat2it[atoms_iat[atom_id + pre_atoms_num]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; + const double3 coord = make_double3(mgrid_pos.x-rcoord.x, + mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z); + double dist = sqrt(coord.x * coord.x + coord.y * coord.y + coord.z * coord.z); + if (dist < rcut[atom_type]) + { + if (dist < 1.0E-9) + { dist += 1.0E-9; } + // since nwl is less or equal than 5, the size of rly is (5+1)^2 + // size of grly = 36 * 3 + double rly[36]; + double grly[36 * 3]; + const int nwl = ucell_atom_nwl[atom_type]; + grad_rl_sph_harm(nwl, ylmcoef, coord.x, coord.y, coord.z, rly, grly); + + // interpolation + const double pos = dist / dr_uniform; + const int ip = 
static_cast(pos); + const double x0 = pos - ip; + const double x1 = 1.0 - x0; + const double x2 = 2.0 - x0; + const double x3 = 3.0 - x0; + const double x12 = x1 * x2 / 6; + const double x03 = x0 * x3 / 2; + double tmp = 0; + double dtmp = 0; + const int it_nw = atom_type * nwmax; + int iw_nr = it_nw * nrmax + ip; + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for (int iw = 0; iw < atom_nw[atom_type]; iw++) + { + if (atom_iw2_new[it_nw + iw]) + { + tmp = x12 * (psi_u[iw_nr] * x3 + psi_u[iw_nr + 3] * x0) + + x03 * (psi_u[iw_nr + 1] * x2 - psi_u[iw_nr + 2] * x1); + dtmp = x12 * (dpsi_u[iw_nr] * x3 + dpsi_u[iw_nr + 3] * x0) + + x03 * (dpsi_u[iw_nr + 1] * x2 - dpsi_u[iw_nr + 2] * x1); + } + const int iw_l = atom_iw2_l[it_nw + iw]; + const int idx_ylm = atom_iw2_ylm [it_nw + iw]; + const double rl = pow_int(dist, iw_l); + const double tmprl = tmp / rl; + + // if phi == nullptr, it means that we only need dphi. + if(phi != nullptr) + { + phi[phi_idx + iw] = tmprl * rly[idx_ylm]; + } + // derivative of wave functions with respect to atom positions. 
+ const double tmpdphi_rly = (dtmp - tmp * iw_l / dist) / rl * rly[idx_ylm] / dist; + + dphi_x[phi_idx + iw] = tmpdphi_rly * coord.x + tmprl * grly[idx_ylm * 3 + 0]; + dphi_y[phi_idx + iw] = tmpdphi_rly * coord.y + tmprl * grly[idx_ylm * 3 + 1]; + dphi_z[phi_idx + iw] = tmpdphi_rly * coord.z + tmprl * grly[idx_ylm * 3 + 2]; + } + } + else + { + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for (int iw = 0; iw < atom_nw[atom_type]; iw++) + { + if(phi != nullptr) + { + phi[phi_idx + iw] = 0.0; + } + dphi_x[phi_idx + iw] = 0.0; + dphi_y[phi_idx + iw] = 0.0; + dphi_z[phi_idx + iw] = 0.0; + } + } } } @@ -102,4 +222,55 @@ __global__ void phi_mul_vldr3_kernel( result[phi_start + i] = phi[phi_start + i] * vldr3; } } + +// rho(ir) = \sum_{iwt} \phi_i(ir,iwt) * \phi_j^*(ir,iwt) +// each block calculate the dot product of phi_i and phi_j of a meshgrid +__global__ void phi_dot_phi_kernel( + const double* __restrict__ phi_i, + const double* __restrict__ phi_j, + const int mgrids_per_bgrid, + const int* __restrict__ mgrids_local_idx, + const int* __restrict__ bgrids_phi_len, + const int* __restrict__ bgrids_phi_start, + double* __restrict__ rho) +{ + __shared__ double s_data[32]; // the length of s_data equals the max warp num of a block + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int phi_len = bgrids_phi_len[bgrid_id]; + const int phi_start = bgrids_phi_start[bgrid_id] + mgrid_id * phi_len; + const double* phi_i_mgrid = phi_i + phi_start; + const double* phi_j_mgrid = phi_j + phi_start; + const int mgrid_id_in_batch = bgrid_id * mgrids_per_bgrid + mgrid_id; + const int mgrid_local_idx = mgrids_local_idx[mgrid_id_in_batch]; + const int tid = threadIdx.x; + const int warp_id = tid / 32; + const int lane_id = tid % 32; + double tmp_sum = 0; + + for (int i = tid; i < phi_len; i += blockDim.x) + { + tmp_sum += phi_i_mgrid[i] * phi_j_mgrid[i]; + } + + tmp_sum = warpReduceSum(tmp_sum); + + if 
(lane_id == 0) + { + s_data[warp_id] = tmp_sum; + } + __syncthreads(); + + tmp_sum = (tid < blockDim.x / 32) ? s_data[tid] : 0; + if(warp_id == 0) + { + tmp_sum = warpReduceSum(tmp_sum); + } + + if(tid == 0) + { + rho[mgrid_local_idx] = tmp_sum; + } +} + } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh index 1cc049f308..ff6c82a434 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh @@ -27,6 +27,32 @@ __global__ void set_phi_kernel( const int* __restrict__ bgrids_phi_len, double* __restrict__ phi); +__global__ void set_phi_dphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi, + double* __restrict__ dphi_x, + double* __restrict__ dphi_y, + double* __restrict__ dphi_z); + __global__ void phi_mul_vldr3_kernel( const double* __restrict__ vl, const double dr3, @@ -37,4 +63,14 @@ __global__ void phi_mul_vldr3_kernel( const int* __restrict__ bgrids_phi_start, double* __restrict__ result); +// rho(ir) = \sum_{iwt} \phi_i(ir,iwt) * \phi_j^*(ir,iwt) +// each block calculate the dot product of phi_i and 
phi_j of a meshgrid +__global__ void phi_dot_phi_kernel( + const double* __restrict__ phi_i, // phi_i(ir,iwt) + const double* __restrict__ phi_j, // phi_j(ir,iwt) + const int mgrids_per_bgrid, // the number of mgrids of each biggrid + const int* __restrict__ mgrids_local_idx, // the idx of mgrid in local cell + const int* __restrict__ bgrids_phi_len, // the length of phi on a mgrid of a biggrid + const int* __restrict__ bgrids_phi_start, // the start idx in phi of each biggrid + double* __restrict__ rho); // rho(ir) } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh index 94d4fcdaa4..b36828222b 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh @@ -6,7 +6,9 @@ namespace ModuleGint static __device__ void sph_harm( const int nwl, const double* __restrict__ ylmcoef, - const double* __restrict__ dr, + const double x, + const double y, + const double z, double* __restrict__ ylma ) { @@ -21,9 +23,9 @@ static __device__ void sph_harm( /*************************** L = 1 ***************************/ - ylma[1] = ylmcoef[1] * dr[2]; // l=1, m=0 - ylma[2] = -ylmcoef[1] * dr[0]; // l=1, m=1 - ylma[3] = -ylmcoef[1] * dr[1]; // l=1, m=-1 + ylma[1] = ylmcoef[1] * z; // l=1, m=0 + ylma[2] = -ylmcoef[1] * x; // l=1, m=1 + ylma[3] = -ylmcoef[1] * y; // l=1, m=-1 if (nwl == 1) return; @@ -31,12 +33,12 @@ static __device__ void sph_harm( L = 2 ***************************/ tmp0=ylmcoef[3] * ylma[0]; - ylma[4] = ylmcoef[2] * dr[2] * ylma[1] - tmp0 ; // l=2, m=0 - tmp0 = ylmcoef[4] * dr[2]; + ylma[4] = ylmcoef[2] * z * ylma[1] - tmp0 ; // l=2, m=0 + tmp0 = ylmcoef[4] * z; ylma[5] = tmp0 * ylma[2]; // l=2,m=1 ylma[6] = tmp0 * ylma[3]; // l=2,m=-1 - tmp0 = ylmcoef[4] * dr[0]; + tmp0 = ylmcoef[4] * x; ylma[7] = ylmcoef[5] * ylma[4] - ylmcoef[6] * ylma[0] - tmp0 * ylma[2]; // 
l=2,m=2 ylma[8] = -tmp0 * ylma[3]; @@ -47,17 +49,17 @@ static __device__ void sph_harm( L = 3 ***************************/ tmp0=ylmcoef[8] * ylma[1]; - ylma[9] = ylmcoef[7] * dr[2] * ylma[4] - tmp0; // l=3, m=0 + ylma[9] = ylmcoef[7] * z * ylma[4] - tmp0; // l=3, m=0 - tmp0 = ylmcoef[9] * dr[2]; + tmp0 = ylmcoef[9] * z; ylma[10] = tmp0 * ylma[5] - ylmcoef[10] * ylma[2]; // l=3,m=1 ylma[11] = tmp0 * ylma[6] - ylmcoef[10] * ylma[3]; // l=3,m=-1 - tmp0 = ylmcoef[11] * dr[2]; + tmp0 = ylmcoef[11] * z; ylma[12] = tmp0 * ylma[7]; // l=3,m=2 ylma[13] = tmp0 * ylma[8]; // l=3,m=-2 - tmp0 = ylmcoef[14] * dr[0]; + tmp0 = ylmcoef[14] * x; ylma[14] = ylmcoef[12] * ylma[10] - ylmcoef[13] * ylma[2] - tmp0 * ylma[7]; // l=3,m=3 ylma[15] = ylmcoef[12] * ylma[11] - ylmcoef[13] * ylma[3] @@ -69,21 +71,21 @@ static __device__ void sph_harm( L = 4 ***************************/ tmp0=ylmcoef[16] * ylma[4]; - ylma[16] = ylmcoef[15] * dr[2] * ylma[9] - tmp0; // l=4,m=0 + ylma[16] = ylmcoef[15] * z * ylma[9] - tmp0; // l=4,m=0 - tmp0 = ylmcoef[17] * dr[2]; + tmp0 = ylmcoef[17] * z; ylma[17] = tmp0 * ylma[10] - ylmcoef[18] * ylma[5]; // l=4,m=1 ylma[18] = tmp0 * ylma[11] - ylmcoef[18] * ylma[6]; // l=4,m=-1 - tmp0 = ylmcoef[19] * dr[2]; + tmp0 = ylmcoef[19] * z; ylma[19] = tmp0 * ylma[12] - ylmcoef[20] * ylma[7]; // l=4,m=2 ylma[20] = tmp0 * ylma[13] - ylmcoef[20] * ylma[8]; // l=4,m=-2 - tmp0 = 3.0 * dr[2]; + tmp0 = 3.0 * z; ylma[21] = tmp0 * ylma[14]; // l=4,m=3 ylma[22] = tmp0 * ylma[15]; // l=4,m=-3 - tmp0 = ylmcoef[23] * dr[0]; + tmp0 = ylmcoef[23] * x; ylma[23] = ylmcoef[21] * ylma[19] - ylmcoef[22] * ylma[7] - tmp0 * ylma[14]; // l=4,m=4 ylma[24] = ylmcoef[21] * ylma[20] - ylmcoef[22] * ylma[8] @@ -96,25 +98,25 @@ static __device__ void sph_harm( ***************************/ tmp0=ylmcoef[25] * ylma[9]; ylma[25] - = ylmcoef[24] * dr[2] * ylma[16] - tmp0; // l=5,m=0 + = ylmcoef[24] * z * ylma[16] - tmp0; // l=5,m=0 - tmp0 = ylmcoef[26] * dr[2]; + tmp0 = ylmcoef[26] * z; ylma[26] = tmp0 
* ylma[17] - ylmcoef[27] * ylma[10]; // l=5,m=1 ylma[27] = tmp0 * ylma[18] - ylmcoef[27] * ylma[11]; // l=5,m=-1 - tmp0 = ylmcoef[28] * dr[2]; + tmp0 = ylmcoef[28] * z; ylma[28] = tmp0 * ylma[19] - ylmcoef[29] * ylma[12]; // l=5,m=2 ylma[29] = tmp0 * ylma[20] - ylmcoef[29] * ylma[13]; // l=5,m=-2 - tmp0 = ylmcoef[30] * dr[2]; + tmp0 = ylmcoef[30] * z; ylma[30] = tmp0 * ylma[21] - ylmcoef[31] * ylma[14]; // l=5,m=3 ylma[31] = tmp0 * ylma[22] - ylmcoef[31] * ylma[15]; // l=5,m=-3 - tmp0 = ylmcoef[32] * dr[2]; + tmp0 = ylmcoef[32] * z; ylma[32] = tmp0 * ylma[23]; // l=5,m=4 ylma[33] = tmp0 * ylma[24]; // l=5,m=-4 - tmp0 = ylmcoef[35] * dr[0]; + tmp0 = ylmcoef[35] * x; ylma[34] = ylmcoef[33] * ylma[30] - ylmcoef[34] * ylma[14] - tmp0 * ylma[23]; // l=5,m=5 ylma[35] = ylmcoef[33] * ylma[31] - ylmcoef[34] * ylma[15] @@ -135,7 +137,7 @@ static __device__ void sph_harm( for (int im = 0; im < 2 * il - 1; im++) { int imm = (im + 1) / 2; - ylma[istart + im] = fac2 / sqrt((double)istart - imm * imm) * (dr[2] + ylma[istart + im] = fac2 / sqrt((double)istart - imm * imm) * (z * ylma[istart1 + im] - sqrt((double)istart1 - imm * imm) / fac4 * ylma[istart2 + im]); } @@ -145,9 +147,9 @@ static __device__ void sph_harm( double bl3 = sqrt(2.0) / fac2; ylma[istart + 2 * il - 1] = (bl3 * ylma[istart + 2 * il - 5] - bl2 * - ylma[istart2 + 2 * il - 5] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 3]) / + ylma[istart2 + 2 * il - 5] - 2.0 * x * ylma[istart1 + 2 * il - 3]) / bl1; ylma[istart + 2 * il] = (bl3 * ylma[istart + 2 * il - 4] - bl2 * - ylma[istart2 + 2 * il - 4] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 2]) / + ylma[istart2 + 2 * il - 4] - 2.0 * x * ylma[istart1 + 2 * il - 2]) / bl1; }*/ } @@ -155,15 +157,17 @@ static __device__ void sph_harm( static __device__ void grad_rl_sph_harm( const int nwl, const double* __restrict__ ylmcoef, - const double* __restrict__ dr, + const double x, + const double y, + const double z, double* __restrict__ rly, double* __restrict__ grly ) { - double r2 = 
dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - double tx = dr[0] * 2; - double ty = dr[1] * 2; - double tz = dr[2] * 2; + double r2 = x * x + y * y + z * z; + double tx = x * 2; + double ty = y * 2; + double tz = z * 2; //begin calculation /*************************** @@ -176,15 +180,15 @@ static __device__ void grad_rl_sph_harm( /*************************** L = 1 ***************************/ - rly[1] = ylmcoef[1]*dr[2]; //l=1, m=0 + rly[1] = ylmcoef[1]*z; //l=1, m=0 grly[3] = grly[4] = 0.0; grly[5] = ylmcoef[1]; - rly[2] = -ylmcoef[1]*dr[0]; //l=1, m=1 + rly[2] = -ylmcoef[1]*x; //l=1, m=1 grly[7] = grly[8] = 0.0; grly[6] = -ylmcoef[1]; - rly[3] = -ylmcoef[1]*dr[1]; //l=1, m=-1 + rly[3] = -ylmcoef[1]*y; //l=1, m=-1 grly[9] = grly[11] = 0.0; grly[10] = -ylmcoef[1]; @@ -193,35 +197,35 @@ static __device__ void grad_rl_sph_harm( /*************************** L = 2 ***************************/ - rly[4] = ylmcoef[2]*dr[2]*rly[1]-ylmcoef[3]*rly[0]*r2;//l=2, m=0 - grly[12] = ylmcoef[2]*dr[2]*grly[3]-ylmcoef[3]*(grly[0]*r2+rly[0]*tx);//l=2, m=0 - grly[13] = ylmcoef[2]*dr[2]*grly[4]-ylmcoef[3]*(grly[1]*r2+rly[0]*ty);//l=2, m=0 - grly[14] = ylmcoef[2]*(dr[2]*grly[5]+rly[1])-ylmcoef[3]*(grly[2]*r2+rly[0]*tz);//l=2, m=0 + rly[4] = ylmcoef[2]*z*rly[1]-ylmcoef[3]*rly[0]*r2;//l=2, m=0 + grly[12] = ylmcoef[2]*z*grly[3]-ylmcoef[3]*(grly[0]*r2+rly[0]*tx);//l=2, m=0 + grly[13] = ylmcoef[2]*z*grly[4]-ylmcoef[3]*(grly[1]*r2+rly[0]*ty);//l=2, m=0 + grly[14] = ylmcoef[2]*(z*grly[5]+rly[1])-ylmcoef[3]*(grly[2]*r2+rly[0]*tz);//l=2, m=0 - double tmp0 = ylmcoef[4]*dr[2]; + double tmp0 = ylmcoef[4]*z; rly[5] = tmp0*rly[2];//l=2,m=1 grly[15] = tmp0*grly[6]; grly[16] = tmp0*grly[7]; - grly[17] = ylmcoef[4]*(rly[2]+dr[2]*grly[8]); + grly[17] = ylmcoef[4]*(rly[2]+z*grly[8]); rly[6] = tmp0*rly[3];//l=2,m=-1 grly[18] = tmp0*grly[9]; grly[19] = tmp0*grly[10]; - grly[20] = ylmcoef[4]*(rly[3]+dr[2]*grly[11]); + grly[20] = ylmcoef[4]*(rly[3]+z*grly[11]); - double tmp2 = ylmcoef[4]*dr[0]; + double tmp2 
= ylmcoef[4]*x; rly[7]= ylmcoef[5]*rly[4]-ylmcoef[6]*rly[0]*r2 - tmp2*rly[2];//l=2,m=2 - grly[21] = ylmcoef[5]*grly[12]-ylmcoef[6]*(rly[0]*tx+grly[0]*r2)-ylmcoef[4]*(dr[0]*grly[6]+rly[2]); + grly[21] = ylmcoef[5]*grly[12]-ylmcoef[6]*(rly[0]*tx+grly[0]*r2)-ylmcoef[4]*(x*grly[6]+rly[2]); // std::cout << "\np1 = "<< ylmcoef[5]*grly[12] << " p2 = " << -ylmcoef[6]*rly[0]*tx -// << " p3 = " << -ylmcoef[4]*dr[0]*grly[6] << " p4 = " << -ylmcoef[4]*rly[2] << std::endl; +// << " p3 = " << -ylmcoef[4]*x*grly[6] << " p4 = " << -ylmcoef[4]*rly[2] << std::endl; grly[22] = ylmcoef[5]*grly[13]-ylmcoef[6]*(rly[0]*ty+grly[1]*r2)-tmp2*grly[7]; grly[23] = ylmcoef[5]*grly[14]-ylmcoef[6]*(rly[0]*tz+grly[2]*r2)-tmp2*grly[8]; rly[8] = -tmp2*rly[3]; - grly[24] = -ylmcoef[4]*(rly[3]+dr[0]*grly[9]); + grly[24] = -ylmcoef[4]*(rly[3]+x*grly[9]); grly[25] = -tmp2*grly[10]; grly[26] = -tmp2*grly[11]; // rly[8] = tmp1+tmp2*rly[3];//l=2,m=-2 @@ -230,41 +234,41 @@ static __device__ void grad_rl_sph_harm( /*************************** L = 3 ***************************/ - rly[9] = ylmcoef[7]*dr[2]*rly[4]-ylmcoef[8]*rly[1]*r2; //l=3, m=0 - grly[27] = ylmcoef[7]*dr[2]*grly[12]-ylmcoef[8]*(rly[1]*tx+grly[3]*r2); - grly[28] = ylmcoef[7]*dr[2]*grly[13]-ylmcoef[8]*(rly[1]*ty+grly[4]*r2); - grly[29] = ylmcoef[7]*(rly[4]+dr[2]*grly[14])-ylmcoef[8]*(rly[1]*tz+grly[5]*r2); + rly[9] = ylmcoef[7]*z*rly[4]-ylmcoef[8]*rly[1]*r2; //l=3, m=0 + grly[27] = ylmcoef[7]*z*grly[12]-ylmcoef[8]*(rly[1]*tx+grly[3]*r2); + grly[28] = ylmcoef[7]*z*grly[13]-ylmcoef[8]*(rly[1]*ty+grly[4]*r2); + grly[29] = ylmcoef[7]*(rly[4]+z*grly[14])-ylmcoef[8]*(rly[1]*tz+grly[5]*r2); - double tmp3 = ylmcoef[9]*dr[2]; + double tmp3 = ylmcoef[9]*z; rly[10] = tmp3*rly[5]-ylmcoef[10]*rly[2]*r2;//l=3,m=1 grly[30] = tmp3*grly[15]-ylmcoef[10]*(grly[6]*r2+rly[2]*tx); grly[31] = tmp3*grly[16]-ylmcoef[10]*(grly[7]*r2+rly[2]*ty); - grly[32] = ylmcoef[9]*(dr[2]*grly[17]+rly[5])-ylmcoef[10]*(grly[8]*r2+rly[2]*tz); + grly[32] = 
ylmcoef[9]*(z*grly[17]+rly[5])-ylmcoef[10]*(grly[8]*r2+rly[2]*tz); rly[11] = tmp3*rly[6]-ylmcoef[10]*rly[3]*r2;//l=3,m=-1 grly[33] = tmp3*grly[18]-ylmcoef[10]*(grly[9]*r2+rly[3]*tx); grly[34] = tmp3*grly[19]-ylmcoef[10]*(grly[10]*r2+rly[3]*ty); - grly[35] = ylmcoef[9]*(dr[2]*grly[20]+rly[6])-ylmcoef[10]*(grly[11]*r2+rly[3]*tz); + grly[35] = ylmcoef[9]*(z*grly[20]+rly[6])-ylmcoef[10]*(grly[11]*r2+rly[3]*tz); - double tmp4 = ylmcoef[11]*dr[2]; + double tmp4 = ylmcoef[11]*z; rly[12] = tmp4*rly[7];//l=3,m=2 grly[36] = tmp4*grly[21]; grly[37] = tmp4*grly[22]; - grly[38] = ylmcoef[11]*(dr[2]*grly[23]+rly[7]); + grly[38] = ylmcoef[11]*(z*grly[23]+rly[7]); rly[13] = tmp4*rly[8];//l=3,m=-2 grly[39] = tmp4*grly[24]; grly[40] = tmp4*grly[25]; - grly[41] = ylmcoef[11]*(dr[2]*grly[26]+rly[8]); + grly[41] = ylmcoef[11]*(z*grly[26]+rly[8]); - double tmp5 = ylmcoef[14]*dr[0]; + double tmp5 = ylmcoef[14]*x; rly[14] = ylmcoef[12]*rly[10]-ylmcoef[13]*rly[2]*r2-tmp5*rly[7];//l=3,m=3 - grly[42] = ylmcoef[12]*grly[30]-ylmcoef[13]*(rly[2]*tx+grly[6]*r2)-ylmcoef[14]*(rly[7]+dr[0]*grly[21]); + grly[42] = ylmcoef[12]*grly[30]-ylmcoef[13]*(rly[2]*tx+grly[6]*r2)-ylmcoef[14]*(rly[7]+x*grly[21]); grly[43] = ylmcoef[12]*grly[31]-ylmcoef[13]*(rly[2]*ty+grly[7]*r2)-tmp5*grly[22]; grly[44] = ylmcoef[12]*grly[32]-ylmcoef[13]*(rly[2]*tz+grly[8]*r2)-tmp5*grly[23]; rly[15] = ylmcoef[12]*rly[11]-ylmcoef[13]*rly[3]*r2-tmp5*rly[8];//l=3,m=-3 - grly[45] = ylmcoef[12]*grly[33]-ylmcoef[13]*(rly[3]*tx+grly[9]*r2)-ylmcoef[14]*(rly[8]+dr[0]*grly[24]); + grly[45] = ylmcoef[12]*grly[33]-ylmcoef[13]*(rly[3]*tx+grly[9]*r2)-ylmcoef[14]*(rly[8]+x*grly[24]); grly[46] = ylmcoef[12]*grly[34]-ylmcoef[13]*(rly[3]*ty+grly[10]*r2)-tmp5*grly[25]; grly[47] = ylmcoef[12]*grly[35]-ylmcoef[13]*(rly[3]*tz+grly[11]*r2)-tmp5*grly[26]; if (nwl == 3) return; @@ -272,53 +276,53 @@ static __device__ void grad_rl_sph_harm( /*************************** L = 4 ***************************/ - rly[16] = 
ylmcoef[15]*dr[2]*rly[9]-ylmcoef[16]*rly[4]*r2;//l=4,m=0 - grly[48] = ylmcoef[15]*dr[2]*grly[27]-ylmcoef[16]*(rly[4]*tx+grly[12]*r2); - grly[49] = ylmcoef[15]*dr[2]*grly[28]-ylmcoef[16]*(rly[4]*ty+grly[13]*r2); - grly[50] = ylmcoef[15]*(dr[2]*grly[29]+rly[9])-ylmcoef[16]*(rly[4]*tz+grly[14]*r2); + rly[16] = ylmcoef[15]*z*rly[9]-ylmcoef[16]*rly[4]*r2;//l=4,m=0 + grly[48] = ylmcoef[15]*z*grly[27]-ylmcoef[16]*(rly[4]*tx+grly[12]*r2); + grly[49] = ylmcoef[15]*z*grly[28]-ylmcoef[16]*(rly[4]*ty+grly[13]*r2); + grly[50] = ylmcoef[15]*(z*grly[29]+rly[9])-ylmcoef[16]*(rly[4]*tz+grly[14]*r2); - double tmp6 = ylmcoef[17]*dr[2]; + double tmp6 = ylmcoef[17]*z; rly[17] = tmp6*rly[10]-ylmcoef[18]*rly[5]*r2;//l=4,m=1 grly[51] = tmp6*grly[30]-ylmcoef[18]*(rly[5]*tx+grly[15]*r2); grly[52] = tmp6*grly[31]-ylmcoef[18]*(rly[5]*ty+grly[16]*r2); - grly[53] = ylmcoef[17]*(dr[2]*grly[32]+rly[10])-ylmcoef[18]*(rly[5]*tz+grly[17]*r2); + grly[53] = ylmcoef[17]*(z*grly[32]+rly[10])-ylmcoef[18]*(rly[5]*tz+grly[17]*r2); rly[18] = tmp6*rly[11]-ylmcoef[18]*rly[6]*r2;//l=4,m=-1 grly[54] = tmp6*grly[33]-ylmcoef[18]*(rly[6]*tx+grly[18]*r2); grly[55] = tmp6*grly[34]-ylmcoef[18]*(rly[6]*ty+grly[19]*r2); - grly[56] = ylmcoef[17]*(dr[2]*grly[35]+rly[11])-ylmcoef[18]*(rly[6]*tz+grly[20]*r2); + grly[56] = ylmcoef[17]*(z*grly[35]+rly[11])-ylmcoef[18]*(rly[6]*tz+grly[20]*r2); - double tmp7 = ylmcoef[19]*dr[2]; + double tmp7 = ylmcoef[19]*z; rly[19] = tmp7*rly[12]-ylmcoef[20]*rly[7]*r2;//l=4,m=2 grly[57] = tmp7*grly[36]-ylmcoef[20]*(rly[7]*tx+grly[21]*r2); grly[58] = tmp7*grly[37]-ylmcoef[20]*(rly[7]*ty+grly[22]*r2); - grly[59] = ylmcoef[19]*(dr[2]*grly[38]+rly[12])-ylmcoef[20]*(rly[7]*tz+grly[23]*r2); + grly[59] = ylmcoef[19]*(z*grly[38]+rly[12])-ylmcoef[20]*(rly[7]*tz+grly[23]*r2); rly[20] = tmp7*rly[13]-ylmcoef[20]*rly[8]*r2;//l=4,m=-2 grly[60] = tmp7*grly[39]-ylmcoef[20]*(rly[8]*tx+grly[24]*r2); grly[61] = tmp7*grly[40]-ylmcoef[20]*(rly[8]*ty+grly[25]*r2); - grly[62] = 
ylmcoef[19]*(dr[2]*grly[41]+rly[13])-ylmcoef[20]*(rly[8]*tz+grly[26]*r2); + grly[62] = ylmcoef[19]*(z*grly[41]+rly[13])-ylmcoef[20]*(rly[8]*tz+grly[26]*r2); - double tmp8 = 3.0*dr[2]; + double tmp8 = 3.0*z; rly[21] = tmp8*rly[14];//l=4,m=3 grly[63] = tmp8*grly[42]; grly[64] = tmp8*grly[43]; - grly[65] = 3.0*(dr[2]*grly[44]+rly[14]); + grly[65] = 3.0*(z*grly[44]+rly[14]); rly[22] = tmp8*rly[15];//l=4,m=-3 grly[66] = tmp8*grly[45]; grly[67] = tmp8*grly[46]; - grly[68] = 3.0*(dr[2]*grly[47]+rly[15]); + grly[68] = 3.0*(z*grly[47]+rly[15]); - double tmp9 = ylmcoef[23]*dr[0]; + double tmp9 = ylmcoef[23]*x; rly[23] = ylmcoef[21]*rly[19]-ylmcoef[22]*rly[7]*r2-tmp9*rly[14];//l=4,m=4 - grly[69] = ylmcoef[21]*grly[57]-ylmcoef[22]*(rly[7]*tx+grly[21]*r2)-ylmcoef[23]*(dr[0]*grly[42]+rly[14]); + grly[69] = ylmcoef[21]*grly[57]-ylmcoef[22]*(rly[7]*tx+grly[21]*r2)-ylmcoef[23]*(x*grly[42]+rly[14]); grly[70] = ylmcoef[21]*grly[58]-ylmcoef[22]*(rly[7]*ty+grly[22]*r2)-tmp9*grly[43]; grly[71] = ylmcoef[21]*grly[59]-ylmcoef[22]*(rly[7]*tz+grly[23]*r2)-tmp9*grly[44]; rly[24] = ylmcoef[21]*rly[20]-ylmcoef[22]*rly[8]*r2-tmp9*rly[15];//l=4,m=-4 - grly[72] = ylmcoef[21]*grly[60]-ylmcoef[22]*(rly[8]*tx+grly[24]*r2)-ylmcoef[23]*(dr[0]*grly[45]+rly[15]); + grly[72] = ylmcoef[21]*grly[60]-ylmcoef[22]*(rly[8]*tx+grly[24]*r2)-ylmcoef[23]*(x*grly[45]+rly[15]); grly[73] = ylmcoef[21]*grly[61]-ylmcoef[22]*(rly[8]*ty+grly[25]*r2)-tmp9*grly[46]; grly[74] = ylmcoef[21]*grly[62]-ylmcoef[22]*(rly[8]*tz+grly[26]*r2)-tmp9*grly[47]; @@ -327,63 +331,63 @@ static __device__ void grad_rl_sph_harm( /*************************** L = 5 ***************************/ - rly[25] = ylmcoef[24]*dr[2]*rly[16]-ylmcoef[25]*rly[9]*r2;//l=5,m=0 - grly[75] = ylmcoef[24]*dr[2]*grly[48]-ylmcoef[25]*(rly[9]*tx+grly[27]*r2); - grly[76] = ylmcoef[24]*dr[2]*grly[49]-ylmcoef[25]*(rly[9]*ty+grly[28]*r2); - grly[77] = ylmcoef[24]*(dr[2]*grly[50]+rly[16])-ylmcoef[25]*(rly[9]*tz+grly[29]*r2); + rly[25] = 
ylmcoef[24]*z*rly[16]-ylmcoef[25]*rly[9]*r2;//l=5,m=0 + grly[75] = ylmcoef[24]*z*grly[48]-ylmcoef[25]*(rly[9]*tx+grly[27]*r2); + grly[76] = ylmcoef[24]*z*grly[49]-ylmcoef[25]*(rly[9]*ty+grly[28]*r2); + grly[77] = ylmcoef[24]*(z*grly[50]+rly[16])-ylmcoef[25]*(rly[9]*tz+grly[29]*r2); - double tmp10 = ylmcoef[26]*dr[2]; + double tmp10 = ylmcoef[26]*z; rly[26] = tmp10*rly[17]-ylmcoef[27]*rly[10]*r2;//l=5,m=1 grly[78] = tmp10*grly[51]-ylmcoef[27]*(rly[10]*tx+grly[30]*r2); grly[79] = tmp10*grly[52]-ylmcoef[27]*(rly[10]*ty+grly[31]*r2); - grly[80] = ylmcoef[26]*(dr[2]*grly[53]+rly[17])-ylmcoef[27]*(rly[10]*tz+grly[32]*r2); + grly[80] = ylmcoef[26]*(z*grly[53]+rly[17])-ylmcoef[27]*(rly[10]*tz+grly[32]*r2); rly[27] = tmp10*rly[18]-ylmcoef[27]*rly[11]*r2;//l=5,m=-1 grly[81] = tmp10*grly[54]-ylmcoef[27]*(rly[11]*tx+grly[33]*r2); grly[82] = tmp10*grly[55]-ylmcoef[27]*(rly[11]*ty+grly[34]*r2); - grly[83] = ylmcoef[26]*(dr[2]*grly[56]+rly[18])-ylmcoef[27]*(rly[11]*tz+grly[35]*r2); + grly[83] = ylmcoef[26]*(z*grly[56]+rly[18])-ylmcoef[27]*(rly[11]*tz+grly[35]*r2); - double tmp11 = ylmcoef[28]*dr[2]; + double tmp11 = ylmcoef[28]*z; rly[28] = tmp11*rly[19]-ylmcoef[29]*rly[12]*r2;//l=5,m=2 grly[84] = tmp11*grly[57]-ylmcoef[29]*(rly[12]*tx+grly[36]*r2); grly[85] = tmp11*grly[58]-ylmcoef[29]*(rly[12]*ty+grly[37]*r2); - grly[86] = ylmcoef[28]*(dr[2]*grly[59]+rly[19])-ylmcoef[29]*(rly[12]*tz+grly[38]*r2); + grly[86] = ylmcoef[28]*(z*grly[59]+rly[19])-ylmcoef[29]*(rly[12]*tz+grly[38]*r2); rly[29] = tmp11*rly[20]-ylmcoef[29]*rly[13]*r2;//l=5,m=-2 grly[87] = tmp11*grly[60]-ylmcoef[29]*(rly[13]*tx+grly[39]*r2); grly[88] = tmp11*grly[61]-ylmcoef[29]*(rly[13]*ty+grly[40]*r2); - grly[89] = ylmcoef[28]*(dr[2]*grly[62]+rly[20])-ylmcoef[29]*(rly[13]*tz+grly[41]*r2); + grly[89] = ylmcoef[28]*(z*grly[62]+rly[20])-ylmcoef[29]*(rly[13]*tz+grly[41]*r2); - double tmp12 = ylmcoef[30]*dr[2]; + double tmp12 = ylmcoef[30]*z; rly[30] = tmp12*rly[21]-ylmcoef[31]*rly[14]*r2;//l=5,m=3 grly[90] = 
tmp12*grly[63]-ylmcoef[31]*(grly[42]*r2+rly[14]*tx); grly[91] = tmp12*grly[64]-ylmcoef[31]*(grly[43]*r2+rly[14]*ty); - grly[92] = ylmcoef[30]*(dr[2]*grly[65]+rly[21])-ylmcoef[31]*(grly[44]*r2+rly[14]*tz); + grly[92] = ylmcoef[30]*(z*grly[65]+rly[21])-ylmcoef[31]*(grly[44]*r2+rly[14]*tz); rly[31] = tmp12*rly[22]-ylmcoef[31]*rly[15]*r2;//l=5,m=-3 grly[93] = tmp12*grly[66]-ylmcoef[31]*(grly[45]*r2+rly[15]*tx); grly[94] = tmp12*grly[67]-ylmcoef[31]*(grly[46]*r2+rly[15]*ty); - grly[95] = ylmcoef[30]*(dr[2]*grly[68]+rly[22])-ylmcoef[31]*(grly[47]*r2+rly[15]*tz); + grly[95] = ylmcoef[30]*(z*grly[68]+rly[22])-ylmcoef[31]*(grly[47]*r2+rly[15]*tz); - double tmp13 = ylmcoef[32]*dr[2]; + double tmp13 = ylmcoef[32]*z; rly[32] = tmp13*rly[23];//l=5,m=4 grly[96] = tmp13*grly[69]; grly[97] = tmp13*grly[70]; - grly[98] = ylmcoef[32]*(rly[23]+dr[2]*grly[71]); + grly[98] = ylmcoef[32]*(rly[23]+z*grly[71]); rly[33] = tmp13*rly[24];//l=5,m=-4 grly[99] = tmp13*grly[72]; grly[100] = tmp13*grly[73]; - grly[101] = ylmcoef[32]*(rly[24]+dr[2]*grly[74]); + grly[101] = ylmcoef[32]*(rly[24]+z*grly[74]); - double tmp14 = ylmcoef[35]*dr[0]; + double tmp14 = ylmcoef[35]*x; rly[34] = ylmcoef[33]*rly[30]-ylmcoef[34]*rly[14]*r2-tmp14*rly[23];//l=5,m=5 - grly[102] = ylmcoef[33]*grly[90]-ylmcoef[34]*(rly[14]*tx+grly[42]*r2)-ylmcoef[35]*(dr[0]*grly[69]+rly[23]); + grly[102] = ylmcoef[33]*grly[90]-ylmcoef[34]*(rly[14]*tx+grly[42]*r2)-ylmcoef[35]*(x*grly[69]+rly[23]); grly[103] = ylmcoef[33]*grly[91]-ylmcoef[34]*(rly[14]*ty+grly[43]*r2)-tmp14*grly[70]; grly[104] = ylmcoef[33]*grly[92]-ylmcoef[34]*(rly[14]*tz+grly[44]*r2)-tmp14*grly[71]; rly[35] = ylmcoef[33]*rly[31]-ylmcoef[34]*rly[15]*r2-tmp14*rly[24];//l=5,m=-5 - grly[105] = ylmcoef[33]*grly[93]-ylmcoef[34]*(rly[15]*tx+grly[45]*r2)-ylmcoef[35]*(dr[0]*grly[72]+rly[24]); + grly[105] = ylmcoef[33]*grly[93]-ylmcoef[34]*(rly[15]*tx+grly[45]*r2)-ylmcoef[35]*(x*grly[72]+rly[24]); grly[106] = 
ylmcoef[33]*grly[94]-ylmcoef[34]*(rly[15]*ty+grly[46]*r2)-tmp14*grly[73]; grly[107] = ylmcoef[33]*grly[95]-ylmcoef[34]*(rly[15]*tz+grly[47]*r2)-tmp14*grly[74]; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp index 44603560d2..79c4b29c23 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp @@ -103,7 +103,7 @@ void PhiOperator::phi_mul_vldr3( } } -// hr(iwt_i,iwt_j) = \sum_{ir} phi_i(ir,iwt_i) * phi_i(ir,iwt_j) +// hr(iwt_i,iwt_j) += \sum_{ir} phi_i(ir,iwt_i) * phi_i(ir,iwt_j) // this is a thread-safe function template void PhiOperator::phi_mul_phi( From 3f1b710df085edde8407cec73985a5b44e0064d8 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 9 May 2025 22:56:07 +0800 Subject: [PATCH 06/63] enable fvl calculation --- .../module_gint/CMakeLists.txt | 1 + .../module_gint/temp_gint/batch_biggrid.cpp | 1 + .../module_gint/temp_gint/batch_biggrid.h | 4 + .../module_gint/temp_gint/gint_fvl_gpu.cpp | 131 ++++++++++++++ .../module_gint/temp_gint/gint_fvl_gpu.h | 61 +++++++ .../module_gint/temp_gint/gint_info.h | 1 + .../module_gint/temp_gint/gint_interface.cpp | 14 +- .../temp_gint/kernel/phi_operator_gpu.cu | 52 ++++++ .../temp_gint/kernel/phi_operator_gpu.h | 20 ++- .../temp_gint/kernel/phi_operator_kernel.cu | 168 +++++++++++++++++- .../temp_gint/kernel/phi_operator_kernel.cuh | 33 +++- .../module_gint/temp_gint/phi_operator.cpp | 2 +- 12 files changed, 479 insertions(+), 9 deletions(-) create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index c284188c3c..6b771efc23 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ 
b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -61,6 +61,7 @@ if(NEW_GINT) temp_gint/batch_biggrid temp_gint/gint_vl_gpu.cpp temp_gint/gint_rho_gpu.cpp + temp_gint/gint_fvl_gpu.cpp temp_gint/kernel/dgemm_vbatch.cu ) endif() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp index 5f514b669d..8372506e46 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp @@ -19,6 +19,7 @@ BatchBigGrid::BatchBigGrid(std::vector> biggrids) { max_nw_ = std::max(max_nw_, atom->get_nw()); } + max_atoms_num_per_bgrid_ = std::max(max_atoms_num_per_bgrid_, biggrid->get_atoms_num()); atoms_num_ += biggrid->get_atoms_num(); atom_pairs_num += std::pow(biggrid->get_atoms_num(), 2); phi_len_ += biggrid->get_phi_len() * biggrid->get_mgrids_num(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h index 401e1c38e9..4556fac707 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h @@ -16,6 +16,7 @@ class BatchBigGrid int get_batch_size() const { return biggrids_.size(); }; int get_atoms_num() const { return atoms_num_; }; int get_phi_len() const { return phi_len_;} + int get_max_atoms_num_per_bgrid() const { return max_atoms_num_per_bgrid_; }; bool empty() {return atoms_num_ == 0; }; static int get_max_batch_size() { return max_batch_size_; }; static int get_max_atoms_num() { return max_atoms_num_; }; @@ -33,6 +34,9 @@ class BatchBigGrid // number of atoms in the batch int atoms_num_ = 0; + // the max number of atoms of a single biggrid + int max_atoms_num_per_bgrid_ = 0; + // the max number of biggrids of a biggrids batch static int max_batch_size_; // the max number of total atoms of a biggrids batch diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp new file mode 100644 index 0000000000..e22cf00d69 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp @@ -0,0 +1,131 @@ +#include "gint_fvl_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_fvl_gpu::cal_gint() +{ + init_dm_gint_(); + transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_cpu_to_gpu_(); + cal_fvl_svl_(); + transfer_gpu_to_cpu_(); +} + +void Gint_fvl_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void Gint_fvl_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + vr_eff_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is]->get_nnr(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is]->get_wrapper(), + dm_gint_vec_[is]->get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + vr_eff_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_vec_[is].get_device_ptr(), vr_eff_[is], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } + if (isforce_) + { + fvl_d_ = CudaMemWrapper(gint_info_->get_nat() * 3, 0, true); + } + if (isstress_) + { + svl_d_ = CudaMemWrapper(6, 0, true); + } +} + +void Gint_fvl_gpu::transfer_gpu_to_cpu_() +{ + if (isforce_) + { + fvl_d_.copy_device_to_host_sync(); + for (int iat = 0; iat < gint_info_->get_nat(); iat++) + { + for (int j = 0; j < 3; j++) + { + fvl_[0](iat, j) += fvl_d_.get_host_ptr()[iat * 3 + j]; + } + } + } + if (isstress_) + { + svl_d_.copy_device_to_host_sync(); + svl_[0](0, 0) += svl_d_.get_host_ptr()[0]; + svl_[0](0, 1) += 
svl_d_.get_host_ptr()[1]; + svl_[0](0, 2) += svl_d_.get_host_ptr()[2]; + svl_[0](1, 1) += svl_d_.get_host_ptr()[3]; + svl_[0](1, 2) += svl_d_.get_host_ptr()[4]; + svl_[0](2, 2) += svl_d_.get_host_ptr()[5]; + } +} + +void Gint_fvl_gpu::cal_fvl_svl_() +{ +#pragma omp parallel + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), + dphi_y.get_device_ptr(), + dphi_z.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + const bool is_symm = false; + phi_op.phi_mul_vldr3(vr_eff_d_vec_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_dm(phi_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + *dm_gint_vec_[is], is_symm, phi_vldr3_dm.get_device_ptr()); + if (isforce_) + { + phi_op.phi_dot_dphi(phi_vldr3_dm.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + dphi_z.get_device_ptr(), fvl_d_.get_device_ptr()); + } + if (isstress_) + { + phi_op.phi_dot_dphi_r(phi_vldr3_dm.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + 
dphi_z.get_device_ptr(), svl_d_.get_device_ptr()); + } + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h new file mode 100644 index 0000000000..1ae52a0755 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "module_base/matrix.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_fvl_gpu : public Gint +{ + public: + Gint_fvl_gpu( + const int nspin, + const std::vector& vr_eff, + const std::vector*>& dm_vec, + const bool isforce, + const bool isstress, + ModuleBase::matrix* fvl, + ModuleBase::matrix* svl) + : nspin_(nspin), vr_eff_(vr_eff), dm_vec_(dm_vec), + isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), + dr3_(gint_info_->get_mgrid_volume()) {}; + + void cal_gint() override; + + private: + void init_dm_gint_(); + + void cal_fvl_svl_(); + + void transfer_cpu_to_gpu_(); + void transfer_gpu_to_cpu_(); + // input + const int nspin_; + std::vector vr_eff_; + std::vector*> dm_vec_; + const bool isforce_; + const bool isstress_; + + // output + ModuleBase::matrix* fvl_; + ModuleBase::matrix* svl_; + + // intermediate variables + std::vector>> dm_gint_vec_; + + double dr3_; + + // GPU memory + std::vector> vr_eff_d_vec_; + std::vector> dm_gint_d_vec_; + CudaMemWrapper fvl_d_; + CudaMemWrapper svl_d_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h index d79043fb2e..1356af7929 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h @@ -36,6 +36,7 @@ class GintInfo // getter functions std::vector>& get_biggrids() { return biggrids_; }; std::shared_ptr get_bgrid_info() const { return biggrid_info_; }; + int get_nat() const { return ucell_->nat; }; // return the number of atoms in the unitcell double get_local_mgrid_num() const { return localcell_info_->get_mgrids_num(); }; double get_mgrid_volume() const { return meshgrid_info_->get_volume(); }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index fda3879780..f3ac59930f 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -6,6 +6,7 @@ #include "gint_vl_nspin4.h" #include "gint_vl_metagga_nspin4.h" #include "gint_fvl.h" +#include "gint_fvl_gpu.h" #include "gint_fvl_meta.h" #include "gint_rho.h" #include "gint_rho_gpu.h" @@ -106,8 +107,17 @@ void cal_gint_fvl( ModuleBase::matrix* svl) { ModuleBase::timer::tick("Gint", "cal_gint_fvl"); - Gint_fvl gint_fvl(nspin, vr_eff, dm_vec, isforce, isstress, fvl, svl); - gint_fvl.cal_gint(); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_fvl_gpu gint_fvl_gpu(nspin, vr_eff, dm_vec, isforce, isstress, fvl, svl); + gint_fvl_gpu.cal_gint(); + } else +#endif + { + Gint_fvl gint_fvl(nspin, vr_eff, dm_vec, isforce, isstress, fvl, svl); + gint_fvl.cal_gint(); + } ModuleBase::timer::tick("Gint", "cal_gint_fvl"); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu index d7a6e3a4d8..956bf54afc 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu @@ -372,4 +372,56 @@ void PhiOperatorGpu::phi_dot_phi( rho_d); } +void 
PhiOperatorGpu::phi_dot_dphi( + const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* fvl_d) const +{ + dim3 grid_dim(bgrid_batch_->get_max_atoms_num_per_bgrid(), + bgrid_batch_->get_batch_size()); + dim3 threads_per_block(32); + phi_dot_dphi_kernel<<>>( + phi_d, + dphi_x_d, + dphi_y_d, + dphi_z_d, + mgrids_num_, + bgrids_phi_len_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + atoms_iat_.get_device_ptr(), + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->atom_nw_d, + fvl_d); +} + +void PhiOperatorGpu::phi_dot_dphi_r( + const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* svl_d) const +{ + dim3 grid_dim(mgrids_num_, + bgrid_batch_->get_batch_size()); + dim3 threads_per_block(32); + phi_dot_dphi_r_kernel<<>>( + phi_d, + dphi_x_d, + dphi_y_d, + dphi_z_d, + mgrids_num_, + bgrids_phi_len_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + gint_gpu_vars_->mgrids_pos_d, + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->atom_nw_d, + svl_d); +} + } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h index ae9ca61ad0..4cb43ffab4 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h @@ -43,9 +43,23 @@ class PhiOperatorGpu double* phi_dm_d); void phi_dot_phi( - const double* phi_i, - const double* phi_j, - double* rho) const; + const double* phi_i_d, + const double* phi_j_d, + double* rho_d) const; + + void phi_dot_dphi( + const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* fvl_d) const; + + void phi_dot_dphi_r( + 
const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* svl_d) const; private: std::shared_ptr bgrid_batch_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu index 5dcdabd2ff..9d2886107a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu @@ -36,8 +36,8 @@ __global__ void set_phi_kernel( for (int atom_id = threadIdx.x; atom_id < atoms_num; atom_id += blockDim.x) { const int atom_type = iat2it[atoms_iat[atom_id + pre_atoms_num]]; - const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; - const double3 coord = make_double3(mgrid_pos.x-rcoord.x, + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; // rcoord is the ralative coordinate of an atom and a biggrid + const double3 coord = make_double3(mgrid_pos.x-rcoord.x, // coord is the relative coordinate of an atom and a meshgrid mgrid_pos.y-rcoord.y, mgrid_pos.z-rcoord.z); double dist = sqrt(coord.x * coord.x + coord.y * coord.y + coord.z * coord.z); @@ -181,6 +181,7 @@ __global__ void set_phi_dphi_kernel( dphi_x[phi_idx + iw] = tmpdphi_rly * coord.x + tmprl * grly[idx_ylm * 3 + 0]; dphi_y[phi_idx + iw] = tmpdphi_rly * coord.y + tmprl * grly[idx_ylm * 3 + 1]; dphi_z[phi_idx + iw] = tmpdphi_rly * coord.z + tmprl * grly[idx_ylm * 3 + 2]; + iw_nr += nrmax; } } else @@ -273,4 +274,167 @@ __global__ void phi_dot_phi_kernel( } } +__global__ void phi_dot_dphi_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ atoms_iat, 
+ const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* force) +{ + __shared__ double s_data[32 * 3]; // the length of s_data equals the max warp num of a block times 3 + const int bgrid_id = blockIdx.y; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const int bgrid_phi_len = bgrids_phi_len[bgrid_id]; + const int tid = threadIdx.x; + const int warp_id = tid / 32; + const int lane_id = tid % 32; + + for (int atom_id = blockIdx.x; atom_id < atoms_num; atom_id += gridDim.x) + { + const int atom_phi_start = atoms_phi_start[atom_id + pre_atoms_num]; + const int iat = atoms_iat[atom_id + pre_atoms_num]; + const int nw = atom_nw[iat2it[iat]]; + double f[3] = {0.0, 0.0, 0.0}; + for (int mgrid_id = 0; mgrid_id < mgrids_per_bgrid; mgrid_id++) + { + const int phi_start = atom_phi_start + mgrid_id * bgrid_phi_len; + for (int iw = tid; iw < nw; iw += blockDim.x) + { + int phi_idx = phi_start + iw; + f[0] += phi[phi_idx] * dphi_x[phi_idx]; + f[1] += phi[phi_idx] * dphi_y[phi_idx]; + f[2] += phi[phi_idx] * dphi_z[phi_idx]; + } + } + + // reduce the force in each block + for (int i = 0; i < 3; i++) + { + f[i] = warpReduceSum(f[i]); + } + + if (lane_id == 0) + { + for (int i = 0; i < 3; i++) + { + s_data[warp_id * 3 + i] = f[i]; + } + } + __syncthreads(); + + for (int i = 0; i < 3; i++) + { + f[i] = (tid < blockDim.x / 32) ? 
s_data[tid * 3 + i] : 0; + } + if (warp_id == 0) + { + for (int i = 0; i < 3; i++) + { + f[i] = warpReduceSum(f[i]); + } + } + if (tid == 0) + { + for (int i = 0; i < 3; i++) + { + atomicAdd(&force[iat * 3 + i], f[i] * 2); + } + } + } +} + +__global__ void phi_dot_dphi_r_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* __restrict__ svl) +{ + __shared__ double s_data[32 * 6]; // the length of s_data equals the max warp num of a block times 6 + const int tid = threadIdx.x; + const int bgrid_id = blockIdx.y; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const int bgrid_phi_len = bgrids_phi_len[bgrid_id]; + const int warp_id = tid / 32; + const int lane_id = tid % 32; + + double stress[6]{0.0}; + for (int mgrid_id = blockIdx.x; mgrid_id < mgrids_per_bgrid; mgrid_id += gridDim.x) + { + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + for (int atom_id = 0; atom_id < atoms_num; atom_id++) + { + const int atom_phi_start = atoms_phi_start[atom_id + pre_atoms_num] + mgrid_id * bgrid_phi_len; + const int iat = atoms_iat[atom_id + pre_atoms_num]; + const int nw = atom_nw[iat2it[iat]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; // rcoord is the ralative coordinate of an atom and a biggrid + const double3 coord = make_double3(mgrid_pos.x-rcoord.x, // coord is the relative coordinate of an atom and a meshgrid + mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z); + for (int iw = tid; iw < nw; iw += blockDim.x) + { + int phi_idx = 
atom_phi_start + iw; + stress[0] += phi[phi_idx] * dphi_x[phi_idx] * coord.x; + stress[1] += phi[phi_idx] * dphi_x[phi_idx] * coord.y; + stress[2] += phi[phi_idx] * dphi_x[phi_idx] * coord.z; + stress[3] += phi[phi_idx] * dphi_y[phi_idx] * coord.y; + stress[4] += phi[phi_idx] * dphi_y[phi_idx] * coord.z; + stress[5] += phi[phi_idx] * dphi_z[phi_idx] * coord.z; + } + } + } + + // reduce the stress in each block + for (int i = 0; i < 6; i++) + { + stress[i] = warpReduceSum(stress[i]); + } + + if (lane_id == 0) + { + for (int i = 0; i < 6; i++) + { + s_data[warp_id * 6 + i] = stress[i]; + } + } + __syncthreads(); + + for (int i = 0; i < 6; i++) + { + stress[i] = (tid < blockDim.x / 32) ? s_data[tid * 6 + i] : 0; + } + if (warp_id == 0) + { + for (int i = 0; i < 6; i++) + { + stress[i] = warpReduceSum(stress[i]); + } + } + if (tid == 0) + { + for (int i = 0; i < 6; i++) + { + atomicAdd(&svl[i], stress[i] * 2); + } + } +} + } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh index ff6c82a434..d32c346231 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh @@ -73,4 +73,35 @@ __global__ void phi_dot_phi_kernel( const int* __restrict__ bgrids_phi_len, // the length of phi on a mgrid of a biggrid const int* __restrict__ bgrids_phi_start, // the start idx in phi of each biggrid double* __restrict__ rho); // rho(ir) -} \ No newline at end of file + +__global__ void phi_dot_dphi_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* 
__restrict__ atoms_iat, + const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* force); + +__global__ void phi_dot_dphi_r_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* __restrict__ svl); + +} diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp index f65c3369d6..b8924864c7 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp @@ -109,10 +109,10 @@ void PhiOperator::phi_dot_dphi_r( for(int j = 0; j < biggrid_->get_atoms_num(); ++j) { const int start_idx = atoms_startidx_[j]; + const Vec3d& r3 = atoms_relative_coords_[j][i]; for(int k = 0; k < atoms_phi_len_[j]; ++k) { const int idx = i * cols_ + start_idx + k; - const Vec3d& r3 = atoms_relative_coords_[j][i]; const double phi_val = phi[idx]; sxx += phi_val * dphi_x[idx] * r3[0]; sxy += phi_val * dphi_x[idx] * r3[1]; From 5fb4dce6cd27f0d7046c2d9cbe59619f1046b91d Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 15 May 2025 00:19:26 +0800 Subject: [PATCH 07/63] small change --- source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp index 1a56e8a017..c2c27f1cf3 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp @@ -51,7 +51,8 @@ GintInfo::GintInfo( #ifdef __CUDA if(PARAM.inp.device == "gpu") { - init_bgrid_batches_(nbz_local); + const int batch_size = nbz_local; + init_bgrid_batches_(batch_size); gpu_vars_ = std::make_shared(biggrid_info_, ucell, Phi); } #endif From 6fd248e2c33e169104c3c3e5902f7e6a876e0bd9 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 15 May 2025 11:50:09 +0800 Subject: [PATCH 08/63] enable vlocal-metagga and tau calculation --- .../module_gint/CMakeLists.txt | 2 + .../module_gint/temp_gint/gint_interface.cpp | 36 +++++-- .../module_gint/temp_gint/gint_tau_gpu.cpp | 95 +++++++++++++++++++ .../module_gint/temp_gint/gint_tau_gpu.h | 49 ++++++++++ .../temp_gint/gint_vl_metagga_gpu.cpp | 95 +++++++++++++++++++ .../temp_gint/gint_vl_metagga_gpu.h | 55 +++++++++++ .../temp_gint/kernel/phi_operator_kernel.cu | 2 +- 7 files changed, 325 insertions(+), 9 deletions(-) create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 6b771efc23..199bfbeec0 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -62,6 +62,8 @@ if(NEW_GINT) temp_gint/gint_vl_gpu.cpp temp_gint/gint_rho_gpu.cpp temp_gint/gint_fvl_gpu.cpp + temp_gint/gint_vl_metagga_gpu.cpp + temp_gint/gint_tau_gpu.cpp temp_gint/kernel/dgemm_vbatch.cu ) endif() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index f3ac59930f..f1a807212d 100644 --- 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -3,6 +3,7 @@ #include "gint_vl.h" #include "gint_vl_gpu.h" #include "gint_vl_metagga.h" +#include "gint_vl_metagga_gpu.h" #include "gint_vl_nspin4.h" #include "gint_vl_metagga_nspin4.h" #include "gint_fvl.h" @@ -11,6 +12,7 @@ #include "gint_rho.h" #include "gint_rho_gpu.h" #include "gint_tau.h" +#include "gint_tau_gpu.h" namespace ModuleGint { @@ -23,8 +25,8 @@ void cal_gint_vl( #ifdef __CUDA if(PARAM.inp.device == "gpu") { - Gint_vl_gpu gint_vl_gpu(vr_eff, hR); - gint_vl_gpu.cal_gint(); + Gint_vl_gpu gint_vl(vr_eff, hR); + gint_vl.cal_gint(); } else #endif { @@ -50,8 +52,17 @@ void cal_gint_vl_metagga( HContainer* hR) { ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); - Gint_vl_metagga gint_vl_metagga(vr_eff, vfork, hR); - gint_vl_metagga.cal_gint(); + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_metagga_gpu gint_vl_metagga(vr_eff, vfork, hR); + gint_vl_metagga.cal_gint(); + } else +#endif + { + Gint_vl_metagga gint_vl_metagga(vr_eff, vfork, hR); + gint_vl_metagga.cal_gint(); + } ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); } @@ -75,8 +86,8 @@ void cal_gint_rho( #ifdef __CUDA if(PARAM.inp.device == "gpu") { - Gint_rho_gpu gint_rho_gpu(dm_vec, nspin, rho); - gint_rho_gpu.cal_gint(); + Gint_rho_gpu gint_rho(dm_vec, nspin, rho); + gint_rho.cal_gint(); } else #endif { @@ -92,8 +103,17 @@ void cal_gint_tau( double** tau) { ModuleBase::timer::tick("Gint", "cal_gint_tau"); - Gint_tau gint_tau(dm_vec, nspin, tau); - gint_tau.cal_gint(); + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_tau_gpu gint_tau(dm_vec, nspin, tau); + gint_tau.cal_gint(); + } else + #endif + { + Gint_tau gint_tau(dm_vec, nspin, tau); + gint_tau.cal_gint(); + } ModuleBase::timer::tick("Gint", "cal_gint_tau"); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp new file mode 100644 index 0000000000..1edbfc8147 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp @@ -0,0 +1,95 @@ +#include "gint_tau_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_tau_gpu::cal_gint() +{ + init_dm_gint_(); + transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_cpu_to_gpu_(); + cal_tau_(); + transfer_gpu_to_cpu_(); +} + +void Gint_tau_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void Gint_tau_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + kin_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is]->get_nnr(), 0, false); + kin_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is]->get_wrapper(), + dm_gint_vec_[is]->get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_tau_gpu::transfer_gpu_to_cpu_() +{ + for (int is = 0; is < nspin_; is++) + { + checkCuda(cudaMemcpy(kin_[is], kin_d_vec_[is].get_device_ptr(), + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyDeviceToHost)); + } +} + +void Gint_tau_gpu::cal_tau_() +{ +#pragma omp parallel + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. 
+ checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_dm(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(nullptr, + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), dphi_z.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + const bool is_symm = true; + phi_op.phi_mul_dm(dphi_x.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), *dm_gint_vec_[is], + is_symm, dphi_x_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_y.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), *dm_gint_vec_[is], + is_symm, dphi_y_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_z.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), *dm_gint_vec_[is], + is_symm, dphi_z_dm.get_device_ptr()); + phi_op.phi_dot_phi(dphi_x.get_device_ptr(), dphi_x_dm.get_device_ptr(), kin_d_vec_[is].get_device_ptr()); + phi_op.phi_dot_phi(dphi_y.get_device_ptr(), dphi_y_dm.get_device_ptr(), kin_d_vec_[is].get_device_ptr()); + phi_op.phi_dot_phi(dphi_z.get_device_ptr(), dphi_z_dm.get_device_ptr(), kin_d_vec_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } +} + +} diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h new file mode 
100644 index 0000000000..376fb919c3 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_tau_gpu : public Gint +{ + public: + Gint_tau_gpu( + const std::vector*>& dm_vec, + const int nspin, + double** tau) + : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {}; + + void cal_gint() override; + + private: + void init_dm_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_tau_(); + + // input + const std::vector*> dm_vec_; + const int nspin_; + + // output + double **kin_; + + //======================== + // Intermediate variables + //======================== + std::vector>> dm_gint_vec_; + + std::vector> dm_gint_d_vec_; + std::vector> kin_d_vec_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp new file mode 100644 index 0000000000..e044cd0691 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp @@ -0,0 +1,95 @@ +#include "gint_vl_metagga_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_vl_metagga_gpu::cal_gint() +{ + init_hr_gint_(); + transfer_cpu_to_gpu_(); + cal_hr_gint_(); + transfer_gpu_to_cpu_(); + compose_hr_gint(hr_gint_); + transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_), hR_); +} + +//======================== +// Private functions +//======================== + +void Gint_vl_metagga_gpu::init_hr_gint_() +{ + hr_gint_ = gint_info_->get_hr(); +} + +void Gint_vl_metagga_gpu::transfer_cpu_to_gpu_() +{ + hr_gint_d_ = 
CudaMemWrapper(hr_gint_->get_nnr(), 0, false); + vr_eff_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + vofk_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_.get_device_ptr(), vr_eff_, + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(vofk_d_.get_device_ptr(), vofk_, + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); +} + +void Gint_vl_metagga_gpu::transfer_gpu_to_cpu_() +{ + checkCuda(cudaMemcpy(hr_gint_->get_wrapper(), hr_gint_d_.get_device_ptr(), + hr_gint_->get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void Gint_vl_metagga_gpu::cal_hr_gint_() +{ +#pragma omp parallel + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), dphi_z.get_device_ptr()); + 
phi_op.phi_mul_vldr3(vr_eff_d_.get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_.get_device_ptr(), dr3_, + dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_.get_device_ptr(), dr3_, + dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_.get_device_ptr(), dr3_, + dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); + phi_op.phi_mul_phi_vldr3(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi_vldr3(dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi_vldr3(dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi_vldr3(dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } +} +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h new file mode 100644 index 0000000000..5669c24384 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_metagga_gpu : public Gint +{ + public: + Gint_vl_metagga_gpu( + const double* vr_eff, + const double* vofk, + HContainer* hR) + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + + void cal_gint() override; + + private: + + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + // 
note that only the upper triangle matrix of hR is calculated + // that's why we need compose_hr_gint() to fill the lower triangle matrix. + void cal_hr_gint_(); + + // input + const double* vr_eff_; + const double* vofk_; + + // output + HContainer* hR_; + + //======================== + // Intermediate variables + //======================== + double dr3_; + + std::shared_ptr> hr_gint_; + + CudaMemWrapper hr_gint_d_; + CudaMemWrapper vr_eff_d_; + CudaMemWrapper vofk_d_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu index 9d2886107a..ff9d3de2f5 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu @@ -270,7 +270,7 @@ __global__ void phi_dot_phi_kernel( if(tid == 0) { - rho[mgrid_local_idx] = tmp_sum; + rho[mgrid_local_idx] += tmp_sum; } } From 584d1514c4d7bf5c6ffd18f4c2a1018b91178770 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 15 May 2025 23:33:48 +0800 Subject: [PATCH 09/63] add set ddphi --- .../temp_gint/kernel/phi_operator_gpu.cu | 44 +++++- .../temp_gint/kernel/phi_operator_gpu.h | 5 +- .../temp_gint/kernel/phi_operator_kernel.cu | 145 +++++++++++++++++- .../temp_gint/kernel/phi_operator_kernel.cuh | 28 ++++ 4 files changed, 218 insertions(+), 4 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu index 956bf54afc..4a265c6b1d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu @@ -122,7 +122,7 @@ void PhiOperatorGpu::set_phi(double* phi_d) const phi_d); } -void PhiOperatorGpu::set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_y_d, 
double* dphi_z_d) +void PhiOperatorGpu::set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_y_d, double* dphi_z_d) const { dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); dim3 threads_per_block(64); @@ -153,6 +153,48 @@ void PhiOperatorGpu::set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_ dphi_z_d); } +void PhiOperatorGpu::set_ddphi(double* ddphi_xx_d, double* ddphi_xy_d, double* ddphi_xz_d, + double* ddphi_yy_d, double* ddphi_yz_d, double* ddphi_zz_d) const +{ + // Since the underlying implementation of `set_ddphi` uses `ddphi +=` instead of `ddphi =`, + // the ddphi array needs to be zeroed out at the beginning of the function. + checkCuda(cudaMemsetAsync(ddphi_xx_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_xy_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_xz_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_yy_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_yz_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_zz_d, 0, phi_len_ * sizeof(double), stream_)); + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + set_ddphi_kernel<<>>( + gint_gpu_vars_->nwmax, + mgrids_num_, + gint_gpu_vars_->nr_max, + gint_gpu_vars_->dr_uniform, + gint_gpu_vars_->ylmcoef_d, + gint_gpu_vars_->ucell_atom_nwl_d, + gint_gpu_vars_->atom_iw2_new_d, + gint_gpu_vars_->atom_iw2_ylm_d, + gint_gpu_vars_->atom_iw2_l_d, + gint_gpu_vars_->atom_nw_d, + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->rcut_d, + gint_gpu_vars_->psi_u_d, + gint_gpu_vars_->dpsi_u_d, + gint_gpu_vars_->mgrids_pos_d, + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + ddphi_xx_d, + ddphi_xy_d, + ddphi_xz_d, + ddphi_yy_d, + ddphi_yz_d, + ddphi_zz_d); +} + void 
PhiOperatorGpu::phi_mul_vldr3( const double* vl_d, const double dr3, diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h index 4cb43ffab4..3997c5cc28 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h @@ -21,7 +21,10 @@ class PhiOperatorGpu void set_phi(double* phi_d) const; - void set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_y_d, double* dphi_z_d); + void set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_y_d, double* dphi_z_d) const; + + void set_ddphi(double* ddphi_xx_d, double* ddphi_xy_d, double* ddphi_xz_d, + double* ddphi_yy_d, double* ddphi_yz_d, double* ddphi_zz_d) const; void phi_mul_vldr3( const double* vl_d, diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu index ff9d3de2f5..2bfae822ab 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu @@ -40,7 +40,7 @@ __global__ void set_phi_kernel( const double3 coord = make_double3(mgrid_pos.x-rcoord.x, // coord is the relative coordinate of an atom and a meshgrid mgrid_pos.y-rcoord.y, mgrid_pos.z-rcoord.z); - double dist = sqrt(coord.x * coord.x + coord.y * coord.y + coord.z * coord.z); + double dist = norm3d(coord.x, coord.y, coord.z); if (dist < rcut[atom_type]) { if (dist < 1.0E-9) @@ -129,7 +129,7 @@ __global__ void set_phi_dphi_kernel( const double3 coord = make_double3(mgrid_pos.x-rcoord.x, mgrid_pos.y-rcoord.y, mgrid_pos.z-rcoord.z); - double dist = sqrt(coord.x * coord.x + coord.y * coord.y + coord.z * coord.z); + double dist = norm3d(coord.x, coord.y, coord.z); if (dist < rcut[atom_type]) { if (dist < 1.0E-9) @@ -202,6 
+202,147 @@ __global__ void set_phi_dphi_kernel( } } +// The code for `set_ddphi_kernel` is quite difficult to understand. +// To grasp it, you better refer to the CPU function `set_ddphi` +__global__ void set_ddphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ ddphi_xx, + double* __restrict__ ddphi_xy, + double* __restrict__ ddphi_xz, + double* __restrict__ ddphi_yy, + double* __restrict__ ddphi_yz, + double* __restrict__ ddphi_zz) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + + for (int atom_id = threadIdx.x; atom_id < atoms_num; atom_id += blockDim.x) + { + const int atom_type = iat2it[atoms_iat[atom_id + pre_atoms_num]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; + double coord[3]{mgrid_pos.x-rcoord.x, + mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z}; + double dist = norm3d(coord[0], coord[1], coord[2]); + if (dist < rcut[atom_type]) + { + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for(int i = 0; i < 6; i++) + { + coord[i/2] += std::pow(-1, i%2) * 0.0001; + double dist = norm3d(coord[0], coord[1], 
coord[2]); + if (dist < 1.0E-9) + { dist += 1.0E-9; } + // since nwl is less or equal than 5, the size of rly is (5+1)^2 + // size of grly = 36 * 3 + double rly[36]; + double grly[36 * 3]; + const int nwl = ucell_atom_nwl[atom_type]; + grad_rl_sph_harm(nwl, ylmcoef, coord[0], coord[1], coord[2], rly, grly); + + // interpolation + const double pos = dist / dr_uniform; + const int ip = static_cast(pos); + const double x0 = pos - ip; + const double x1 = 1.0 - x0; + const double x2 = 2.0 - x0; + const double x3 = 3.0 - x0; + const double x12 = x1 * x2 / 6; + const double x03 = x0 * x3 / 2; + double tmp = 0; + double dtmp = 0; + const int it_nw = atom_type * nwmax; + int iw_nr = it_nw * nrmax + ip; + for (int iw = 0; iw < atom_nw[atom_type]; iw++) + { + if (atom_iw2_new[it_nw + iw]) + { + tmp = x12 * (psi_u[iw_nr] * x3 + psi_u[iw_nr + 3] * x0) + + x03 * (psi_u[iw_nr + 1] * x2 - psi_u[iw_nr + 2] * x1); + dtmp = x12 * (dpsi_u[iw_nr] * x3 + dpsi_u[iw_nr + 3] * x0) + + x03 * (dpsi_u[iw_nr + 1] * x2 - dpsi_u[iw_nr + 2] * x1); + } + const int iw_l = atom_iw2_l[it_nw + iw]; + const int idx_ylm = atom_iw2_ylm [it_nw + iw]; + const double rl = pow_int(dist, iw_l); + const double tmprl = tmp / rl; + const double tmpdphi_rly = (dtmp - tmp * iw_l / dist) / rl * rly[idx_ylm] / dist; + + double dphi[3]; + dphi[0] = tmpdphi_rly * coord[0] + tmprl * grly[idx_ylm * 3 + 0]; + dphi[1] = tmpdphi_rly * coord[1] + tmprl * grly[idx_ylm * 3 + 1]; + dphi[2] = tmpdphi_rly * coord[2] + tmprl * grly[idx_ylm * 3 + 2]; + + if (i == 0) + { + ddphi_xx[phi_idx + iw] += dphi[0]; + ddphi_xy[phi_idx + iw] += dphi[1]; + ddphi_xz[phi_idx + iw] += dphi[2]; + } else if (i == 1) + { + ddphi_xx[phi_idx + iw] -= dphi[0]; + ddphi_xy[phi_idx + iw] -= dphi[1]; + ddphi_xz[phi_idx + iw] -= dphi[2]; + } else if (i == 2) + { + ddphi_xy[phi_idx + iw] += dphi[0]; + ddphi_yy[phi_idx + iw] += dphi[1]; + ddphi_yz[phi_idx + iw] += dphi[2]; + } else if (i == 3) + { + ddphi_xy[phi_idx + iw] -= dphi[0]; + ddphi_yy[phi_idx + iw] 
-= dphi[1]; + ddphi_yz[phi_idx + iw] -= dphi[2]; + } else if (i == 4) + { + ddphi_xz[phi_idx + iw] += dphi[0]; + ddphi_yz[phi_idx + iw] += dphi[1]; + ddphi_zz[phi_idx + iw] += dphi[2]; + } else // i == 5 + { + ddphi_xz[phi_idx + iw] -= dphi[0]; + ddphi_yz[phi_idx + iw] -= dphi[1]; + ddphi_zz[phi_idx + iw] -= dphi[2]; + } + } + coord[i/2] -= std::pow(-1, i%2) * 0.0001; // recover coord + } + for (int iw = 0; iw < atom_nw[atom_type]; iw++) + { + ddphi_xx[phi_idx + iw] /= 0.0002; + ddphi_xy[phi_idx + iw] /= 0.0004; + ddphi_xz[phi_idx + iw] /= 0.0004; + ddphi_yy[phi_idx + iw] /= 0.0002; + ddphi_yz[phi_idx + iw] /= 0.0004; + ddphi_zz[phi_idx + iw] /= 0.0002; + } + } + } +} + __global__ void phi_mul_vldr3_kernel( const double* __restrict__ vl, const double dr3, diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh index d32c346231..4d32475542 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh @@ -53,6 +53,34 @@ __global__ void set_phi_dphi_kernel( double* __restrict__ dphi_y, double* __restrict__ dphi_z); +__global__ void set_ddphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* 
__restrict__ bgrids_phi_len, + double* __restrict__ ddphi_xx, + double* __restrict__ ddphi_xy, + double* __restrict__ ddphi_xz, + double* __restrict__ ddphi_yy, + double* __restrict__ ddphi_yz, + double* __restrict__ ddphi_zz); + __global__ void phi_mul_vldr3_kernel( const double* __restrict__ vl, const double dr3, From 0aa783f90e31eddb221dda1a68fefd743bb113b7 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 16 May 2025 13:10:33 +0800 Subject: [PATCH 10/63] enable gint_vl_nspin4_gpu --- .../module_gint/CMakeLists.txt | 1 + .../module_gint/temp_gint/gint_interface.cpp | 25 ++++-- .../temp_gint/gint_vl_nspin4_gpu.cpp | 88 +++++++++++++++++++ .../temp_gint/gint_vl_nspin4_gpu.h | 55 ++++++++++++ 4 files changed, 163 insertions(+), 6 deletions(-) create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 199bfbeec0..ab1371d4ca 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -63,6 +63,7 @@ if(NEW_GINT) temp_gint/gint_rho_gpu.cpp temp_gint/gint_fvl_gpu.cpp temp_gint/gint_vl_metagga_gpu.cpp + temp_gint/gint_vl_nspin4_gpu.cpp temp_gint/gint_tau_gpu.cpp temp_gint/kernel/dgemm_vbatch.cu ) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index f1a807212d..421ffb7a81 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -1,18 +1,22 @@ #include "gint_interface.h" #include "source_base/timer.h" #include "gint_vl.h" -#include "gint_vl_gpu.h" #include "gint_vl_metagga.h" -#include "gint_vl_metagga_gpu.h" #include "gint_vl_nspin4.h" #include 
"gint_vl_metagga_nspin4.h" #include "gint_fvl.h" -#include "gint_fvl_gpu.h" #include "gint_fvl_meta.h" #include "gint_rho.h" -#include "gint_rho_gpu.h" #include "gint_tau.h" + +#ifdef __CUDA +#include "gint_vl_gpu.h" +#include "gint_rho_gpu.h" +#include "gint_fvl_gpu.h" +#include "gint_vl_nspin4_gpu.h" +#include "gint_vl_metagga_gpu.h" #include "gint_tau_gpu.h" +#endif namespace ModuleGint { @@ -41,8 +45,17 @@ void cal_gint_vl( HContainer>* hR) { ModuleBase::timer::tick("Gint", "cal_gint_vl"); - Gint_vl_nspin4 gint_vl_nspin4(vr_eff, hR); - gint_vl_nspin4.cal_gint(); + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_nspin4_gpu gint_vl_nspin4(vr_eff, hR); + gint_vl_nspin4.cal_gint(); + } else + #endif + { + Gint_vl_nspin4 gint_vl_nspin4(vr_eff, hR); + gint_vl_nspin4.cal_gint(); + } ModuleBase::timer::tick("Gint", "cal_gint_vl"); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp new file mode 100644 index 0000000000..e08cbfa875 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp @@ -0,0 +1,88 @@ +#include "gint_vl_nspin4_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_vl_nspin4_gpu::cal_gint() +{ + init_hr_gint_(); + transfer_cpu_to_gpu_(); + cal_hr_gint_(); + transfer_gpu_to_cpu_(); + compose_hr_gint(hr_gint_part_, hr_gint_full_); + transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_full_), hR_); +} + +void Gint_vl_nspin4_gpu::init_hr_gint_() +{ + hr_gint_part_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + hr_gint_part_[i] = gint_info_->get_hr(); + } + const int npol = 2; + hr_gint_full_ = gint_info_->get_hr>(npol); +} + +void Gint_vl_nspin4_gpu::transfer_cpu_to_gpu_() +{ + vr_eff_d_.resize(nspin_); + hr_gint_part_d_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + 
hr_gint_part_d_[i] = CudaMemWrapper(hr_gint_part_[i]->get_nnr(), 0, false); + vr_eff_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_[i].get_device_ptr(), vr_eff_[i], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_vl_nspin4_gpu::transfer_gpu_to_cpu_() +{ + for(int i = 0; i < nspin_; i++) + { + checkCuda(cudaMemcpy(hr_gint_part_[i]->get_wrapper(), hr_gint_part_d_[i].get_device_ptr(), + hr_gint_part_[i]->get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); + } +} + + +void Gint_vl_nspin4_gpu::cal_hr_gint_() +{ +#pragma omp parallel + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi(phi.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + phi_op.phi_mul_vldr3(vr_eff_d_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_phi_vldr3(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h new file mode 100644 index 0000000000..d7033df167 --- /dev/null +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_nspin4_gpu : public Gint +{ + public: + Gint_vl_nspin4_gpu( + std::vector vr_eff, + HContainer>* hR) + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + + void cal_gint() override; + + private: + + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + // note that only the upper triangle matrix of hR is calculated + // that's why we need compose_hr_gint() to fill the lower triangle matrix. + void cal_hr_gint_(); + + // input + std::vector vr_eff_; + + // output + HContainer>* hR_; + + //======================== + // Intermediate variables + //======================== + const double dr3_; + + const int nspin_ = 4; + + std::vector>> hr_gint_part_; + std::shared_ptr>> hr_gint_full_; + + std::vector> vr_eff_d_; + std::vector> hr_gint_part_d_; +}; + +} // namespace ModuleGint \ No newline at end of file From 2566e0d134aef05d64e837d70ade295ac4b9ec72 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 16 May 2025 15:07:30 +0800 Subject: [PATCH 11/63] add gint_vl_metagga_nspin4_gpu --- .../module_gint/CMakeLists.txt | 1 + .../module_gint/temp_gint/gint_interface.cpp | 18 ++- .../temp_gint/gint_vl_metagga_nspin4_gpu.cpp | 110 ++++++++++++++++++ .../temp_gint/gint_vl_metagga_nspin4_gpu.h | 54 +++++++++ 4 files changed, 180 insertions(+), 3 deletions(-) create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 
ab1371d4ca..bb4d5af6e8 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -64,6 +64,7 @@ if(NEW_GINT) temp_gint/gint_fvl_gpu.cpp temp_gint/gint_vl_metagga_gpu.cpp temp_gint/gint_vl_nspin4_gpu.cpp + temp_gint/gint_vl_metagga_nspin4_gpu.cpp temp_gint/gint_tau_gpu.cpp temp_gint/kernel/dgemm_vbatch.cu ) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index 421ffb7a81..c871fcd96d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -15,6 +15,7 @@ #include "gint_fvl_gpu.h" #include "gint_vl_nspin4_gpu.h" #include "gint_vl_metagga_gpu.h" +#include "gint_vl_metagga_nspin4_gpu.h" #include "gint_tau_gpu.h" #endif @@ -40,6 +41,7 @@ void cal_gint_vl( ModuleBase::timer::tick("Gint", "cal_gint_vl"); } +// nspin == 4 case void cal_gint_vl( std::vector vr_eff, HContainer>* hR) @@ -65,7 +67,7 @@ void cal_gint_vl_metagga( HContainer* hR) { ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); - #ifdef __CUDA +#ifdef __CUDA if(PARAM.inp.device == "gpu") { Gint_vl_metagga_gpu gint_vl_metagga(vr_eff, vfork, hR); @@ -79,14 +81,24 @@ void cal_gint_vl_metagga( ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); } +// nspin == 4 case void cal_gint_vl_metagga( std::vector vr_eff, std::vector vofk, HContainer>* hR) { ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); - Gint_vl_metagga_nspin4 gint_vl_metagga_nspin4(vr_eff, vofk, hR); - gint_vl_metagga_nspin4.cal_gint(); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_metagga_nspin4_gpu gint_vl_metagga_nspin4(vr_eff, vofk, hR); + gint_vl_metagga_nspin4.cal_gint(); + } else +#endif + { + Gint_vl_metagga_nspin4 gint_vl_metagga_nspin4(vr_eff, vofk, hR); + gint_vl_metagga_nspin4.cal_gint(); + } ModuleBase::timer::tick("Gint", 
"cal_gint_vl_metagga"); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp new file mode 100644 index 0000000000..f6c5f1a1c6 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp @@ -0,0 +1,110 @@ +#include "gint_vl_metagga_nspin4_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_vl_metagga_nspin4_gpu::cal_gint() +{ + init_hr_gint_(); + transfer_cpu_to_gpu_(); + cal_hr_gint_(); + transfer_gpu_to_cpu_(); + compose_hr_gint(hr_gint_part_, hr_gint_full_); + transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_full_), hR_); +} + +void Gint_vl_metagga_nspin4_gpu::init_hr_gint_() +{ + hr_gint_part_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + hr_gint_part_[i] = gint_info_->get_hr(); + } + const int npol = 2; + hr_gint_full_ = gint_info_->get_hr>(npol); +} + +void Gint_vl_metagga_nspin4_gpu::transfer_cpu_to_gpu_() +{ + vr_eff_d_.resize(nspin_); + vofk_d_.resize(nspin_); + hr_gint_part_d_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + hr_gint_part_d_[i] = CudaMemWrapper(hr_gint_part_[i]->get_nnr(), 0, false); + vr_eff_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + vofk_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_[i].get_device_ptr(), vr_eff_[i], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(vofk_d_[i].get_device_ptr(), vofk_[i], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_vl_metagga_nspin4_gpu::transfer_gpu_to_cpu_() +{ + for(int i = 0; i < nspin_; i++) + { + checkCuda(cudaMemcpy(hr_gint_part_[i]->get_wrapper(), hr_gint_part_d_[i].get_device_ptr(), + hr_gint_part_[i]->get_nnr() * 
sizeof(double), cudaMemcpyDeviceToHost)); + } +} + +void Gint_vl_metagga_nspin4_gpu::cal_hr_gint_() +{ +#pragma omp parallel + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), dphi_z.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + phi_op.phi_mul_vldr3(vr_eff_d_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_[is].get_device_ptr(), dr3_, + dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_[is].get_device_ptr(), dr3_, + dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_[is].get_device_ptr(), dr3_, + dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); + phi_op.phi_mul_phi_vldr3(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + 
phi_op.phi_mul_phi_vldr3(dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi_vldr3(dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi_vldr3(dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h new file mode 100644 index 0000000000..395ea1544f --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_metagga_nspin4_gpu : public Gint +{ + public: + Gint_vl_metagga_nspin4_gpu( + std::vector vr_eff, + std::vector vofk, + HContainer>* hR) + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + + void cal_gint() override; + + private: + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_hr_gint_(); + + // input + std::vector vr_eff_; + std::vector vofk_; + // output + HContainer>* hR_; + + //======================== + // Intermediate variables + //======================== + const double dr3_; + + const int nspin_ = 4; + + std::vector>> hr_gint_part_; + std::shared_ptr>> hr_gint_full_; + + std::vector> vr_eff_d_; + std::vector> vofk_d_; + std::vector> hr_gint_part_d_; +}; + +} \ No newline at end of file From 
f62adeaebb32c2b37e11322173dbd1da8a642705 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 16 May 2025 17:55:35 +0800 Subject: [PATCH 12/63] enable fvl_meta_gpu --- .../module_gint/CMakeLists.txt | 1 + .../temp_gint/gint_fvl_meta_gpu.cpp | 179 ++++++++++++++++++ .../module_gint/temp_gint/gint_fvl_meta_gpu.h | 64 +++++++ .../module_gint/temp_gint/gint_interface.cpp | 14 +- .../temp_gint/kernel/phi_operator_kernel.cu | 9 +- 5 files changed, 260 insertions(+), 7 deletions(-) create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index bb4d5af6e8..3062fc9060 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -66,6 +66,7 @@ if(NEW_GINT) temp_gint/gint_vl_nspin4_gpu.cpp temp_gint/gint_vl_metagga_nspin4_gpu.cpp temp_gint/gint_tau_gpu.cpp + temp_gint/gint_fvl_meta_gpu.cpp temp_gint/kernel/dgemm_vbatch.cu ) endif() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp new file mode 100644 index 0000000000..3d4ce6d4c5 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp @@ -0,0 +1,179 @@ +#include "gint_fvl_meta_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_fvl_meta_gpu::cal_gint() +{ + init_dm_gint_(); + transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_cpu_to_gpu_(); + cal_fvl_svl_(); + transfer_gpu_to_cpu_(); +} + +void Gint_fvl_meta_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void 
Gint_fvl_meta_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + vr_eff_d_vec_.resize(nspin_); + vofk_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is]->get_nnr(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is]->get_wrapper(), + dm_gint_vec_[is]->get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + vr_eff_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_vec_[is].get_device_ptr(), vr_eff_[is], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + vofk_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vofk_d_vec_[is].get_device_ptr(), vofk_[is], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } + if (isforce_) + { + fvl_d_ = CudaMemWrapper(gint_info_->get_nat() * 3, 0, true); + } + if (isstress_) + { + svl_d_ = CudaMemWrapper(6, 0, true); + } +} + +void Gint_fvl_meta_gpu::transfer_gpu_to_cpu_() +{ + if (isforce_) + { + fvl_d_.copy_device_to_host_sync(); + for (int iat = 0; iat < gint_info_->get_nat(); iat++) + { + for (int j = 0; j < 3; j++) + { + fvl_[0](iat, j) += fvl_d_.get_host_ptr()[iat * 3 + j]; + } + } + } + if (isstress_) + { + svl_d_.copy_device_to_host_sync(); + svl_[0](0, 0) += svl_d_.get_host_ptr()[0]; + svl_[0](0, 1) += svl_d_.get_host_ptr()[1]; + svl_[0](0, 2) += svl_d_.get_host_ptr()[2]; + svl_[0](1, 1) += svl_d_.get_host_ptr()[3]; + svl_[0](1, 2) += svl_d_.get_host_ptr()[4]; + svl_[0](2, 2) += svl_d_.get_host_ptr()[5]; + } +} + +void Gint_fvl_meta_gpu::cal_fvl_svl_() +{ +#pragma omp parallel + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. 
+ checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_xx(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_xy(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_xz(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_yy(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_yz(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_zz(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), + dphi_y.get_device_ptr(), + dphi_z.get_device_ptr()); + phi_op.set_ddphi(ddphi_xx.get_device_ptr(), ddphi_xy.get_device_ptr(), + ddphi_xz.get_device_ptr(), ddphi_yy.get_device_ptr(), 
+ ddphi_yz.get_device_ptr(), ddphi_zz.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + const bool is_symm = false; + phi_op.phi_mul_vldr3(vr_eff_d_vec_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_vec_[is].get_device_ptr(), dr3_, + dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_vec_[is].get_device_ptr(), dr3_, + dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_vec_[is].get_device_ptr(), dr3_, + dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); + phi_op.phi_mul_dm(phi_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + *dm_gint_vec_[is], is_symm, phi_vldr3_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_x_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + *dm_gint_vec_[is], is_symm, dphi_x_vldr3_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_y_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + *dm_gint_vec_[is], is_symm, dphi_y_vldr3_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_z_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + *dm_gint_vec_[is], is_symm, dphi_z_vldr3_dm.get_device_ptr()); + if (isforce_) + { + phi_op.phi_dot_dphi(phi_vldr3_dm.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + dphi_z.get_device_ptr(), fvl_d_.get_device_ptr()); + phi_op.phi_dot_dphi(dphi_x_vldr3_dm.get_device_ptr(), + ddphi_xx.get_device_ptr(), ddphi_xy.get_device_ptr(), + ddphi_xz.get_device_ptr(), fvl_d_.get_device_ptr()); + phi_op.phi_dot_dphi(dphi_y_vldr3_dm.get_device_ptr(), + ddphi_xy.get_device_ptr(), ddphi_yy.get_device_ptr(), + ddphi_yz.get_device_ptr(), fvl_d_.get_device_ptr()); + phi_op.phi_dot_dphi(dphi_z_vldr3_dm.get_device_ptr(), + ddphi_xz.get_device_ptr(), ddphi_yz.get_device_ptr(), + ddphi_zz.get_device_ptr(), fvl_d_.get_device_ptr()); + } + if (isstress_) + { + phi_op.phi_dot_dphi_r(phi_vldr3_dm.get_device_ptr(), + 
dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + dphi_z.get_device_ptr(), svl_d_.get_device_ptr()); + phi_op.phi_dot_dphi_r(dphi_x_vldr3_dm.get_device_ptr(), + ddphi_xx.get_device_ptr(), ddphi_xy.get_device_ptr(), + ddphi_xz.get_device_ptr(), svl_d_.get_device_ptr()); + phi_op.phi_dot_dphi_r(dphi_y_vldr3_dm.get_device_ptr(), + ddphi_xy.get_device_ptr(), ddphi_yy.get_device_ptr(), + ddphi_yz.get_device_ptr(), svl_d_.get_device_ptr()); + phi_op.phi_dot_dphi_r(dphi_z_vldr3_dm.get_device_ptr(), + ddphi_xz.get_device_ptr(), ddphi_yz.get_device_ptr(), + ddphi_zz.get_device_ptr(), svl_d_.get_device_ptr()); + } + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h new file mode 100644 index 0000000000..90a1e9871c --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h @@ -0,0 +1,64 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "module_base/matrix.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ +class Gint_fvl_meta_gpu : public Gint +{ + public: + Gint_fvl_meta_gpu( + const int nspin, + const std::vector& vr_eff, + const std::vector& vofk, + const std::vector*>& dm_vec, + const bool isforce, + const bool isstress, + ModuleBase::matrix* fvl, + ModuleBase::matrix* svl) + : nspin_(nspin), vr_eff_(vr_eff), vofk_(vofk), dm_vec_(dm_vec), + isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), + dr3_(gint_info_->get_mgrid_volume()) {}; + + void cal_gint() override; + + private: + void init_dm_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_fvl_svl_(); + + // input + const int nspin_; + 
std::vector vr_eff_; + std::vector vofk_; + std::vector*> dm_vec_; + const bool isforce_; + const bool isstress_; + + // output + ModuleBase::matrix* fvl_; + ModuleBase::matrix* svl_; + + // intermediate variables + std::vector>> dm_gint_vec_; + + double dr3_; + + std::vector> vr_eff_d_vec_; + std::vector> vofk_d_vec_; + std::vector> dm_gint_d_vec_; + CudaMemWrapper fvl_d_; + CudaMemWrapper svl_d_; +}; + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index c871fcd96d..dd8ba3c9e4 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -17,6 +17,7 @@ #include "gint_vl_metagga_gpu.h" #include "gint_vl_metagga_nspin4_gpu.h" #include "gint_tau_gpu.h" +#include "gint_fvl_meta_gpu.h" #endif namespace ModuleGint @@ -177,8 +178,17 @@ void cal_gint_fvl_meta( ModuleBase::matrix* svl) { ModuleBase::timer::tick("Gint", "cal_gint_fvl_meta"); - Gint_fvl_meta gint_fvl_meta(nspin, vr_eff, vofk, dm_vec, isforce, isstress, fvl, svl); - gint_fvl_meta.cal_gint(); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_fvl_meta_gpu gint_fvl_meta(nspin, vr_eff, vofk, dm_vec, isforce, isstress, fvl, svl); + gint_fvl_meta.cal_gint(); + } else +#endif + { + Gint_fvl_meta gint_fvl_meta(nspin, vr_eff, vofk, dm_vec, isforce, isstress, fvl, svl); + gint_fvl_meta.cal_gint(); + } ModuleBase::timer::tick("Gint", "cal_gint_fvl_meta"); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu index 2bfae822ab..5db767f501 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu @@ -67,7 +67,7 @@ __global__ void 
set_phi_kernel( int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + bgrids_phi_len[bgrid_id] * mgrid_id; - for (int iw = 0; iw < atom_nw[atom_type]; iw++) + for (int iw = 0; iw < atom_nw[atom_type]; iw++, iw_nr += nrmax) { if (atom_iw2_new[it_nw + iw]) { @@ -75,7 +75,6 @@ __global__ void set_phi_kernel( + c3 * psi_u[iw_nr + 1] + c4 * dpsi_u[iw_nr + 1]; } phi[phi_idx + iw] = psi * ylma[atom_iw2_ylm[it_nw + iw]]; - iw_nr += nrmax; } } else @@ -156,7 +155,7 @@ __global__ void set_phi_dphi_kernel( int iw_nr = it_nw * nrmax + ip; int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + bgrids_phi_len[bgrid_id] * mgrid_id; - for (int iw = 0; iw < atom_nw[atom_type]; iw++) + for (int iw = 0; iw < atom_nw[atom_type]; iw++, iw_nr += nrmax) { if (atom_iw2_new[it_nw + iw]) { @@ -181,7 +180,6 @@ __global__ void set_phi_dphi_kernel( dphi_x[phi_idx + iw] = tmpdphi_rly * coord.x + tmprl * grly[idx_ylm * 3 + 0]; dphi_y[phi_idx + iw] = tmpdphi_rly * coord.y + tmprl * grly[idx_ylm * 3 + 1]; dphi_z[phi_idx + iw] = tmpdphi_rly * coord.z + tmprl * grly[idx_ylm * 3 + 2]; - iw_nr += nrmax; } } else @@ -276,7 +274,7 @@ __global__ void set_ddphi_kernel( double dtmp = 0; const int it_nw = atom_type * nwmax; int iw_nr = it_nw * nrmax + ip; - for (int iw = 0; iw < atom_nw[atom_type]; iw++) + for (int iw = 0; iw < atom_nw[atom_type]; iw++, iw_nr += nrmax) { if (atom_iw2_new[it_nw + iw]) { @@ -330,6 +328,7 @@ __global__ void set_ddphi_kernel( } coord[i/2] -= std::pow(-1, i%2) * 0.0001; // recover coord } + for (int iw = 0; iw < atom_nw[atom_type]; iw++) { ddphi_xx[phi_idx + iw] /= 0.0002; From 0bd2d9cf08dc39fafeb25535e624dbfe71ba5494 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 16 May 2025 17:56:02 +0800 Subject: [PATCH 13/63] optimize dgemm_vbatch --- .../temp_gint/kernel/dgemm_vbatch.cu | 85 ++----------------- 1 file changed, 9 insertions(+), 76 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu 
b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu index b09abd3fd5..e7e212cc2c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu @@ -13,57 +13,14 @@ void dgemm_nn_vbatch( int batchCount, cudaStream_t stream, const double* alpha) { - if (max_k < 32) - { - if(max_k == 8 && max_m ==24) - { - vbatched_gemm_nn_impl - (max_m, max_n, m_d, n_d, k_d, - A_array_d, lda_d, - B_array_d, ldb_d, - C_array_d, ldc_d, - batchCount, stream, alpha); - } - else if (max_m < 32) - { - vbatched_gemm_nn_impl - (max_m, max_n, m_d, n_d, k_d, - A_array_d, lda_d, - B_array_d, ldb_d, - C_array_d, ldc_d, - batchCount, stream, alpha); - } - else - { - vbatched_gemm_nn_impl - (max_m, max_n, m_d, n_d, k_d, - A_array_d, lda_d, - B_array_d, ldb_d, - C_array_d, ldc_d, - batchCount, stream, alpha); - } - } - else - { - if (max_n < 80) - { - vbatched_gemm_nn_impl - (max_m, max_n, m_d, n_d, k_d, - A_array_d, lda_d, - B_array_d, ldb_d, - C_array_d, ldc_d, - batchCount, stream, alpha); - } - else - { - vbatched_gemm_nn_impl - (max_m, max_n, m_d, n_d, k_d, - A_array_d, lda_d, - B_array_d, ldb_d, - C_array_d, ldc_d, - batchCount, stream, alpha); - } - } + + vbatched_gemm_nn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); + } // the template parameters refer to the settings for the "nt" shape in dgemm_vbatched_core. 
@@ -76,34 +33,10 @@ void dgemm_tn_vbatch( int batchCount, cudaStream_t stream, const double* alpha) { - if (max_k < 128) - { - vbatched_gemm_tn_impl + vbatched_gemm_tn_impl (max_m, max_n, m_d, n_d, k_d, A_array_d, lda_d, B_array_d, ldb_d, C_array_d, ldc_d, batchCount, stream, alpha); - } - else - { - if (max_n < 256) - { - vbatched_gemm_tn_impl - (max_m, max_n, m_d, n_d, k_d, - A_array_d, lda_d, - B_array_d, ldb_d, - C_array_d, ldc_d, - batchCount, stream, alpha); - } - else - { - vbatched_gemm_tn_impl - (max_m, max_n, m_d, n_d, k_d, - A_array_d, lda_d, - B_array_d, ldb_d, - C_array_d, ldc_d, - batchCount, stream, alpha); - } - } } From caf04153b388a260e1a6bb731af99c2442be647f Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 19 May 2025 11:18:11 +0800 Subject: [PATCH 14/63] set streams num --- .../module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp | 2 +- .../module_gint/temp_gint/gint_fvl_meta_gpu.cpp | 2 +- source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp | 1 + source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h | 3 +++ .../module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp | 2 +- .../module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp | 2 +- .../module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp | 2 +- .../module_gint/temp_gint/gint_vl_metagga_gpu.cpp | 2 +- .../module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp | 2 +- .../module_gint/temp_gint/gint_vl_nspin4_gpu.cpp | 2 +- 10 files changed, 12 insertions(+), 8 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp index e22cf00d69..32489424a7 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp @@ -75,7 +75,7 @@ void Gint_fvl_gpu::transfer_gpu_to_cpu_() void Gint_fvl_gpu::cal_fvl_svl_() { -#pragma omp parallel +#pragma omp parallel 
num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because // cuda's device is not safe in a multi-threaded environment. diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp index 3d4ce6d4c5..293390fbbc 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp @@ -79,7 +79,7 @@ void Gint_fvl_meta_gpu::transfer_gpu_to_cpu_() void Gint_fvl_meta_gpu::cal_fvl_svl_() { -#pragma omp parallel +#pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because // cuda's device is not safe in a multi-threaded environment. diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp index c2c27f1cf3..806de9bf94 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp @@ -51,6 +51,7 @@ GintInfo::GintInfo( #ifdef __CUDA if(PARAM.inp.device == "gpu") { + streams_num_ = PARAM.inp.nstream; // the default value of num_stream is 4 const int batch_size = nbz_local; init_bgrid_batches_(batch_size); gpu_vars_ = std::make_shared(biggrid_info_, ucell, Phi); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h index 1356af7929..ceb669b635 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h @@ -90,11 +90,14 @@ class GintInfo std::vector>& get_bgrid_batches() { return bgrid_batches_; }; std::shared_ptr get_gpu_vars() const { return gpu_vars_; }; int get_dev_id() const { return gpu_vars_->dev_id_; }; + int get_streams_num() const { return streams_num_; }; private: void 
init_bgrid_batches_(int batch_size); std::vector> bgrid_batches_; std::shared_ptr gpu_vars_; + // More streams can improve parallelism and may speed up grid integration, at the cost of higher GPU memory usage. + int streams_num_; #endif }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp index a4d80d3abe..a5ef6a335a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp @@ -49,7 +49,7 @@ void Gint_rho_gpu::transfer_gpu_to_cpu_() void Gint_rho_gpu::cal_rho_() { -#pragma omp parallel +#pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because // cuda's device is not safe in a multi-threaded environment. diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp index 1edbfc8147..6ed4b19c05 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp @@ -49,7 +49,7 @@ void Gint_tau_gpu::transfer_gpu_to_cpu_() void Gint_tau_gpu::cal_tau_() { -#pragma omp parallel +#pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because // cuda's device is not safe in a multi-threaded environment. 
diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp index 8d0120cbc6..8d953a50c0 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp @@ -38,7 +38,7 @@ void Gint_vl_gpu::transfer_gpu_to_cpu_() void Gint_vl_gpu::cal_hr_gint_() { -#pragma omp parallel +#pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because // cuda's device is not safe in a multi-threaded environment. diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp index e044cd0691..850fd20770 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp @@ -45,7 +45,7 @@ void Gint_vl_metagga_gpu::transfer_gpu_to_cpu_() void Gint_vl_metagga_gpu::cal_hr_gint_() { -#pragma omp parallel +#pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because // cuda's device is not safe in a multi-threaded environment. diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp index f6c5f1a1c6..1a43981e1e 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp @@ -56,7 +56,7 @@ void Gint_vl_metagga_nspin4_gpu::transfer_gpu_to_cpu_() void Gint_vl_metagga_nspin4_gpu::cal_hr_gint_() { -#pragma omp parallel +#pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because // cuda's device is not safe in a multi-threaded environment. 
diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp index e08cbfa875..e6ae47dfcf 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp @@ -53,7 +53,7 @@ void Gint_vl_nspin4_gpu::transfer_gpu_to_cpu_() void Gint_vl_nspin4_gpu::cal_hr_gint_() { -#pragma omp parallel +#pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because // cuda's device is not safe in a multi-threaded environment. From 531122461568d74a1ee7d20b09827ca0d63b3a86 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 22 May 2025 18:47:21 +0800 Subject: [PATCH 15/63] small fix --- source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h index f8bcb79665..9c6c96c243 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h @@ -24,7 +24,7 @@ class BigGridInfo Vec3d get_cartesian_coord(const Vec3d& index_3d) const { return index_3d * biggrid_latvec0_; }; Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * biggrid_latvec0_; }; - const Vec3d get_direct_coord(const Vec3d& cart_coord) const { return cart_coord * biggrid_GT_; }; + Vec3d get_direct_coord(const Vec3d& cart_coord) const { return cart_coord * biggrid_GT_; }; // Return the maximum number of big grids that can fit inside a sphere of radius r, // along the three lattice vector directions. 
From fbbbedc8949e26362de831b96094d09bb2ca3392 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 23 May 2025 10:23:32 +0800 Subject: [PATCH 16/63] optimize gint_atom --- .../module_gint/temp_gint/gint_atom.cpp | 90 ++++++++----------- .../module_gint/temp_gint/gint_atom.h | 10 +-- .../module_gint/temp_gint/set_ddphi.cpp | 45 ++-------- 3 files changed, 53 insertions(+), 92 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp index 6ae3735ec6..f6a15b86fc 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp @@ -5,6 +5,33 @@ namespace ModuleGint { +GintAtom::GintAtom( + const Atom* atom, + int ia, + int iat, + Vec3i biggrid_idx, + Vec3i unitcell_idx, + Vec3d tau_in_biggrid, + const Numerical_Orbital* orb) +: atom_(atom), ia_(ia), iat_(iat), biggrid_idx_(biggrid_idx), + unitcell_idx_(unitcell_idx), tau_in_biggrid_(tau_in_biggrid), + orb_(orb) +{ + p_psi_uniform_.resize(atom_->nw); + p_dpsi_uniform_.resize(atom_->nw); + p_ddpsi_uniform_.resize(atom_->nw); + for (int iw=0; iw < atom_->nw; ++iw) + { + if ( atom_->iw2_new[iw] ) + { + int l = atom_->iw2l[iw]; + int n = atom_->iw2n[iw]; + p_psi_uniform_[iw] = orb_->PhiLN(l, n).psi_uniform.data(); + p_dpsi_uniform_[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); + p_ddpsi_uniform_[iw] = orb_->PhiLN(l, n).ddpsi_uniform.data(); + } + } +} template void GintAtom::set_phi(const std::vector& coords, const int stride, T* phi) const @@ -14,20 +41,6 @@ void GintAtom::set_phi(const std::vector& coords, const int stride, T* ph // orb_ does not have the member variable dr_uniform const double dr_uniform = orb_->PhiLN(0, 0).dr_uniform; - // store the pointer to reduce repeated address fetching - std::vector p_psi_uniform(atom_->nw); - std::vector p_dpsi_uniform(atom_->nw); - for(int iw = 0; iw < atom_->nw; iw++) - { - if(atom_->iw2_new[iw]) - { - int l = 
atom_->iw2l[iw]; - int n = atom_->iw2n[iw]; - p_psi_uniform[iw] = orb_->PhiLN(l, n).psi_uniform.data(); - p_dpsi_uniform[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); - } - } - // store the spherical harmonics // it's outside the loop to reduce the vector allocation overhead std::vector ylma; @@ -35,7 +48,6 @@ void GintAtom::set_phi(const std::vector& coords, const int stride, T* ph for(int im = 0; im < num_mgrids; im++) { const Vec3d& coord = coords[im]; - // 1e-9 is to avoid division by zero const double dist = coord.norm() < 1e-9 ? 1e-9 : coord.norm(); if(dist > orb_->getRcut()) @@ -74,8 +86,8 @@ void GintAtom::set_phi(const std::vector& coords, const int stride, T* ph { if(atom_->iw2_new[iw]) { - auto psi_uniform = p_psi_uniform[iw]; - auto dpsi_uniform = p_dpsi_uniform[iw]; + auto psi_uniform = p_psi_uniform_[iw]; + auto dpsi_uniform = p_dpsi_uniform_[iw]; psi = c1 * psi_uniform[ip] + c2 * dpsi_uniform[ip] + c3 * psi_uniform[ip + 1] + c4 * dpsi_uniform[ip + 1]; } @@ -94,22 +106,6 @@ void GintAtom::set_phi_dphi( // orb_ does not have the member variable dr_uniform const double dr_uniform = orb_->PhiLN(0, 0).dr_uniform; - - // store the pointer to reduce repeated address fetching - std::vector p_psi_uniform(atom_->nw); - std::vector p_dpsi_uniform(atom_->nw); - std::vector phi_nr_uniform(atom_->nw); - for (int iw=0; iw< atom_->nw; ++iw) - { - if ( atom_->iw2_new[iw] ) - { - int l = atom_->iw2l[iw]; - int n = atom_->iw2n[iw]; - p_psi_uniform[iw] = orb_->PhiLN(l, n).psi_uniform.data(); - p_dpsi_uniform[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); - phi_nr_uniform[iw] = orb_->PhiLN(l, n).nr_uniform; - } - } std::vector rly(std::pow(atom_->nwl + 1, 2)); // TODO: replace array_pool with std::vector @@ -157,24 +153,16 @@ void GintAtom::set_phi_dphi( // function from interpolation method. 
if(atom_->iw2_new[iw]) { - auto psi_uniform = p_psi_uniform[iw]; - auto dpsi_uniform = p_dpsi_uniform[iw]; - - if(ip >= phi_nr_uniform[iw] - 4) - { - tmp = dtmp = 0.0; - } - else - { - // use Polynomia Interpolation method to get the - // wave functions - - tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) - + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); - - dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) - + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); - } + auto psi_uniform = p_psi_uniform_[iw]; + auto dpsi_uniform = p_dpsi_uniform_[iw]; + // use Polynomia Interpolation method to get the + // wave functions + + tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) + + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); + + dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) + + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); } // new l is used. // get the 'l' of this localized wave function diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h index b54f1feedd..e98cc0e3bc 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h @@ -18,10 +18,7 @@ class GintAtom Vec3i biggrid_idx, Vec3i unitcell_idx, Vec3d tau_in_biggrid, - const Numerical_Orbital* orb) - : atom_(atom), ia_(ia), iat_(iat), biggrid_idx_(biggrid_idx), - unitcell_idx_(unitcell_idx), tau_in_biggrid_(tau_in_biggrid), - orb_(orb) {}; + const Numerical_Orbital* orb); // getter functions const Atom* get_atom() const { return atom_; }; @@ -110,7 +107,10 @@ class GintAtom // the numerical orbitals of this atom const Numerical_Orbital* orb_; - + + std::vector p_psi_uniform_; + std::vector p_dpsi_uniform_; + std::vector p_ddpsi_uniform_; }; } // namespace ModuleGint diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp 
b/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp index 4d01acc262..c84f087487 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp @@ -20,24 +20,6 @@ void GintAtom::set_ddphi( // orb_ does not have the member variable dr_uniform const double dr_uniform = orb_->PhiLN(0, 0).dr_uniform; - // store the pointer to reduce repeated address fetching - std::vector p_psi_uniform(atom_->nw); - std::vector p_dpsi_uniform(atom_->nw); - std::vector p_ddpsi_uniform(atom_->nw); - std::vector phi_nr_uniform(atom_->nw); - for (int iw=0; iw< atom_->nw; ++iw) - { - if ( atom_->iw2_new[iw] ) - { - int l = atom_->iw2l[iw]; - int n = atom_->iw2n[iw]; - p_psi_uniform[iw] = orb_->PhiLN(l, n).psi_uniform.data(); - p_dpsi_uniform[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); - p_ddpsi_uniform[iw] = orb_->PhiLN(l, n).ddpsi_uniform.data(); - phi_nr_uniform[iw] = orb_->PhiLN(l, n).nr_uniform; - } - } - std::vector rly(std::pow(atom_->nwl + 1, 2)); ModuleBase::Array_Pool grly(std::pow(atom_->nwl + 1, 2), 3); // TODO: A better data structure such as a 3D tensor can be used to store dphi @@ -96,24 +78,15 @@ void GintAtom::set_ddphi( { if(atom_->iw2_new[iw]) { - auto psi_uniform = p_psi_uniform[iw]; - auto dpsi_uniform = p_dpsi_uniform[iw]; - - if(ip >= phi_nr_uniform[iw] - 4) - { - tmp = dtmp = 0.0; - } - else - { - // use Polynomia Interpolation method to get the - // wave functions - - tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) - + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); - - dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) - + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); - } + auto psi_uniform = p_psi_uniform_[iw]; + auto dpsi_uniform = p_dpsi_uniform_[iw]; + // use Polynomia Interpolation method to get the + // wave functions + tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) + + x03 * (psi_uniform[ip + 1] 
* x2 - psi_uniform[ip + 2] * x1); + + dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) + + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); } // get the 'l' of this localized wave function From ed8182bb7eea4f633c118e2a1a3bfa6386b14787 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 30 May 2025 00:19:30 +0800 Subject: [PATCH 17/63] delete virtual function --- source/module_hamilt_lcao/module_gint/temp_gint/gint.h | 2 -- source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h | 2 +- source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h | 2 +- source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h | 2 +- .../module_gint/temp_gint/gint_fvl_meta_gpu.h | 2 +- source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h | 2 +- source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h | 2 +- source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h | 2 +- source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h | 2 +- source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h | 2 +- source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h | 2 +- .../module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h | 2 +- .../module_gint/temp_gint/gint_vl_metagga_gpu.h | 2 +- .../module_gint/temp_gint/gint_vl_metagga_nspin4.h | 2 +- .../module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h | 2 +- .../module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h | 2 +- .../module_gint/temp_gint/gint_vl_nspin4_gpu.h | 2 +- 17 files changed, 16 insertions(+), 18 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint.h index 2d8a1c1cba..64e941c380 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint.h @@ -12,8 +12,6 @@ class Gint Gint() = default; virtual ~Gint() = default; - virtual void cal_gint() = 0; - // note that gint_info_ is a static member variable // it is shared 
by all instances of Gint static void set_gint_info(std::shared_ptr gint_info) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h index 013c7b2e0a..d1e224d4d5 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h @@ -25,7 +25,7 @@ class Gint_fvl : public Gint isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), dr3_(gint_info_->get_mgrid_volume()) {}; - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h index 1ae52a0755..92b2c445d8 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h @@ -26,7 +26,7 @@ class Gint_fvl_gpu : public Gint isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), dr3_(gint_info_->get_mgrid_volume()) {}; - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h index 636bbc47b5..0062f3d923 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h @@ -25,7 +25,7 @@ class Gint_fvl_meta : public Gint isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), dr3_(gint_info_->get_mgrid_volume()) {}; - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h index 90a1e9871c..6acf87a20a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h @@ -26,7 +26,7 @@ class Gint_fvl_meta_gpu : public Gint isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), dr3_(gint_info_->get_mgrid_volume()) {}; - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h index 6bd2b51030..1af05e72cd 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h @@ -18,7 +18,7 @@ class Gint_rho : public Gint double **rho) : dm_vec_(dm_vec), nspin_(nspin), rho_(rho) {}; - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h index 9026bfb05e..d411e46518 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h @@ -19,7 +19,7 @@ class Gint_rho_gpu: public Gint double **rho) : dm_vec_(dm_vec), nspin_(nspin), rho_(rho) {}; - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h index d36552a79e..641cdb1bec 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h @@ -17,7 +17,7 @@ class Gint_tau : public Gint double** tau) : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {}; - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h index 376fb919c3..4b1245ebbc 100644 --- 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h @@ -19,7 +19,7 @@ class Gint_tau_gpu : public Gint double** tau) : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {}; - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h index fa3f4b9888..d18532add6 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h @@ -17,7 +17,7 @@ class Gint_vl : public Gint HContainer* hR) : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; - void cal_gint() override; + void cal_gint(); private: diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h index f609a3f053..3b73f740b1 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h @@ -18,7 +18,7 @@ class Gint_vl_gpu : public Gint HContainer* hR) : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; - void cal_gint() override; + void cal_gint(); private: diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h index 6ae267f7d4..5aa8fd3441 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h @@ -18,7 +18,7 @@ class Gint_vl_metagga : public Gint HContainer* hR) : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; - void cal_gint() override; + void cal_gint(); private: diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h 
index 5669c24384..e6406f4141 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h @@ -19,7 +19,7 @@ class Gint_vl_metagga_gpu : public Gint HContainer* hR) : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; - void cal_gint() override; + void cal_gint(); private: diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h index 40bf386fa3..7b40546854 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h @@ -18,7 +18,7 @@ class Gint_vl_metagga_nspin4 : public Gint HContainer>* hR) : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; - void cal_gint() override; + void cal_gint(); private: void init_hr_gint_(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h index 395ea1544f..47d1e34425 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h @@ -19,7 +19,7 @@ class Gint_vl_metagga_nspin4_gpu : public Gint HContainer>* hR) : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; - void cal_gint() override; + void cal_gint(); private: void init_hr_gint_(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h index 6338055db6..f5e23532bc 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h @@ -17,7 +17,7 @@ class Gint_vl_nspin4 : public Gint HContainer>* hR) : 
vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; - void cal_gint() override; + void cal_gint(); private: diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h index d7033df167..bd9f059a1a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h @@ -18,7 +18,7 @@ class Gint_vl_nspin4_gpu : public Gint HContainer>* hR) : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; - void cal_gint() override; + void cal_gint(); private: From 226094f7f1f56fb4c76d90f304286bf2dc5ef0ee Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sun, 1 Jun 2025 17:11:52 +0800 Subject: [PATCH 18/63] add cal_env to new gint module --- .../module_gint/CMakeLists.txt | 2 + .../module_gint/temp_gint/gint_atom.cpp | 11 +- .../module_gint/temp_gint/gint_atom.h | 18 +- .../module_gint/temp_gint/gint_common.cpp | 119 +++++++++++ .../module_gint/temp_gint/gint_common.h | 2 + .../module_gint/temp_gint/gint_env_gamma.cpp | 44 +++++ .../module_gint/temp_gint/gint_env_gamma.h | 32 +++ .../module_gint/temp_gint/gint_env_k.cpp | 50 +++++ .../module_gint/temp_gint/gint_env_k.h | 43 ++++ .../module_gint/temp_gint/gint_info.cpp | 46 ++++- .../module_gint/temp_gint/gint_info.h | 16 +- .../module_gint/temp_gint/phi_operator.cpp | 89 ++++++++- .../module_gint/temp_gint/phi_operator.h | 28 ++- source/module_io/get_wf_lcao.cpp | 185 ++++++++++-------- 14 files changed, 585 insertions(+), 100 deletions(-) create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h diff --git 
a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 3062fc9060..1f0001f405 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -45,6 +45,8 @@ if(NEW_GINT) temp_gint/gint_tau.cpp temp_gint/gint_fvl.cpp temp_gint/gint_fvl_meta.cpp + temp_gint/gint_env_gamma.cpp + temp_gint/gint_env_k.cpp temp_gint/localcell_info.cpp temp_gint/phi_operator.cpp temp_gint/set_ddphi.cpp diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp index f6a15b86fc..4d0227bf71 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp @@ -1,21 +1,22 @@ #include "source_base/ylm.h" #include "source_base/array_pool.h" #include "gint_atom.h" +#include "module_cell/unitcell.h" #include "gint_helper.h" namespace ModuleGint { GintAtom::GintAtom( const Atom* atom, - int ia, - int iat, + int it, int ia, int iat, Vec3i biggrid_idx, Vec3i unitcell_idx, Vec3d tau_in_biggrid, - const Numerical_Orbital* orb) -: atom_(atom), ia_(ia), iat_(iat), biggrid_idx_(biggrid_idx), + const Numerical_Orbital* orb, + const UnitCell* ucell) +: atom_(atom), it_(it), ia_(ia), iat_(iat), biggrid_idx_(biggrid_idx), unitcell_idx_(unitcell_idx), tau_in_biggrid_(tau_in_biggrid), - orb_(orb) + orb_(orb), ucell_(ucell) { p_psi_uniform_.resize(atom_->nw); p_dpsi_uniform_.resize(atom_->nw); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h index e98cc0e3bc..3a0e874e4e 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h @@ -13,17 +13,18 @@ class GintAtom // constructor GintAtom( const Atom* atom, - int ia, - int iat, + int it, int ia, int iat, Vec3i biggrid_idx, 
Vec3i unitcell_idx, Vec3d tau_in_biggrid, - const Numerical_Orbital* orb); + const Numerical_Orbital* orb, + const UnitCell* ucell); // getter functions const Atom* get_atom() const { return atom_; }; int get_ia() const { return ia_; }; int get_iat() const { return iat_; }; + int get_start_iw() const { return ucell_->itiaiw2iwt(it_, ia_, 0); }; // get the start index of global atomic orbitals const Vec3i& get_bgrid_idx() const { return biggrid_idx_; }; const Vec3i& get_unitcell_idx() const { return unitcell_idx_; }; const Vec3i& get_R() const { return unitcell_idx_; }; @@ -88,13 +89,16 @@ class GintAtom private: // the atom object const Atom* atom_; - - // the global index of the atom - int iat_; + + // the global index of the atom type + int it_; // the global index of the atom among the same type of atoms int ia_; + // the global index of the atom + int iat_; + // the index of big grid which contains this atom Vec3i biggrid_idx_; @@ -107,6 +111,8 @@ class GintAtom // the numerical orbitals of this atom const Numerical_Orbital* orb_; + + const UnitCell* ucell_; std::vector p_psi_uniform_; std::vector p_dpsi_uniform_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp index 52edcf8e18..7a8d54551d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp @@ -3,6 +3,11 @@ #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h" #include "module_parameter/parameter.h" +#ifdef __MPI +#include "module_base/blacs_connector.h" +#include +#endif + namespace ModuleGint { @@ -191,6 +196,110 @@ void transfer_dm_2d_to_gint( } } +int globalIndex(int localindex, int nblk, int nprocs, int myproc) +{ + const int iblock = localindex / nblk; + const int gIndex = (iblock * nprocs + myproc) * nblk + localindex % nblk; + return gIndex; +} + +int localIndex(int globalindex, int nblk, int nprocs, 
int& myproc) +{ + myproc = int((globalindex % (nblk * nprocs)) / nblk); + return int(globalindex / (nblk * nprocs)) * nblk + globalindex % nblk; +} + +template +void wfc_2d_to_gint(const T* wfc_2d, + const Parallel_Orbitals& pv, + T* wfc_gint, + std::shared_ptr gint_info) +{ + ModuleBase::TITLE("Module_gint", "wfc_2d_to_gint"); + ModuleBase::timer::tick("Module_gint", "wfc_2d_to_gint"); + + // dimension related + const int nlocal = pv.desc_wfc[2]; + const int nbands = pv.desc_wfc[3]; + +#ifdef __MPI + const std::vector& trace_lo = gint_info->get_trace_lo(); + + // MPI and memory related + const int mem_stride = 1; + int mpi_info = 0; + + // get the rank of the current process + int rank = 0; + MPI_Comm_rank(pv.comm(), &rank); + + // calculate the maximum number of nlocal over all processes in pv.comm() range + long buf_size; + mpi_info = MPI_Reduce(&pv.nloc_wfc, &buf_size, 1, MPI_LONG, MPI_MAX, 0, pv.comm()); + mpi_info = MPI_Bcast(&buf_size, 1, MPI_LONG, 0, pv.comm()); // get and then broadcast + std::vector wfc_block(buf_size); + + // this quantity seems to have the value returned by function numroc_ in ScaLAPACK? + int naroc[2]; + + // for BLACS broadcast + char scope = 'A'; + char top = ' '; + + // loop over all processors + for (int iprow = 0; iprow < pv.dim0; ++iprow) + { + for (int ipcol = 0; ipcol < pv.dim1; ++ipcol) + { + if (iprow == pv.coord[0] && ipcol == pv.coord[1]) + { + BlasConnector::copy(pv.nloc_wfc, wfc_2d, mem_stride, wfc_block.data(), mem_stride); + naroc[0] = pv.nrow; + naroc[1] = pv.ncol_bands; + Cxgebs2d(pv.blacs_ctxt, &scope, &top, 2, 1, naroc, 2); + Cxgebs2d(pv.blacs_ctxt, &scope, &top, buf_size, 1, wfc_block.data(), buf_size); + } + else + { + Cxgebr2d(pv.blacs_ctxt, &scope, &top, 2, 1, naroc, 2, iprow, ipcol); + Cxgebr2d(pv.blacs_ctxt, &scope, &top, buf_size, 1, wfc_block.data(), buf_size, iprow, ipcol); + } + + // then use it to set the wfc_grid. 
+ const int nb = pv.nb; + const int dim0 = pv.dim0; + const int dim1 = pv.dim1; + for (int j = 0; j < naroc[1]; ++j) + { + int igcol = globalIndex(j, nb, dim1, ipcol); + if (igcol >= PARAM.inp.nbands) + { + continue; + } + for (int i = 0; i < naroc[0]; ++i) + { + int igrow = globalIndex(i, nb, dim0, iprow); + int mu_local = trace_lo[igrow]; + if (wfc_gint && mu_local >= 0) + { + wfc_gint[igcol * nlocal + mu_local] = wfc_block[j * naroc[0] + i]; + } + } + } + // this operation will let all processors have the same wfc_grid + } + } +#else + for (int i = 0; i < nbands; ++i) + { + for (int j = 0; j < nlocal; ++j) + { + wfc_k_grid[i * nlocal + j] = psi[0](i, j); + } + } +#endif + ModuleBase::timer::tick("Module_gint", "wfc_2d_to_gint"); +} template void transfer_hr_gint_to_hR( std::shared_ptr> hr_gint, @@ -206,4 +315,14 @@ template void transfer_dm_2d_to_gint( std::shared_ptr gint_info, std::vector>*> dm, std::vector>>> dm_gint); +template void wfc_2d_to_gint( + const double* wfc_2d, + const Parallel_Orbitals& pv, + double* wfc_grid, + std::shared_ptr gint_info); +template void wfc_2d_to_gint( + const std::complex* wfc_2d, + const Parallel_Orbitals& pv, + std::complex* wfc_grid, + std::shared_ptr gint_info); } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h index 47f0eda35b..66b940a610 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h @@ -19,4 +19,6 @@ namespace ModuleGint std::vector*> dm, std::vector>> dm_gint); + template + void wfc_2d_to_gint(const T* wfc_2d, const Parallel_Orbitals& pv, T* wfc_grid, std::shared_ptr gint_info); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp new file mode 100644 index 0000000000..29e97b64bb --- /dev/null +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp @@ -0,0 +1,44 @@ +#include "gint_env_gamma.h" +#include "gint_common.h" +#include "phi_operator.h" + +namespace ModuleGint +{ + +Gint_env_gamma::Gint_env_gamma( + const double* psid, + const Parallel_Orbitals* pv, + const int nbands, + double* rho) + :rho_(rho) +{ + wfc_gint_.resize(nbands * gint_info_->get_lgd()); + wfc_2d_to_gint(psid, *pv, wfc_gint_.data(), gint_info_); +} + +void Gint_env_gamma::cal_env_band(const int iband) +{ + ModuleBase::GlobalFunc::ZEROS(rho_, gint_info_->get_local_mgrid_num()); + const double* wfc_gint_band = &wfc_gint_[iband * gint_info_->get_lgd()]; +#pragma omp parallel + { + PhiOperator phi_op; + std::vector phi; +#pragma omp for schedule(dynamic) + for(const auto& biggrid: gint_info_->get_biggrids()) + { + if(biggrid->get_atoms().empty()) + { + continue; + } + phi_op.set_bgrid(biggrid); + const int phi_len = phi_op.get_rows() * phi_op.get_cols(); + phi.resize(phi_len); + phi_op.set_phi(phi.data()); + phi_op.cal_env_gamma(phi.data(), wfc_gint_band, gint_info_->get_trace_lo(), rho_); + } + } +} + + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h new file mode 100644 index 0000000000..fbd7c85754 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" + +namespace ModuleGint +{ + +class Gint_env_gamma : public Gint +{ + public: + Gint_env_gamma( + const double* psid, + const Parallel_Orbitals* pv, + const int nbands, + double* rho); + + void cal_env_band(const int iband); + + private: + + // output + double* rho_; + + // intermediate variable + std::vector wfc_gint_; +}; + +} \ No newline at end of file diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp new file mode 100644 index 0000000000..9590089e53 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp @@ -0,0 +1,50 @@ +#include "gint_env_k.h" +#include "gint_common.h" +#include "phi_operator.h" + +namespace ModuleGint +{ + +Gint_env_k::Gint_env_k( + const std::complex* psid, + const Parallel_Orbitals* pv, + const std::vector& kvec_c, + const std::vector& kvec_d, + const int nbands, + const int ik, + const int nspin, + const int npol, + double* rho) + :kvec_c_(kvec_c), kvec_d_(kvec_d), ik_(ik), nspin_(nspin), npol_(npol), rho_(rho) +{ + wfc_gint_.resize(nbands * gint_info_->get_lgd()); + wfc_2d_to_gint(psid, *pv, wfc_gint_.data(), gint_info_); +} + +void Gint_env_k::cal_env_band(const int iband) +{ + ModuleBase::GlobalFunc::ZEROS(rho_, gint_info_->get_local_mgrid_num()); + const std::complex* wfc_gint_band = &wfc_gint_[iband * gint_info_->get_lgd()]; +#pragma omp parallel + { + PhiOperator phi_op; + std::vector phi; +#pragma omp for schedule(dynamic) + for(const auto& biggrid: gint_info_->get_biggrids()) + { + if(biggrid->get_atoms().empty()) + { + continue; + } + phi_op.set_bgrid(biggrid); + const int phi_len = phi_op.get_rows() * phi_op.get_cols(); + phi.resize(phi_len); + phi_op.set_phi(phi.data()); + phi_op.cal_env_k(phi.data(), wfc_gint_band, gint_info_->get_trace_lo(), ik_, nspin_, + npol_, gint_info_->get_lgd(), kvec_c_, kvec_d_, rho_); + } + } +} + + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h new file mode 100644 index 0000000000..31938bc73e --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h @@ -0,0 +1,43 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include 
"gint_info.h" + +namespace ModuleGint +{ + +class Gint_env_k : public Gint +{ + public: + Gint_env_k( + const std::complex* psid, + const Parallel_Orbitals* pv, + const std::vector& kvec_c, + const std::vector& kvec_d, + const int nbands, + const int ik, + const int nspin, + const int npol, + double* rho); + + void cal_env_band(const int iband); + + private: + // input + const std::vector& kvec_c_; + const std::vector& kvec_d_; + int ik_; + int nspin_; + int npol_; + + // output + double* rho_; + + // intermediate variable + std::vector> wfc_gint_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp index 806de9bf94..8511de750d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp @@ -44,6 +44,9 @@ GintInfo::GintInfo( // initialize the atoms init_atoms_(ucell_->ntype, ucell_->atoms, Phi); + // initialize trace_lo_ and lgd_ + init_trace_lo_(ucell, PARAM.inp.nspin); + // initialize the ijr_info // this step needs to be done after init_atoms_, because it requires the information of is_atom_on_bgrid init_ijr_info_(ucell, gd); @@ -134,7 +137,7 @@ void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital atom_bgrid_idx.y - ucell_idx_bgrid.y * unitcell_info_->get_nby(), atom_bgrid_idx.z - ucell_idx_bgrid.z * unitcell_info_->get_nbz()); r_to_atom.insert(std::make_pair(ucell_idx_relative, - GintAtom(&atom, j, iat, ext_atom_bgrid_idx, ucell_idx_relative, tau_in_biggrid, orb))); + GintAtom(&atom, i, j, iat, ext_atom_bgrid_idx, ucell_idx_relative, tau_in_biggrid, orb, ucell_))); } if(biggrids_[bgrid_local_idx]->is_atom_on_bgrid(&r_to_atom.at(ucell_idx_relative))) { @@ -150,6 +153,47 @@ void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital ModuleBase::timer::tick("GintInfo", "init_atoms"); } +void 
GintInfo::init_trace_lo_(const UnitCell& ucell, const int nspin) +{ + this->trace_lo_ = std::vector(PARAM.globalv.nlocal, -1); + this->lgd_ = 0; + int iat = 0; + int iw_all = 0; + int iw_local = 0; + for (int it = 0; it < ucell.ntype; it++) + { + for (int ia = 0; ia < ucell.atoms[it].na; ia++) + { + if (is_atom_in_proc_[iat]) + { + int nw0 = ucell.atoms[it].nw; + if (nspin== 4) + { // added by zhengdy-soc, need to be double in soc + nw0 *= 2; + this->lgd_ += nw0; + } else { + this->lgd_ += nw0; + } + + for (int iw = 0; iw < nw0; iw++) + { + this->trace_lo_[iw_all] = iw_local; + ++iw_local; + ++iw_all; + } + } else { + // global index of atomic orbitals + iw_all += ucell.atoms[it].nw; + if (nspin == 4) + { + iw_all += ucell.atoms[it].nw; + } + } + ++iat; + } + } +} + void GintInfo::init_ijr_info_(const UnitCell& ucell, Grid_Driver& gd) { HContainer hr_gint_local(ucell.nat); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h index ceb669b635..ba2313657a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h @@ -34,10 +34,11 @@ class GintInfo const UnitCell& ucell, Grid_Driver& gd); // getter functions - std::vector>& get_biggrids() { return biggrids_; }; - std::shared_ptr get_bgrid_info() const { return biggrid_info_; }; + const std::vector>& get_biggrids() { return biggrids_; }; + const std::vector& get_trace_lo() const{ return trace_lo_; }; + int get_lgd() const { return lgd_; }; int get_nat() const { return ucell_->nat; }; // return the number of atoms in the unitcell - double get_local_mgrid_num() const { return localcell_info_->get_mgrids_num(); }; + int get_local_mgrid_num() const { return localcell_info_->get_mgrids_num(); }; double get_mgrid_volume() const { return meshgrid_info_->get_volume(); }; //========================================= @@ -50,6 +51,9 @@ class GintInfo // initialize the 
atoms void init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital* Phi); + // initialize trace_lo_ and lgd_ + void init_trace_lo_(const UnitCell& ucell, const int nspin); + // initialize the ijr_info void init_ijr_info_(const UnitCell& ucell, Grid_Driver& gd); @@ -85,6 +89,12 @@ class GintInfo // format for storing atomic pair information in hcontainer, used for initializing hcontainer std::vector ijr_info_; + // map the global index of atomic orbitals to local index + std::vector trace_lo_; + + // total num of atomic orbitals on this proc + int lgd_; + #ifdef __CUDA public: std::vector>& get_bgrid_batches() { return bgrid_batches_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp index b8924864c7..51a8bf0496 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp @@ -18,14 +18,13 @@ void PhiOperator::set_bgrid(std::shared_ptr biggrid) // init is_atom_on_mgrid_ and atoms_relative_coords_ const int atoms_num = biggrid_->get_atoms_num(); atoms_relative_coords_.resize(atoms_num); - is_atom_on_mgrid_.resize(atoms_num); + is_atom_on_mgrid_.resize(biggrid_->get_mgrids_num() * atoms_num); for(int i = 0; i < atoms_num; ++i) { biggrid_->set_atom_relative_coords(biggrid_->get_atom(i), atoms_relative_coords_[i]); - is_atom_on_mgrid_[i].resize(rows_); for(int j = 0; j < rows_; ++j) { - is_atom_on_mgrid_[i][j] = atoms_relative_coords_[i][j].norm() <= biggrid_->get_atom(i)->get_rcut(); + is_atom_on_mgrid_[i * rows_ + j] = atoms_relative_coords_[i][j].norm() <= biggrid_->get_atom(i)->get_rcut(); } } @@ -131,6 +130,86 @@ void PhiOperator::phi_dot_dphi_r( svl[0](2, 2) += szz * 2; } +void PhiOperator::cal_env_gamma( + const double* phi, + const double* wfc, + const vector& trace_lo, + double* rho) const +{ + for(int i = 0; i < biggrid_->get_atoms_num(); ++i) + { + const auto atom = 
biggrid_->get_atom(i); + const int iw_start = atom->get_start_iw(); + const int start_idx = atoms_startidx_[i]; + for(int j = 0; j < biggrid_->get_mgrids_num(); ++j) + { + if(is_atom_on_mgrid(i, j)) + { + double tmp = 0.0; + int iw_lo = trace_lo[iw_start]; + for(int iw = 0; iw < atom->get_nw(); ++iw, ++iw_lo) + { + tmp += phi[j * cols_ + start_idx + iw] * wfc[iw_lo]; + } + rho[meshgrids_local_idx_[j]] += tmp; + } + } + } +} + +void PhiOperator::cal_env_k( + const double* phi, + const std::complex* wfc, + const vector& trace_lo, + const int ik, + const int nspin, + const int npol, + const int lgd, + const std::vector& kvec_c, + const std::vector& kvec_d, + double* rho) const +{ + for(int i = 0; i < biggrid_->get_atoms_num(); ++i) + { + const auto atom = biggrid_->get_atom(i); + const int iw_start = atom->get_start_iw(); + const Vec3d R(atom->get_unitcell_idx()); + const double arg = (kvec_d[ik] * R) * ModuleBase::TWO_PI; + const std::complex kphase = std::complex(cos(arg), sin(arg)); + const int start_idx = atoms_startidx_[i]; + for(int j = 0; j < biggrid_->get_mgrids_num(); ++j) + { + if(is_atom_on_mgrid(i, j)) + { + std::complex tmp{0.0, 0.0}; + int phi_start_idx = j * cols_ + start_idx; + + int iw_lo = 0; + if (nspin == 4) // is it a simple add of 2 spins? 
+ { + for (int is = 0; is < 2; ++is) + { + iw_lo = trace_lo[iw_start] / npol + lgd / npol * is; + for (int iw = 0; iw < atom->get_nw(); ++iw, ++iw_lo) + { + tmp += std::complex(phi[iw], 0.0) * wfc[iw_lo] * kphase; + } + } + } + else + { + iw_lo = trace_lo[iw_start]; + for (int iw = 0; iw < atom->get_nw(); ++iw, ++iw_lo) + { + tmp += std::complex(phi[iw], 0.0) * wfc[iw_lo] * kphase; + } + } + rho[meshgrids_local_idx_[j]] += tmp.real(); + } + } + } +} + //=============================== // private methods @@ -150,7 +229,7 @@ void PhiOperator::init_atom_pair_start_end_idx_() int end_idx = -1; for(int mgrid_idx = 0; mgrid_idx < mgrids_num; ++mgrid_idx) { - if(is_atom_on_mgrid_[i][mgrid_idx] && is_atom_on_mgrid_[j][mgrid_idx]) + if(is_atom_on_mgrid(i, mgrid_idx) && is_atom_on_mgrid(j, mgrid_idx)) { start_idx = mgrid_idx; break; @@ -158,7 +237,7 @@ void PhiOperator::init_atom_pair_start_end_idx_() } for(int mgrid_idx = mgrids_num - 1; mgrid_idx >= 0; --mgrid_idx) { - if(is_atom_on_mgrid_[i][mgrid_idx] && is_atom_on_mgrid_[j][mgrid_idx]) + if(is_atom_on_mgrid(i, mgrid_idx) && is_atom_on_mgrid(j, mgrid_idx)) { end_idx = mgrid_idx; break; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h index 8fe9cbfc11..e25c3641d8 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h @@ -93,6 +93,24 @@ class PhiOperator const double* dphi_z, ModuleBase::matrix *svl) const; + void cal_env_gamma( + const double* phi, + const double* wfc, + const vector& trace_lo, + double* rho) const; + + void cal_env_k( + const double* phi, + const std::complex* wfc, + const vector& trace_lo, + const int ik, + const int nspin, + const int npol, + const int lgd, + const std::vector& kvec_c, + const std::vector& kvec_d, + double* rho) const; + private: void init_atom_pair_start_end_idx_(); @@ -105,6 +123,11 @@ class PhiOperator 
return atom_pair_start_end_idx_[(2 * biggrid_->get_atoms_num() - x + 1) * x / 2 + y]; }; + bool is_atom_on_mgrid(int atom_idx, int mgrid_idx) const + { + return is_atom_on_mgrid_[atom_idx * rows_ + mgrid_idx]; + } + // the row number of the phi matrix // rows_ = biggrid_->get_mgrids_num() int rows_; @@ -124,9 +147,8 @@ class PhiOperator std::vector> atoms_relative_coords_; // record whether the atom affects the meshgrid - // is_atom_on_mgrid_[i][j] = true if the ith atom affects the jth meshgrid, otherwise false - // FIXME,std::vector> is not a efficient data structure, we can use a 1D array to replace it. - std::vector> is_atom_on_mgrid_; + // is_atom_on_mgrid_[i * rows_ + j] = true if the ith atom affects jhe ith meshgrid, otherwise false + std::vector is_atom_on_mgrid_; // the start index of the phi of each atom std::vector atoms_startidx_; diff --git a/source/module_io/get_wf_lcao.cpp b/source/module_io/get_wf_lcao.cpp index 52a02bbd60..bd02097b4d 100644 --- a/source/module_io/get_wf_lcao.cpp +++ b/source/module_io/get_wf_lcao.cpp @@ -10,6 +10,11 @@ #include "module_io/write_wfc_r.h" #include "module_parameter/parameter.h" +#ifdef __NEW_GINT +#include "module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h" +#endif + Get_wf_lcao::Get_wf_lcao(const elecstate::ElecState* pes) { pes_ = pes; @@ -44,6 +49,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, int fermi_band = 0; prepare_get_wf(GlobalV::ofs_running, nelec, fermi_band); +#ifndef __NEW_GINT // allocate grid wave functions for gamma_only std::vector wfc_gamma_grid(nspin); for (int is = 0; is < nspin; ++is) @@ -54,6 +60,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, wfc_gamma_grid[is][ib] = new double[gg.gridt->lgd]; } } +#endif // for pw_wfc in G space psi::Psi> psi_g; @@ -61,9 +68,11 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // if (out_wfc_pw || out_wfc_r) psi_g.resize(nspin, nbands, kv.ngk[0]); +#ifndef __NEW_GINT const 
double mem_size = sizeof(double) * double(gg.gridt->lgd) * double(nbands) * double(nspin) / 1024.0 / 1024.0; ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size); ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "On-the-fly memory consumption (MB)", mem_size); +#endif int mode_norm = 0; const int norm_size = static_cast(out_wfc_norm.size()); @@ -83,34 +92,38 @@ void Get_wf_lcao::begin(const UnitCell& ucell, this->select_bands(nbands_istate, out_wfc_norm, nbands, nelec, mode_norm, fermi_band); // Calculate out_wfc_norm - for (int ib = 0; ib < nbands; ++ib) + for (int is = 0; is < nspin; ++is) { - if (bands_picked_[ib]) + psid->fix_k(is); +#ifndef __NEW_GINT + #ifdef __MPI + wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + #else + for (int i = 0; i < nbands; ++i) { - for (int is = 0; is < nspin; ++is) + for (int j = 0; j < nlocal; ++j) { - ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); - - psid->fix_k(is); -#ifdef __MPI - wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + wfc_gamma_grid[is][i][j] = psid[0](i, j); + } + } + #endif #else - // if not MPI enabled, it is the case psid holds a global matrix. - // use fix_k to switch between different spin channels (actually kpoints, + ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, pes_->charge->rho[is]); +#endif + for (int ib = 0; ib < nbands; ++ib) + { + if (bands_picked_[ib]) + { + #ifndef __NEW_GINT + ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); + // if not MPI enabled, it is the case psid holds a global matrix. 
+ // use fix_k to switch between different spin channels (actually kpoints, // because now the same kpoint in different spin channels are treated // as distinct kpoints) - - for (int i = 0; i < nbands; ++i) - { - for (int j = 0; j < nlocal; ++j) - { - wfc_gamma_grid[is][i][j] = psid[0](i, j); - } - } -#endif - gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); - + #else + gint_env.cal_env_band(ib); + #endif pes_->charge->save_rho_before_sum_band(); // pint out information @@ -156,35 +169,41 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // Set this->bands_picked_ according to the mode this->select_bands(nbands_istate, out_wfc_re_im, nbands, nelec, mode_re_im, fermi_band); - // Calculate out_wfc_re_im - for (int ib = 0; ib < nbands; ++ib) + if (out_wfc_pw || out_wfc_r) { - if (bands_picked_[ib]) + // Calculate out_wfc_re_im + for (int is = 0; is < nspin; ++is) { - std::cout << " Performing grid integral over real space grid for band " << ib + 1 << "..." << std::endl; - - for (int is = 0; is < nspin; ++is) + psid->fix_k(is); +#ifndef __NEW_GINT + #ifdef __MPI + wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + #else + // if not MPI enabled, it is the case psid holds a global matrix. use fix_k to switch between + // different spin channels (actually kpoints, because now the same kpoint in different spin channels + // are treated as distinct kpoints) + + for (int i = 0; i < nbands; ++i) { - ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); - - psid->fix_k(is); -#ifdef __MPI - wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); -#else - // if not MPI enabled, it is the case psid holds a global matrix. 
use fix_k to switch between - // different spin channels (actually kpoints, because now the same kpoint in different spin channels - // are treated as distinct kpoints) - - for (int i = 0; i < nbands; ++i) + for (int j = 0; j < nlocal; ++j) { - for (int j = 0; j < nlocal; ++j) - { - wfc_gamma_grid[is][i][j] = psid[0](i, j); - } + wfc_gamma_grid[is][i][j] = psid[0](i, j); } + } + #endif +#else + ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, pes_->charge->rho[is]); +#endif + for (int ib = 0; ib < nbands; ++ib) + { + if (bands_picked_[ib]) + { +#ifndef __NEW_GINT + ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); + gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); +#else + gint_env.cal_env_band(ib); #endif - - gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); pes_->charge->save_rho_before_sum_band(); @@ -234,6 +253,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } delete[] wfc_gamma_grid[is]; } +#endif return; } @@ -265,6 +285,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // allocate grid wave functions for multi-k const int nks = kv.get_nks(); std::vector**> wfc_k_grid(nks); +#ifndef __NEW_GINT for (int ik = 0; ik < nks; ++ik) { wfc_k_grid[ik] = new std::complex*[nbands]; @@ -273,11 +294,11 @@ void Get_wf_lcao::begin(const UnitCell& ucell, wfc_k_grid[ik][ib] = new std::complex[gk.gridt->lgd]; } } - const double mem_size = sizeof(std::complex) * double(gk.gridt->lgd) * double(nbands) * double(nks) / 1024.0 / 1024.0; ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size); ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "On-the-fly memory consumption (MB)", mem_size); +#endif // for pw_wfc in G space psi::Psi> psi_g; @@ -301,44 +322,53 @@ void Get_wf_lcao::begin(const UnitCell& ucell, this->select_bands(nbands_istate, out_wfc_norm, nbands, nelec, mode_norm, fermi_band); // Calculate out_wfc_norm - for (int ib = 0; ib < nbands; ++ib) + const int nspin0 = (nspin == 2) ? 
2 : 1; + for (int ik = 0; ik < nks; ++ik) // the loop of nspin0 is included { - if (bands_picked_[ib]) + const int ispin = kv.isk[ik]; + // 2d-to-grid conversion is unified into `wfc_2d_to_grid`. + psi->fix_k(ik); + +#ifndef __NEW_GINT + #ifdef __MPI // need to deal with NSPIN=4 !!!! + wfc_2d_to_grid(psi->get_pointer(), para_orb, wfc_k_grid[ik], gk.gridt->trace_lo); + #else + for (int i = 0; i < nbands; ++i) { - const int nspin0 = (nspin == 2) ? 2 : 1; - for (int ik = 0; ik < nks; ++ik) // the loop of nspin0 is included + for (int j = 0; j < nlocal; ++j) { - const int ispin = kv.isk[ik]; + wfc_k_grid[ik][i][j] = psi[0](i, j); + } + } + #endif +#else + ModuleGint::Gint_env_k gint_env(psi->get_pointer(), ¶_orb, kv.kvec_c, kv.kvec_d, + nbands, ik, PARAM.inp.nspin, PARAM.globalv.npol, pes_->charge->rho[ispin]); +#endif + + for (int ib = 0; ib < nbands; ++ib) + { + if (bands_picked_[ib]) + { +#ifndef __NEW_GINT ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[ispin], pw_wfc->nrxx); // terrible, you make changes on another instance's data??? - // 2d-to-grid conversion is unified into `wfc_2d_to_grid`. - psi->fix_k(ik); - -#ifdef __MPI // need to deal with NSPIN=4 !!!! 
- wfc_2d_to_grid(psi->get_pointer(), para_orb, wfc_k_grid[ik], gk.gridt->trace_lo); -#else - for (int i = 0; i < nbands; ++i) - { - for (int j = 0; j < nlocal; ++j) - { - wfc_k_grid[ik][i][j] = psi[0](i, j); - } - } -#endif // deal with NSPIN=4 gk.cal_env_k(ik, wfc_k_grid[ik][ib], pes_->charge->rho[ispin], kv.kvec_c, kv.kvec_d, ucell); - - // ik0 is the real k-point index, starting from 0 - int ik0 = kv.ik2iktot[ik]; - if (nspin == 2) - { - const int half_k = kv.get_nkstot() / 2; - if (ik0 >= half_k) - { - ik0 -= half_k; - } - } +#else + gint_env.cal_env_band(ib); +#endif + // ik0 is the real k-point index, starting from 0 + int ik0 = kv.ik2iktot[ik]; + if(nspin == 2) + { + const int half_k = kv.get_nkstot()/2; + if(ik0 >= half_k) + { + ik0 -= half_k; + } + } // pint out information std::stringstream ss_file; @@ -426,10 +456,11 @@ void Get_wf_lcao::begin(const UnitCell& ucell, std::stringstream ss_imag; ss_imag << global_out_dir << "wf" << ib + 1 << "s" << ispin + 1 << "k" << ik0 + 1 << "imag.cube"; ModuleIO::write_vdata_palgrid(pgrid, wfc_imag.data(), ispin, nspin, 0, ss_imag.str(), ef_tmp, &(ucell)); + } } } } - +#ifndef __NEW_GINT for (int ik = 0; ik < nks; ++ik) { for (int ib = 0; ib < nbands; ++ib) @@ -438,7 +469,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } delete[] wfc_k_grid[ik]; } - +#endif return; } From d79d418459c1feb13f426e63fed3772268ff923e Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sun, 1 Jun 2025 17:12:04 +0800 Subject: [PATCH 19/63] renew some notes --- source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp | 2 -- .../module_gint/temp_gint/gint_vl_metagga.cpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp index b0107bf064..7b8199422e 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp @@ -26,8 +26,6 @@ void 
Gint_vl::init_hr_gint_() void Gint_vl::cal_hr_gint_() { -// be careful!! -// each thread will have a copy of hr_gint_, this may cause a lot of memory usage #pragma omp parallel { PhiOperator phi_op; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp index fa651a89e1..b5b5222148 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp @@ -26,8 +26,6 @@ void Gint_vl_metagga::init_hr_gint_() void Gint_vl_metagga::cal_hr_gint_() { -// be careful!! -// each thread will have a copy of hr_gint_, this may cause a lot of memory usage #pragma omp parallel { PhiOperator phi_op; From 14b6949a4a348ef2b1b0dc8c28158b5f844b5c23 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Wed, 4 Jun 2025 17:14:16 +0800 Subject: [PATCH 20/63] simplify cuda_mem_wrapper.h --- .../temp_gint/kernel/cuda_mem_wrapper.h | 30 ++++--------------- .../temp_gint/kernel/phi_operator_gpu.cu | 22 +++++--------- 2 files changed, 13 insertions(+), 39 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h index c38ac39030..b0612c9de6 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h @@ -16,14 +16,12 @@ class CudaMemWrapper this->device_ptr_ = other.device_ptr_; this->host_ptr_ = other.host_ptr_; this->size_ = other.size_; - this->capacity_ = other.capacity_; this->malloc_host_ = other.malloc_host_; this->stream_ = other.stream_; other.device_ptr_ = nullptr; other.host_ptr_ = nullptr; other.size_ = 0; - other.capacity_ = 0; other.malloc_host_ = false; other.stream_ = 0; }; @@ -35,39 +33,36 @@ class CudaMemWrapper this->device_ptr_ = other.device_ptr_; this->host_ptr_ = other.host_ptr_; 
this->size_ = other.size_; - this->capacity_ = other.capacity_; this->malloc_host_ = other.malloc_host_; this->stream_ = other.stream_; other.device_ptr_ = nullptr; other.host_ptr_ = nullptr; other.size_ = 0; - other.capacity_ = 0; other.malloc_host_ = false; other.stream_ = 0; } return *this; }; - CudaMemWrapper(int capacity, + CudaMemWrapper(int size, cudaStream_t stream = 0, bool malloc_host = true) { - capacity_ = capacity; - size_ = capacity; + size_ = size; malloc_host_ = malloc_host; stream_ = stream; if (malloc_host) { - checkCuda(cudaMallocHost((void**)&host_ptr_, capacity * sizeof(T))); - memset(host_ptr_, 0, capacity * sizeof(T)); + checkCuda(cudaMallocHost((void**)&host_ptr_, size_* sizeof(T))); + memset(host_ptr_, 0, size_ * sizeof(T)); } else { host_ptr_ = nullptr; } - checkCuda(cudaMalloc((void**)&device_ptr_, capacity * sizeof(T))); - checkCuda(cudaMemset(device_ptr_, 0, capacity_ * sizeof(T))); + checkCuda(cudaMalloc((void**)&device_ptr_, size_ * sizeof(T))); + checkCuda(cudaMemset(device_ptr_, 0, size_ * sizeof(T))); }; ~CudaMemWrapper() @@ -165,25 +160,12 @@ class CudaMemWrapper T* get_host_ptr() { return host_ptr_; }; const T* get_device_ptr() const { return device_ptr_; }; const T* get_host_ptr() const { return host_ptr_; }; - - // Only supports setting size to a value less than or equal to capacity - void set_size(int new_size) - { - if (new_size > capacity_) - { - ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "New size exceeds capacity, cannot resize."); - } - size_ = new_size; - }; - int get_size() const { return size_; }; - int get_capacity() const { return capacity_; }; private: T* device_ptr_ = nullptr; T* host_ptr_ = nullptr; int size_ = 0; - int capacity_ = 0; bool malloc_host_ = false; cudaStream_t stream_ = 0; }; \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu index 4a265c6b1d..00f1307a8f 
100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu @@ -38,14 +38,6 @@ PhiOperatorGpu::~PhiOperatorGpu() void PhiOperatorGpu::set_bgrid_batch(std::shared_ptr bgrid_batch) { bgrid_batch_ = bgrid_batch; - // The set_size here is to determine how many bytes to transfer in the subsequent copy_host_to_device - atoms_num_info_.set_size(bgrid_batch->get_batch_size()); - bgrids_phi_len_.set_size(bgrid_batch->get_batch_size()); - bgrids_phi_start_.set_size(bgrid_batch->get_batch_size()); - atoms_iat_.set_size(bgrid_batch->get_atoms_num()); - atoms_bgrids_rcoords_.set_size(bgrid_batch->get_atoms_num()); - atoms_phi_start_.set_size(bgrid_batch->get_atoms_num()); - mgrids_local_idx_batch_.set_size(bgrid_batch->get_batch_size() * mgrids_num_); auto atoms_num_info_h = atoms_num_info_.get_host_ptr(); auto bgrids_phi_len_h = bgrids_phi_len_.get_host_ptr(); auto bgrids_phi_start_h = bgrids_phi_start_.get_host_ptr(); @@ -84,13 +76,13 @@ void PhiOperatorGpu::set_bgrid_batch(std::shared_ptr bgrid_batch) i++; } - atoms_num_info_.copy_host_to_device_async(); - bgrids_phi_len_.copy_host_to_device_async(); - bgrids_phi_start_.copy_host_to_device_async(); - atoms_iat_.copy_host_to_device_async(); - atoms_bgrids_rcoords_.copy_host_to_device_async(); - atoms_phi_start_.copy_host_to_device_async(); - mgrids_local_idx_batch_.copy_host_to_device_async(); + atoms_num_info_.copy_host_to_device_async(bgrid_batch->get_batch_size()); + bgrids_phi_len_.copy_host_to_device_async(bgrid_batch->get_batch_size()); + bgrids_phi_start_.copy_host_to_device_async(bgrid_batch->get_batch_size()); + atoms_iat_.copy_host_to_device_async(bgrid_batch->get_atoms_num()); + atoms_bgrids_rcoords_.copy_host_to_device_async(bgrid_batch->get_atoms_num()); + atoms_phi_start_.copy_host_to_device_async(bgrid_batch->get_atoms_num()); + 
mgrids_local_idx_batch_.copy_host_to_device_async(bgrid_batch->get_batch_size() * mgrids_num_); checkCuda(cudaEventRecord(event_, stream_)); } From c2b682167901db40bd18d875e6207bf5061edf1f Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 5 Jun 2025 22:08:54 +0800 Subject: [PATCH 21/63] update cuda_mem_wrapper.h --- .../temp_gint/kernel/cuda_mem_wrapper.h | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h index b0612c9de6..e5aa721b7f 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h @@ -45,7 +45,7 @@ class CudaMemWrapper return *this; }; - CudaMemWrapper(int size, + CudaMemWrapper(size_t size, cudaStream_t stream = 0, bool malloc_host = true) { @@ -70,7 +70,7 @@ class CudaMemWrapper free(); }; - void copy_host_to_device_sync(int size) + void copy_host_to_device_sync(size_t size) { if (host_ptr_ == nullptr) { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to device."); } @@ -82,7 +82,7 @@ class CudaMemWrapper copy_host_to_device_sync(size_); }; - void copy_host_to_device_async(int size) + void copy_host_to_device_async(size_t size) { if (host_ptr_ == nullptr) { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to device."); } @@ -94,7 +94,7 @@ class CudaMemWrapper copy_host_to_device_async(size_); }; - void copy_device_to_host_sync(int size) + void copy_device_to_host_sync(size_t size) { if (host_ptr_ == nullptr) { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to host."); } @@ -106,7 +106,7 @@ class CudaMemWrapper copy_device_to_host_sync(size_); }; - void copy_device_to_host_async(int size) + void copy_device_to_host_async(size_t size) { if (host_ptr_ == nullptr) { 
ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to host."); } @@ -118,7 +118,7 @@ class CudaMemWrapper copy_device_to_host_async(size_); }; - void memset_device_sync(const int size, const int value = 0) + void memset_device_sync(const size_t size, const int value = 0) { checkCuda(cudaMemset(device_ptr_, value, size * sizeof(T))); }; @@ -128,7 +128,7 @@ class CudaMemWrapper memset_device_sync(size_, value); }; - void memset_device_async(const int size, const int value = 0) + void memset_device_async(const size_t size, const int value = 0) { checkCuda(cudaMemsetAsync(device_ptr_, value, size * sizeof(T), stream_)); }; @@ -138,7 +138,7 @@ class CudaMemWrapper memset_device_async(size_, value); }; - void memset_host(const int size, const int value = 0) + void memset_host(const size_t size, const int value = 0) { if (host_ptr_ == nullptr) { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot memset host."); } @@ -160,12 +160,12 @@ class CudaMemWrapper T* get_host_ptr() { return host_ptr_; }; const T* get_device_ptr() const { return device_ptr_; }; const T* get_host_ptr() const { return host_ptr_; }; - int get_size() const { return size_; }; + size_t get_size() const { return size_; }; private: T* device_ptr_ = nullptr; T* host_ptr_ = nullptr; - int size_ = 0; + size_t size_ = 0; bool malloc_host_ = false; cudaStream_t stream_ = 0; }; \ No newline at end of file From 052c3c95927eb5263f1ad4e0c72860d3b6ad81a6 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 6 Jun 2025 00:56:28 +0800 Subject: [PATCH 22/63] fix get_wf_lcao.cpp --- source/module_io/get_wf_lcao.cpp | 78 ++++++++++++++++---------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/source/module_io/get_wf_lcao.cpp b/source/module_io/get_wf_lcao.cpp index bd02097b4d..5f8307c269 100644 --- a/source/module_io/get_wf_lcao.cpp +++ b/source/module_io/get_wf_lcao.cpp @@ -99,6 +99,10 @@ void Get_wf_lcao::begin(const UnitCell& ucell, #ifdef __MPI 
wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); #else + // if not MPI enabled, it is the case psid holds a global matrix. + // use fix_k to switch between different spin channels (actually kpoints, + // because now the same kpoint in different spin channels are treated + // as distinct kpoints) for (int i = 0; i < nbands; ++i) { for (int j = 0; j < nlocal; ++j) @@ -108,7 +112,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } #endif #else - ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, pes_->charge->rho[is]); + ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, pes_->charge->rho[is]); #endif for (int ib = 0; ib < nbands; ++ib) { @@ -116,10 +120,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell, { #ifndef __NEW_GINT ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); - // if not MPI enabled, it is the case psid holds a global matrix. - // use fix_k to switch between different spin channels (actually kpoints, - // because now the same kpoint in different spin channels are treated - // as distinct kpoints) gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); #else gint_env.cal_env_band(ib); @@ -169,42 +169,38 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // Set this->bands_picked_ according to the mode this->select_bands(nbands_istate, out_wfc_re_im, nbands, nelec, mode_re_im, fermi_band); - if (out_wfc_pw || out_wfc_r) + // Calculate out_wfc_re_im + for (int is = 0; is < nspin; ++is) { - // Calculate out_wfc_re_im - for (int is = 0; is < nspin; ++is) - { - psid->fix_k(is); + psid->fix_k(is); #ifndef __NEW_GINT #ifdef __MPI - wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); #else - // if not MPI enabled, it is the case psid holds a global matrix. 
use fix_k to switch between - // different spin channels (actually kpoints, because now the same kpoint in different spin channels - // are treated as distinct kpoints) - - for (int i = 0; i < nbands; ++i) + // if not MPI enabled, it is the case psid holds a global matrix. use fix_k to switch between + // different spin channels (actually kpoints, because now the same kpoint in different spin channels + // are treated as distinct kpoints) + for (int i = 0; i < nbands; ++i) + { + for (int j = 0; j < nlocal; ++j) { - for (int j = 0; j < nlocal; ++j) - { - wfc_gamma_grid[is][i][j] = psid[0](i, j); - } + wfc_gamma_grid[is][i][j] = psid[0](i, j); } + } #endif #else - ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, pes_->charge->rho[is]); + ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, pes_->charge->rho[is]); #endif - for (int ib = 0; ib < nbands; ++ib) + for (int ib = 0; ib < nbands; ++ib) + { + if (bands_picked_[ib]) { - if (bands_picked_[ib]) - { #ifndef __NEW_GINT - ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); - gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); + ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); + gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); #else - gint_env.cal_env_band(ib); + gint_env.cal_env_band(ib); #endif - pes_->charge->save_rho_before_sum_band(); const double ef_tmp = this->pes_->eferm.get_efval(is); @@ -245,6 +241,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL, out_wfc_pw, PARAM.inp.ecutwfc, global_out_dir,psi_g, kv, pw_wfc, GlobalV::ofs_running); +#ifndef __NEW_GINT for (int is = 0; is < nspin; ++is) { for (int ib = 0; ib < nbands; ++ib) @@ -294,6 +291,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, wfc_k_grid[ik][ib] = new std::complex[gk.gridt->lgd]; } } + const double mem_size = sizeof(std::complex) * double(gk.gridt->lgd) * double(nbands) * double(nks) / 
1024.0 / 1024.0; ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size); @@ -321,7 +319,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // Set this->bands_picked_ according to the mode this->select_bands(nbands_istate, out_wfc_norm, nbands, nelec, mode_norm, fermi_band); - // Calculate out_wfc_norm + // Calculate out_wfc_norm const int nspin0 = (nspin == 2) ? 2 : 1; for (int ik = 0; ik < nks; ++ik) // the loop of nspin0 is included { @@ -359,16 +357,17 @@ void Get_wf_lcao::begin(const UnitCell& ucell, #else gint_env.cal_env_band(ib); #endif - // ik0 is the real k-point index, starting from 0 - int ik0 = kv.ik2iktot[ik]; - if(nspin == 2) - { - const int half_k = kv.get_nkstot()/2; - if(ik0 >= half_k) - { - ik0 -= half_k; - } - } + + // ik0 is the real k-point index, starting from 0 + int ik0 = kv.ik2iktot[ik]; + if (nspin == 2) + { + const int half_k = kv.get_nkstot() / 2; + if (ik0 >= half_k) + { + ik0 -= half_k; + } + } // pint out information std::stringstream ss_file; @@ -456,7 +455,6 @@ void Get_wf_lcao::begin(const UnitCell& ucell, std::stringstream ss_imag; ss_imag << global_out_dir << "wf" << ib + 1 << "s" << ispin + 1 << "k" << ik0 + 1 << "imag.cube"; ModuleIO::write_vdata_palgrid(pgrid, wfc_imag.data(), ispin, nspin, 0, ss_imag.str(), ef_tmp, &(ucell)); - } } } } From 6a22b90d52bb19a48c363704bda76b7143595340 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 10 Jun 2025 16:06:32 +0800 Subject: [PATCH 23/63] fix assert error --- .../module_gint/temp_gint/gint_common.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp index 7a8d54551d..72bd647516 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp @@ -136,10 +136,15 @@ void transfer_dm_2d_to_gint( std::vector>> dm_gint) { // To check whether input 
parameter dm_2d has been initialized -#ifdef __DEBUG - assert(PARAM.inp.nspin == dm.size() + assert(PARAM.inp.nspin == dm_gint.size() && "The size of dm should be equal to the number of spins!"); -#endif + if(PARAM.inp.nspin != 4) + { + assert(dm.size() == PARAM.inp.nspin); + } else + { + assert(dm.size() == 1); + } if (PARAM.inp.nspin != 4) { From f1d19461aef47e32cbf6c939a41e6d33fe720b6e Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 10 Jun 2025 16:12:20 +0800 Subject: [PATCH 24/63] add gint_info init for lcao_others --- source/source_esolver/lcao_others.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/source/source_esolver/lcao_others.cpp b/source/source_esolver/lcao_others.cpp index bd30d3748d..beed83eaec 100644 --- a/source/source_esolver/lcao_others.cpp +++ b/source/source_esolver/lcao_others.cpp @@ -124,6 +124,26 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) dpsi_u, d2psi_u, PARAM.inp.nstream); + #ifdef __NEW_GINT + auto gint_info = std::make_shared( + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbz, + this->pw_rho->nx, + this->pw_rho->ny, + this->pw_rho->nz, + 0, + 0, + this->pw_big->nbzp_start, + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbzp, + orb_.Phi, + ucell, + this->gd); + ModuleGint::Gint::set_gint_info(gint_info); + #endif + psi_u.clear(); psi_u.shrink_to_fit(); dpsi_u.clear(); From 81c8f6b09aa1c7d5c8aa9ca735c2269dbd39ecd8 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 10 Jun 2025 16:17:46 +0800 Subject: [PATCH 25/63] refactor output_dHR --- .../hamilt_lcaodft/spar_dh.cpp | 21 ++++++++++++++++++- source/module_io/write_HS_R.cpp | 18 ---------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp index fddf2e584d..2bfd7d9958 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp 
@@ -106,8 +106,27 @@ void sparse_format::cal_dH(const UnitCell& ucell, delete[] fsr_dh.DHloc_fixedR_y; delete[] fsr_dh.DHloc_fixedR_z; - gint_k.cal_dvlocal_R_sparseMatrix(current_spin, sparse_thr, HS_Arrays, &pv, ucell, grid); + if(PARAM.inp.nspin==2) + { + gint_k.allocate_pvdpR(); + // note: some MPI process will not have grids when MPI cores are too + // many, v_eff in these processes are empty + const double* vr_eff1 + = v_eff.nc * v_eff.nr > 0 ? &(v_eff(cspin, 0)) : nullptr; + if (!PARAM.globalv.gamma_only_local) + { + if (PARAM.inp.vl_in_h) + { + Gint_inout inout(vr_eff1, + cspin, + Gint_Tools::job_type::dvlocal); + gint_k.cal_gint(&inout); + } + } + gint_k.cal_dvlocal_R_sparseMatrix(current_spin, sparse_thr, HS_Arrays, &pv, ucell, grid); + gint_k.destroy_pvdpR(); + } return; } diff --git a/source/module_io/write_HS_R.cpp b/source/module_io/write_HS_R.cpp index ebd91a3d35..57b602ca68 100644 --- a/source/module_io/write_HS_R.cpp +++ b/source/module_io/write_HS_R.cpp @@ -169,22 +169,6 @@ void ModuleIO::output_dHR(const int& istep, { for (int cspin = 0; cspin < 2; cspin++) { - // note: some MPI process will not have grids when MPI cores are too - // many, v_eff in these processes are empty - const double* vr_eff1 - = v_eff.nc * v_eff.nr > 0 ? 
&(v_eff(cspin, 0)) : nullptr; - - if (!PARAM.globalv.gamma_only_local) - { - if (PARAM.inp.vl_in_h) - { - Gint_inout inout(vr_eff1, - cspin, - Gint_Tools::job_type::dvlocal); - gint_k.cal_gint(&inout); - } - } - sparse_format::cal_dH(ucell, pv, HS_Arrays, @@ -201,8 +185,6 @@ void ModuleIO::output_dHR(const int& istep, sparse_format::destroy_dH_R_sparse(HS_Arrays); - gint_k.destroy_pvdpR(); - ModuleBase::timer::tick("ModuleIO", "output_dHR"); return; } From f130cde551e591937e46b7b9f891ed197a97cdc7 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Wed, 11 Jun 2025 21:28:18 +0800 Subject: [PATCH 26/63] add gint_dvlocal to new gint module --- .../hamilt_lcaodft/spar_dh.cpp | 20 +- .../hamilt_lcaodft/spar_dh.h | 1 + .../module_gint/CMakeLists.txt | 1 + .../module_hamilt_lcao/module_gint/gint_k.h | 2 +- .../module_gint/gint_k_sparse1.cpp | 1 - .../module_gint/temp_gint/gint_dvlocal.cpp | 265 ++++++++++++++++++ .../module_gint/temp_gint/gint_dvlocal.h | 65 +++++ .../module_gint/temp_gint/gint_interface.cpp | 19 ++ .../module_gint/temp_gint/gint_interface.h | 13 +- source/module_io/write_HS_R.cpp | 2 + 10 files changed, 382 insertions(+), 7 deletions(-) create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp create mode 100644 source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp index 2bfd7d9958..dba1f28393 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp @@ -2,6 +2,7 @@ #include "module_parameter/parameter.h" #include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" #include void sparse_format::cal_dS(const UnitCell& ucell, @@ -49,7 +50,6 @@ delete[] fsr_dh.DHloc_fixedR_y; delete[] fsr_dh.DHloc_fixedR_z; return; } - void sparse_format::cal_dH(const UnitCell& ucell, const 
Parallel_Orbitals& pv, LCAO_HS_Arrays& HS_Arrays, @@ -58,6 +58,7 @@ void sparse_format::cal_dH(const UnitCell& ucell, const LCAO_Orbitals& orb, const int& current_spin, const double& sparse_thr, + const ModuleBase::matrix& v_eff, Gint_k& gint_k) { ModuleBase::TITLE("sparse_format", "cal_dH"); @@ -108,24 +109,35 @@ void sparse_format::cal_dH(const UnitCell& ucell, if(PARAM.inp.nspin==2) { +#ifndef __NEW_GINT gint_k.allocate_pvdpR(); // note: some MPI process will not have grids when MPI cores are too // many, v_eff in these processes are empty const double* vr_eff1 - = v_eff.nc * v_eff.nr > 0 ? &(v_eff(cspin, 0)) : nullptr; + = v_eff.nc * v_eff.nr > 0 ? &(v_eff(current_spin, 0)) : nullptr; if (!PARAM.globalv.gamma_only_local) { if (PARAM.inp.vl_in_h) { Gint_inout inout(vr_eff1, - cspin, - Gint_Tools::job_type::dvlocal); + current_spin, + Gint_Tools::job_type::dvlocal); gint_k.cal_gint(&inout); } } gint_k.cal_dvlocal_R_sparseMatrix(current_spin, sparse_thr, HS_Arrays, &pv, ucell, grid); gint_k.destroy_pvdpR(); +#else + const double* vr_eff1 + = v_eff.nc * v_eff.nr > 0 ? 
&(v_eff(current_spin, 0)) : nullptr; + if (!PARAM.globalv.gamma_only_local) + { + ModuleGint::cal_dvlocal_R_sparseMatrix( + PARAM.inp.nspin, PARAM.globalv.npol, current_spin, PARAM.globalv.nlocal, + sparse_thr, vr_eff1, pv, ucell, grid, HS_Arrays); + } +#endif } return; } diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h index eeddc06740..a05872079d 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h +++ b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h @@ -19,6 +19,7 @@ void cal_dH(const UnitCell& ucell, const LCAO_Orbitals& orb, const int& current_spin, const double& sparse_thr, + const ModuleBase::matrix& v_eff, Gint_k& gint_k); // calculated the derivative of the overlap matrix: diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 1f0001f405..5e03b11cf5 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -47,6 +47,7 @@ if(NEW_GINT) temp_gint/gint_fvl_meta.cpp temp_gint/gint_env_gamma.cpp temp_gint/gint_env_k.cpp + temp_gint/gint_dvlocal.cpp temp_gint/localcell_info.cpp temp_gint/phi_operator.cpp temp_gint/set_ddphi.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_k.h b/source/module_hamilt_lcao/module_gint/gint_k.h index 4aeebefc27..aafa2ef5ba 100644 --- a/source/module_hamilt_lcao/module_gint/gint_k.h +++ b/source/module_hamilt_lcao/module_gint/gint_k.h @@ -67,7 +67,7 @@ class Gint_k : public Gint { Abfs::Vector3_Order, std::map>>>& pvdpR_soc_sparseMatrix, - LCAO_HS_Arrays& HS_Arrays, + LCAO_HS_Arrays& HS_arrays, const Parallel_Orbitals* pv); void cal_dvlocal_R_sparseMatrix(const int& current_spin, diff --git a/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp b/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp index 4275f9aa51..19216513c0 100644 --- a/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp 
+++ b/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp @@ -337,7 +337,6 @@ void Gint_k::cal_dvlocal_R_sparseMatrix(const int& current_spin, std::map, std::map>>> pvdpRz_soc_sparseMatrix; - int lgd = 0; double temp_value_double; std::complex temp_value_complex; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp new file mode 100644 index 0000000000..1254d59e21 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp @@ -0,0 +1,265 @@ +#include +#include "gint_dvlocal.h" +#include "phi_operator.h" +#include "module_base/parallel_reduce.h" + +namespace ModuleGint +{ + +void Gint_dvlocal::cal_dvlocal() +{ + init_hr_gint_(); + cal_hr_gint_(); +} + +void Gint_dvlocal::init_hr_gint_() +{ + pvdpRx = gint_info_->get_hr(); + pvdpRy = gint_info_->get_hr(); + pvdpRz = gint_info_->get_hr(); +} + +void Gint_dvlocal::cal_hr_gint_() +{ +#pragma omp parallel + { + PhiOperator phi_op; + std::vector phi; + std::vector phi_vldr3; + std::vector dphi_x; + std::vector dphi_y; + std::vector dphi_z; +#pragma omp for schedule(dynamic) + for(const auto& biggrid: gint_info_->get_biggrids()) + { + if(biggrid->get_atoms().empty()) + { + continue; + } + phi_op.set_bgrid(biggrid); + const int phi_len = phi_op.get_rows() * phi_op.get_cols(); + phi.resize(phi_len); + phi_vldr3.resize(phi_len); + dphi_x.resize(phi_len); + dphi_y.resize(phi_len); + dphi_z.resize(phi_len); + phi_op.set_phi_dphi(phi.data(), dphi_x.data(), dphi_y.data(), dphi_z.data()); + phi_op.phi_mul_vldr3(vr_eff_, dr3_, phi.data(), phi_vldr3.data()); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_x.data(), *pvdpRx, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_y.data(), *pvdpRy, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_z.data(), *pvdpRz, PhiOperator::Triangular_Matrix::Upper); + } + } +} + +void 
Gint_dvlocal::cal_dvlocal_R_sparseMatrix( + const int nspin, + const int cspin, + const int nlocal, + const double sparse_thr, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays) +{ + std::map, std::map>> pvdpRx_sparseMatrix; + std::map, std::map>> pvdpRy_sparseMatrix; + std::map, std::map>> pvdpRz_sparseMatrix; + + double temp_value_double; + + Vec3d tau1, dtau; + for (int iap = 0; iap < pvdpRx->size_atom_pairs(); iap++) + { + const auto& ap = pvdpRx->get_atom_pair(iap); + const int iat1 = ap.get_atom_i(); + const int iat2 = ap.get_atom_j(); + const int it1 = ucell.iat2it[iat1]; + const int it2 = ucell.iat2it[iat2]; + const Atom* atom1 = &ucell.atoms[it1]; + const Atom* atom2 = &ucell.atoms[it2]; + const int start1 = ucell.itiaiw2iwt(it1, ucell.iat2ia[iat1], 0); + const int start2 = ucell.itiaiw2iwt(it2, ucell.iat2ia[iat2], 0); + + for (int ir = 0; ir < ap.get_R_size(); ir++) + { + const ModuleBase::Vector3 R = ap.get_R_index(ir); + Abfs::Vector3_Order dR(R.x, R.y, R.z); + double* p_pvdpRx = pvdpRx->get_atom_pair(iap).get_pointer(ir); + double* p_pvdpRy = pvdpRy->get_atom_pair(iap).get_pointer(ir); + double* p_pvdpRz = pvdpRz->get_atom_pair(iap).get_pointer(ir); + + for (int iw = 0; iw < atom1->nw * npol_; iw++) + { + for (int iw2 = 0; iw2 < atom2->nw * npol_; iw2++) + { + const int nw = atom2->nw; + const int mug0 = iw / npol_; + const int nug0 = iw2 / npol_; + const int iw_nowg = mug0 * nw + nug0; + + double temp_value = p_pvdpRx[iw_nowg]; + if (std::abs(temp_value) > sparse_thr) + { + pvdpRx_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value; + } + temp_value = p_pvdpRy[iw_nowg]; + if (std::abs(temp_value) > sparse_thr) + { + pvdpRy_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value; + } + temp_value = p_pvdpRz[iw_nowg]; + if (std::abs(temp_value) > sparse_thr) + { + pvdpRz_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value; + } + } + } + } + } + 
distribute_pvdpR_sparseMatrix(cspin, 0, nlocal, sparse_thr, pvdpRx_sparseMatrix, pv, hs_arrays); + distribute_pvdpR_sparseMatrix(cspin, 1, nlocal, sparse_thr, pvdpRy_sparseMatrix, pv, hs_arrays); + distribute_pvdpR_sparseMatrix(cspin, 2, nlocal, sparse_thr, pvdpRz_sparseMatrix, pv, hs_arrays); +} + + +void Gint_dvlocal::distribute_pvdpR_sparseMatrix( + const int cspin, + const int dim, + const int nlocal, + const double sparse_threshold, + const std::map, + std::map>>& + pvdpR_sparseMatrix, + const Parallel_Orbitals& pv, + LCAO_HS_Arrays& hs_arrays) +{ + int total_R_num = hs_arrays.all_R_coor.size(); + std::vector nonzero_num(total_R_num); + std::vector minus_nonzero_num(total_R_num); + int count = 0; + for (const auto& R_coor: hs_arrays.all_R_coor) + { + auto iter = pvdpR_sparseMatrix.find(R_coor); + if (iter != pvdpR_sparseMatrix.end()) + { + for (auto& row_loop: iter->second) + { + nonzero_num[count] += row_loop.second.size(); + } + } + + auto minus_R_coor = -1 * R_coor; + + iter = pvdpR_sparseMatrix.find(minus_R_coor); + if (iter != pvdpR_sparseMatrix.end()) + { + for (auto& row_loop: iter->second) + { + minus_nonzero_num[count] += row_loop.second.size(); + } + } + count++; + } + + Parallel_Reduce::reduce_all(nonzero_num.data(), total_R_num); + Parallel_Reduce::reduce_all(minus_nonzero_num.data(), total_R_num); + + std::vector tmp(nlocal); + count = 0; + + const std::vector& trace_lo = gint_info_->get_trace_lo(); + for (const auto& R_coor: hs_arrays.all_R_coor) + { + if (nonzero_num[count] != 0 || minus_nonzero_num[count] != 0) + { + auto minus_R_coor = -1 * R_coor; + + for (int row = 0; row < nlocal; ++row) + { + tmp.assign(tmp.size(), 0); + + auto iter = pvdpR_sparseMatrix.find(R_coor); + if (iter != pvdpR_sparseMatrix.end()) + { + + if (trace_lo[row] >= 0) + { + auto row_iter = iter->second.find(row); + if (row_iter != iter->second.end()) + { + for (auto& value: row_iter->second) + { + tmp[value.first] = value.second; + } + } + } + } + + auto minus_R_iter = 
pvdpR_sparseMatrix.find(minus_R_coor); + if (minus_R_iter != pvdpR_sparseMatrix.end()) + { + for (int col = 0; col < row; ++col) + { + if (trace_lo[col] >= 0) + { + auto row_iter = minus_R_iter->second.find(col); + if (row_iter != minus_R_iter->second.end()) + { + auto col_iter = row_iter->second.find(row); + if (col_iter != row_iter->second.end()) + { + tmp[col] = col_iter->second; + } + } + } + } + } + + Parallel_Reduce::reduce_pool(tmp.data(), nlocal); + + if (pv.global2local_row(row) >= 0) + { + for (int col = 0; col < nlocal; ++col) + { + if (pv.global2local_col(col) >= 0) + { + if (std::abs(tmp[col]) > sparse_threshold) + { + if (dim == 0) + { + double& value = hs_arrays.dHRx_sparse[cspin][R_coor][row][col]; + value += tmp[col]; + if (std::abs(value) <= sparse_threshold) + { + hs_arrays.dHRx_sparse[cspin][R_coor][row].erase(col); + } + } + if (dim == 1) + { + double& value = hs_arrays.dHRy_sparse[cspin][R_coor][row][col]; + value += tmp[col]; + if (std::abs(value) <= sparse_threshold) + { + hs_arrays.dHRy_sparse[cspin][R_coor][row].erase(col); + } + } + if (dim == 2) + { + double& value = hs_arrays.dHRz_sparse[cspin][R_coor][row][col]; + value += tmp[col]; + if (std::abs(value) <= sparse_threshold) + { + hs_arrays.dHRz_sparse[cspin][R_coor][row].erase(col); + } + } + } + } + } + } + } + } + count++; + } +} +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h new file mode 100644 index 0000000000..f61ee8de00 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h @@ -0,0 +1,65 @@ +#pragma once +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_HS_arrays.hpp" +#include "module_base/abfs-vector3_order.h" +#include "gint.h" +#include "gint_info.h" + +namespace ModuleGint +{ + +class Gint_dvlocal : public Gint +{ + public: + Gint_dvlocal( + 
const double* vr_eff, + const int nspin, + const int npol) + : vr_eff_(vr_eff), nspin_(nspin), npol_(npol), dr3_(gint_info_->get_mgrid_volume()) + { + assert(nspin_ == 2); // currently only npin == 2 is supported + } + + void cal_dvlocal(); + + void cal_dvlocal_R_sparseMatrix( + const int nspin, + const int cspin, + const int nlocal, + const double sparse_thr, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays); + + private: + void init_hr_gint_(); + + void cal_hr_gint_(); + + void distribute_pvdpR_sparseMatrix( + const int cspin, + const int dim, + const int nlocal, + const double sparse_threshold, + const std::map, + std::map>>& + pvdpR_sparseMatrix, + const Parallel_Orbitals& pv, + LCAO_HS_Arrays& HS_Arrays); + + // input + const double* vr_eff_; + int nspin_; + int npol_; + + // intermediate variables + double dr3_; + std::shared_ptr> pvdpRx; + std::shared_ptr> pvdpRy; + std::shared_ptr> pvdpRz; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index dd8ba3c9e4..091110c6c4 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -192,4 +192,23 @@ void cal_gint_fvl_meta( ModuleBase::timer::tick("Gint", "cal_gint_fvl_meta"); } +void cal_dvlocal_R_sparseMatrix( + const int nspin, + const int npol, + const int current_spin, + const int nlocal, + const double sparse_thr, + const double* vr_eff, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays) +{ + Gint_dvlocal gint_dvlocal(vr_eff, nspin, npol); + gint_dvlocal.cal_dvlocal(); + gint_dvlocal.cal_dvlocal_R_sparseMatrix( + nspin, current_spin, nlocal, sparse_thr, + pv, ucell, gdriver, hs_arrays); +} + } // namespace ModuleGint \ No newline at end of file diff 
--git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h index cec6b12e01..0d064be2f2 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h @@ -2,7 +2,7 @@ #include #include "module_hamilt_lcao/module_hcontainer/hcontainer.h" #include "gint_type.h" - +#include "gint_dvlocal.h" namespace ModuleGint { @@ -54,6 +54,17 @@ void cal_gint_fvl_meta( ModuleBase::matrix* fvl, ModuleBase::matrix* svl); +void cal_dvlocal_R_sparseMatrix( + const int nspin, + const int npol, + const int current_spin, + const int nlocal, + const double sparse_thr, + const double* vr_eff, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays); } // namespace ModuleGint \ No newline at end of file diff --git a/source/module_io/write_HS_R.cpp b/source/module_io/write_HS_R.cpp index 57b602ca68..01907432cb 100644 --- a/source/module_io/write_HS_R.cpp +++ b/source/module_io/write_HS_R.cpp @@ -163,6 +163,7 @@ void ModuleIO::output_dHR(const int& istep, orb, cspin, sparse_thr, + v_eff, gint_k); } else if (nspin == 2) @@ -177,6 +178,7 @@ void ModuleIO::output_dHR(const int& istep, orb, cspin, sparse_thr, + v_eff, gint_k); } } From bbfe636e442765d182973ae98fe617d785339df1 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 12 Jun 2025 10:06:02 +0800 Subject: [PATCH 27/63] add namespace --- .../module_gint/temp_gint/gint_helper.h | 7 ++++++- .../module_gint/temp_gint/gint_type.h | 13 ++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h index 687c37df50..8288028691 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h @@ -5,6 +5,9 @@ 
#include "gint_type.h" #include "source_base/timer.h" +namespace ModuleGint +{ + template std::shared_ptr toConstSharedPtr(std::shared_ptr ptr) { return std::static_pointer_cast(ptr); @@ -60,4 +63,6 @@ inline int floor_div(const int a, const int b) inline int ceil_div(const int a, const int b) { return a / b + (a % b != 0 && (a ^ b) > 0); -}; \ No newline at end of file +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h index 9cf623765b..4d1b2e8537 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h @@ -4,9 +4,12 @@ #include "source_base/vector3.h" #include "source_base/matrix3.h" -using Matrix3 = ModuleBase::Matrix3; -using Vec3d = ModuleBase::Vector3; -using Vec3i = ModuleBase::Vector3; +namespace ModuleGint +{ + using Matrix3 = ModuleBase::Matrix3; + using Vec3d = ModuleBase::Vector3; + using Vec3i = ModuleBase::Vector3; -template -using HContainer = hamilt::HContainer; \ No newline at end of file + template + using HContainer = hamilt::HContainer; +} \ No newline at end of file From b62e31184d61b50c1473d4db62d8201bff8abf10 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 12 Jun 2025 21:37:03 +0800 Subject: [PATCH 28/63] remove some shared_ptr --- .../module_gint/temp_gint/gint_common.cpp | 82 +++++++++---------- .../module_gint/temp_gint/gint_common.h | 14 ++-- .../module_gint/temp_gint/gint_dvlocal.cpp | 16 ++-- .../module_gint/temp_gint/gint_dvlocal.h | 6 +- .../module_gint/temp_gint/gint_env_gamma.cpp | 2 +- .../module_gint/temp_gint/gint_env_k.cpp | 2 +- .../module_gint/temp_gint/gint_fvl.cpp | 4 +- .../module_gint/temp_gint/gint_fvl.h | 2 +- .../module_gint/temp_gint/gint_fvl_gpu.cpp | 10 +-- .../module_gint/temp_gint/gint_fvl_gpu.h | 2 +- .../module_gint/temp_gint/gint_fvl_meta.cpp | 10 +-- .../module_gint/temp_gint/gint_fvl_meta.h | 2 +- 
.../temp_gint/gint_fvl_meta_gpu.cpp | 16 ++-- .../module_gint/temp_gint/gint_fvl_meta_gpu.h | 2 +- .../module_gint/temp_gint/gint_info.cpp | 16 ++-- .../module_gint/temp_gint/gint_info.h | 2 +- .../module_gint/temp_gint/gint_rho.cpp | 4 +- .../module_gint/temp_gint/gint_rho.h | 2 +- .../module_gint/temp_gint/gint_rho_gpu.cpp | 10 +-- .../module_gint/temp_gint/gint_rho_gpu.h | 2 +- .../module_gint/temp_gint/gint_tau.cpp | 8 +- .../module_gint/temp_gint/gint_tau.h | 2 +- .../module_gint/temp_gint/gint_tau_gpu.cpp | 14 ++-- .../module_gint/temp_gint/gint_tau_gpu.h | 2 +- .../module_gint/temp_gint/gint_vl.cpp | 4 +- .../module_gint/temp_gint/gint_vl.h | 2 +- .../module_gint/temp_gint/gint_vl_gpu.cpp | 8 +- .../module_gint/temp_gint/gint_vl_gpu.h | 2 +- .../module_gint/temp_gint/gint_vl_metagga.cpp | 10 +-- .../module_gint/temp_gint/gint_vl_metagga.h | 2 +- .../temp_gint/gint_vl_metagga_gpu.cpp | 8 +- .../temp_gint/gint_vl_metagga_gpu.h | 2 +- .../temp_gint/gint_vl_metagga_nspin4.cpp | 10 +-- .../temp_gint/gint_vl_metagga_nspin4.h | 4 +- .../temp_gint/gint_vl_metagga_nspin4_gpu.cpp | 8 +- .../temp_gint/gint_vl_metagga_nspin4_gpu.h | 4 +- .../module_gint/temp_gint/gint_vl_nspin4.cpp | 4 +- .../module_gint/temp_gint/gint_vl_nspin4.h | 7 +- .../temp_gint/gint_vl_nspin4_gpu.cpp | 8 +- .../temp_gint/gint_vl_nspin4_gpu.h | 4 +- .../temp_gint/kernel/phi_operator_gpu.cu | 4 +- .../temp_gint/kernel/phi_operator_gpu.h | 2 +- .../module_hcontainer/hcontainer.cpp | 26 +++++- .../module_hcontainer/hcontainer.h | 6 +- 44 files changed, 192 insertions(+), 165 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp index 72bd647516..2e2faf2ab1 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp @@ -11,19 +11,19 @@ namespace ModuleGint { -void compose_hr_gint(std::shared_ptr> hr_gint) +void 
compose_hr_gint(HContainer& hr_gint) { - for (int iap = 0; iap < hr_gint->size_atom_pairs(); iap++) + for (int iap = 0; iap < hr_gint.size_atom_pairs(); iap++) { - auto& ap = hr_gint->get_atom_pair(iap); + auto& ap = hr_gint.get_atom_pair(iap); const int iat1 = ap.get_atom_i(); const int iat2 = ap.get_atom_j(); if (iat1 > iat2) { // fill lower triangle matrix with upper triangle matrix // the upper is - const hamilt::AtomPair* upper_ap = hr_gint->find_pair(iat2, iat1); - const hamilt::AtomPair* lower_ap = hr_gint->find_pair(iat1, iat2); + const hamilt::AtomPair* upper_ap = hr_gint.find_pair(iat2, iat1); + const hamilt::AtomPair* lower_ap = hr_gint.find_pair(iat1, iat2); #ifdef __DEBUG assert(upper_ap != nullptr); #endif @@ -44,20 +44,20 @@ void compose_hr_gint(std::shared_ptr> hr_gint) } } -void compose_hr_gint(std::vector>> hr_gint_part, - std::shared_ptr>> hr_gint_full) +void compose_hr_gint(const std::vector>& hr_gint_part, + HContainer>& hr_gint_full) { - for (int iap = 0; iap < hr_gint_full->size_atom_pairs(); iap++) + for (int iap = 0; iap < hr_gint_full.size_atom_pairs(); iap++) { - auto* ap = &hr_gint_full->get_atom_pair(iap); + auto* ap = &(hr_gint_full.get_atom_pair(iap)); const int iat1 = ap->get_atom_i(); const int iat2 = ap->get_atom_j(); if (iat1 <= iat2) { hamilt::AtomPair>* upper_ap = ap; - hamilt::AtomPair>* lower_ap = hr_gint_full->find_pair(iat2, iat1); - const hamilt::AtomPair* ap_nspin_0 = hr_gint_part[0]->find_pair(iat1, iat2); - const hamilt::AtomPair* ap_nspin_3 = hr_gint_part[3]->find_pair(iat1, iat2); + hamilt::AtomPair>* lower_ap = hr_gint_full.find_pair(iat2, iat1); + const hamilt::AtomPair* ap_nspin_0 = hr_gint_part[0].find_pair(iat1, iat2); + const hamilt::AtomPair* ap_nspin_3 = hr_gint_part[3].find_pair(iat1, iat2); for (int ir = 0; ir < upper_ap->get_R_size(); ir++) { const auto R_index = upper_ap->get_R_index(ir); @@ -77,8 +77,8 @@ void compose_hr_gint(std::vector>> hr_gint_pa if (PARAM.globalv.domag) { - const hamilt::AtomPair* 
ap_nspin_1 = hr_gint_part[1]->find_pair(iat1, iat2); - const hamilt::AtomPair* ap_nspin_2 = hr_gint_part[2]->find_pair(iat1, iat2); + const hamilt::AtomPair* ap_nspin_1 = hr_gint_part[1].find_pair(iat1, iat2); + const hamilt::AtomPair* ap_nspin_2 = hr_gint_part[2].find_pair(iat1, iat2); const auto mat_nspin_1 = ap_nspin_1->find_matrix(R_index); const auto mat_nspin_2 = ap_nspin_2->find_matrix(R_index); for (int irow = 0; irow < mat_nspin_1->get_row_size(); ++irow) @@ -109,21 +109,21 @@ void compose_hr_gint(std::vector>> hr_gint_pa } template -void transfer_hr_gint_to_hR(std::shared_ptr> hr_gint, HContainer* hR) +void transfer_hr_gint_to_hR(const HContainer& hr_gint, HContainer& hR) { #ifdef __MPI int size = 0; MPI_Comm_size(MPI_COMM_WORLD, &size); if (size == 1) { - hR->add(*hr_gint); + hR.add(hr_gint); } else { - hamilt::transferSerials2Parallels(*hr_gint, hR); + hamilt::transferSerials2Parallels(hr_gint, &hR); } #else - hR->add(*hr_gint); + hR.add(hr_gint); #endif } @@ -131,9 +131,9 @@ void transfer_hr_gint_to_hR(std::shared_ptr> hr_gint, HConta // In the future, we might try to remove the gint_info parameter template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector*> dm, - std::vector>> dm_gint) + std::vector>& dm_gint) { // To check whether input parameter dm_2d has been initialized assert(PARAM.inp.nspin == dm_gint.size() @@ -151,25 +151,25 @@ void transfer_dm_2d_to_gint( for (int is = 0; is < PARAM.inp.nspin; is++) { #ifdef __MPI - hamilt::transferParallels2Serials(*dm[is], dm_gint[is].get()); + hamilt::transferParallels2Serials(*dm[is], &dm_gint[is]); #else - dm_gint[is]->set_zero(); - dm_gint[is]->add(*dm[is]); + dm_gint[is].set_zero(); + dm_gint[is].add(*dm[is]); #endif } } else // NSPIN=4 case { #ifdef __MPI const int npol = 2; - std::shared_ptr> dm_full = gint_info->get_hr(npol); - hamilt::transferParallels2Serials(*dm[0], dm_full.get()); + HContainer dm_full = gint_info.get_hr(npol); + 
hamilt::transferParallels2Serials(*dm[0], &dm_full); #else - HContainer* dm_full = dm[0]; + HContainer& dm_full = dm[0]; #endif std::vector tmp_pointer(4, nullptr); - for (int iap = 0; iap < dm_full->size_atom_pairs(); iap++) + for (int iap = 0; iap < dm_full.size_atom_pairs(); iap++) { - auto& ap = dm_full->get_atom_pair(iap); + auto& ap = dm_full.get_atom_pair(iap); const int iat1 = ap.get_atom_i(); const int iat2 = ap.get_atom_j(); for (int ir = 0; ir < ap.get_R_size(); ir++) @@ -178,7 +178,7 @@ void transfer_dm_2d_to_gint( for (int is = 0; is < 4; is++) { tmp_pointer[is] = - dm_gint[is]->find_matrix(iat1, iat2, r_index)->get_pointer(); + dm_gint[is].find_matrix(iat1, iat2, r_index)->get_pointer(); } T* data_full = ap.get_pointer(ir); for (int irow = 0; irow < ap.get_row_size(); irow += 2) @@ -218,7 +218,7 @@ template void wfc_2d_to_gint(const T* wfc_2d, const Parallel_Orbitals& pv, T* wfc_gint, - std::shared_ptr gint_info) + const GintInfo& gint_info) { ModuleBase::TITLE("Module_gint", "wfc_2d_to_gint"); ModuleBase::timer::tick("Module_gint", "wfc_2d_to_gint"); @@ -228,7 +228,7 @@ void wfc_2d_to_gint(const T* wfc_2d, const int nbands = pv.desc_wfc[3]; #ifdef __MPI - const std::vector& trace_lo = gint_info->get_trace_lo(); + const std::vector& trace_lo = gint_info.get_trace_lo(); // MPI and memory related const int mem_stride = 1; @@ -307,27 +307,27 @@ void wfc_2d_to_gint(const T* wfc_2d, } template void transfer_hr_gint_to_hR( - std::shared_ptr> hr_gint, - HContainer* hR); + const HContainer& hr_gint, + HContainer& hR); template void transfer_hr_gint_to_hR( - std::shared_ptr>> hr_gint, - HContainer>* hR); + const HContainer>& hr_gint, + HContainer>& hR); template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector*> dm, - std::vector>> dm_gint); + std::vector>& dm_gint); template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector>*> dm, - std::vector>>> dm_gint); + 
std::vector>>& dm_gint); template void wfc_2d_to_gint( const double* wfc_2d, const Parallel_Orbitals& pv, double* wfc_grid, - std::shared_ptr gint_info); + const GintInfo& gint_info); template void wfc_2d_to_gint( const std::complex* wfc_2d, const Parallel_Orbitals& pv, std::complex* wfc_grid, - std::shared_ptr gint_info); + const GintInfo& gint_info); } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h index 66b940a610..15258e92ad 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h @@ -5,20 +5,20 @@ namespace ModuleGint { // fill the lower triangle matrix with the upper triangle matrix - void compose_hr_gint(std::shared_ptr> hr_gint); + void compose_hr_gint(HContainer& hr_gint); // for nspin=4 case - void compose_hr_gint(std::vector>> hr_gint_part, - std::shared_ptr>> hr_gint_full); + void compose_hr_gint(const std::vector>& hr_gint_part, + HContainer>& hr_gint_full); template - void transfer_hr_gint_to_hR(std::shared_ptr> hr_gint, HContainer* hR); + void transfer_hr_gint_to_hR(const HContainer& hr_gint, HContainer& hR); template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector*> dm, - std::vector>> dm_gint); + std::vector>& dm_gint); template - void wfc_2d_to_gint(const T* wfc_2d, const Parallel_Orbitals& pv, T* wfc_grid, std::shared_ptr gint_info); + void wfc_2d_to_gint(const T* wfc_2d, const Parallel_Orbitals& pv, T* wfc_grid, const GintInfo& gint_info); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp index 1254d59e21..092735c10d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp @@ -45,9 +45,9 @@ void 
Gint_dvlocal::cal_hr_gint_() dphi_z.resize(phi_len); phi_op.set_phi_dphi(phi.data(), dphi_x.data(), dphi_y.data(), dphi_z.data()); phi_op.phi_mul_vldr3(vr_eff_, dr3_, phi.data(), phi_vldr3.data()); - phi_op.phi_mul_phi(phi_vldr3.data(), dphi_x.data(), *pvdpRx, PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(phi_vldr3.data(), dphi_y.data(), *pvdpRy, PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(phi_vldr3.data(), dphi_z.data(), *pvdpRz, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_x.data(), pvdpRx, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_y.data(), pvdpRy, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_z.data(), pvdpRz, PhiOperator::Triangular_Matrix::Upper); } } } @@ -69,9 +69,9 @@ void Gint_dvlocal::cal_dvlocal_R_sparseMatrix( double temp_value_double; Vec3d tau1, dtau; - for (int iap = 0; iap < pvdpRx->size_atom_pairs(); iap++) + for (int iap = 0; iap < pvdpRx.size_atom_pairs(); iap++) { - const auto& ap = pvdpRx->get_atom_pair(iap); + const auto& ap = pvdpRx.get_atom_pair(iap); const int iat1 = ap.get_atom_i(); const int iat2 = ap.get_atom_j(); const int it1 = ucell.iat2it[iat1]; @@ -85,9 +85,9 @@ void Gint_dvlocal::cal_dvlocal_R_sparseMatrix( { const ModuleBase::Vector3 R = ap.get_R_index(ir); Abfs::Vector3_Order dR(R.x, R.y, R.z); - double* p_pvdpRx = pvdpRx->get_atom_pair(iap).get_pointer(ir); - double* p_pvdpRy = pvdpRy->get_atom_pair(iap).get_pointer(ir); - double* p_pvdpRz = pvdpRz->get_atom_pair(iap).get_pointer(ir); + double* p_pvdpRx = pvdpRx.get_atom_pair(iap).get_pointer(ir); + double* p_pvdpRy = pvdpRy.get_atom_pair(iap).get_pointer(ir); + double* p_pvdpRz = pvdpRz.get_atom_pair(iap).get_pointer(ir); for (int iw = 0; iw < atom1->nw * npol_; iw++) { diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h index 
f61ee8de00..3160d11c1c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h @@ -57,9 +57,9 @@ class Gint_dvlocal : public Gint // intermediate variables double dr3_; - std::shared_ptr> pvdpRx; - std::shared_ptr> pvdpRy; - std::shared_ptr> pvdpRz; + HContainer pvdpRx; + HContainer pvdpRy; + HContainer pvdpRz; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp index 29e97b64bb..5a251b6d08 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp @@ -13,7 +13,7 @@ Gint_env_gamma::Gint_env_gamma( :rho_(rho) { wfc_gint_.resize(nbands * gint_info_->get_lgd()); - wfc_2d_to_gint(psid, *pv, wfc_gint_.data(), gint_info_); + wfc_2d_to_gint(psid, *pv, wfc_gint_.data(), *gint_info_); } void Gint_env_gamma::cal_env_band(const int iband) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp index 9590089e53..941b1af9c8 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp @@ -18,7 +18,7 @@ Gint_env_k::Gint_env_k( :kvec_c_(kvec_c), kvec_d_(kvec_d), ik_(ik), nspin_(nspin), npol_(npol), rho_(rho) { wfc_gint_.resize(nbands * gint_info_->get_lgd()); - wfc_2d_to_gint(psid, *pv, wfc_gint_.data(), gint_info_); + wfc_2d_to_gint(psid, *pv, wfc_gint_.data(), *gint_info_); } void Gint_env_k::cal_env_band(const int iband) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp index 01fd6de0ab..227b7906ac 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp @@ -9,7 +9,7 @@ namespace ModuleGint void Gint_fvl::cal_gint() { init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_fvl_svl_(); } @@ -64,7 +64,7 @@ void Gint_fvl::cal_fvl_svl_() for (int is = 0; is < nspin_; is++) { phi_op.phi_mul_vldr3(vr_eff_[is], dr3_, phi.data(), phi_vldr3.data()); - phi_op.phi_mul_dm(phi_vldr3.data(), *dm_gint_vec_[is], false, phi_vldr3_dm.data()); + phi_op.phi_mul_dm(phi_vldr3.data(), dm_gint_vec_[is], false, phi_vldr3_dm.data()); if(isforce_) { phi_op.phi_dot_dphi(phi_vldr3_dm.data(), dphi_x.data(), dphi_y.data(), dphi_z.data(), fvl_thread); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h index d1e224d4d5..031a6c2cc3 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h @@ -44,7 +44,7 @@ class Gint_fvl : public Gint ModuleBase::matrix* svl_; // intermediate variables - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; double dr3_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp index 32489424a7..305b677a3c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp @@ -10,7 +10,7 @@ namespace ModuleGint void Gint_fvl_gpu::cal_gint() { init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); transfer_cpu_to_gpu_(); cal_fvl_svl_(); transfer_gpu_to_cpu_(); @@ -31,9 +31,9 @@ void Gint_fvl_gpu::transfer_cpu_to_gpu_() vr_eff_d_vec_.resize(nspin_); for (int is = 0; is < nspin_; is++) { - dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is]->get_nnr(), 0, 
false); - checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is]->get_wrapper(), - dm_gint_vec_[is]->get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); vr_eff_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); checkCuda(cudaMemcpy(vr_eff_d_vec_[is].get_device_ptr(), vr_eff_[is], gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); @@ -108,7 +108,7 @@ void Gint_fvl_gpu::cal_fvl_svl_() phi_op.phi_mul_vldr3(vr_eff_d_vec_[is].get_device_ptr(), dr3_, phi.get_device_ptr(), phi_vldr3.get_device_ptr()); phi_op.phi_mul_dm(phi_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), - *dm_gint_vec_[is], is_symm, phi_vldr3_dm.get_device_ptr()); + dm_gint_vec_[is], is_symm, phi_vldr3_dm.get_device_ptr()); if (isforce_) { phi_op.phi_dot_dphi(phi_vldr3_dm.get_device_ptr(), diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h index 92b2c445d8..29a1b7704e 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h @@ -47,7 +47,7 @@ class Gint_fvl_gpu : public Gint ModuleBase::matrix* svl_; // intermediate variables - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; double dr3_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp index 15ca44b041..5a718fac9b 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp @@ -9,7 +9,7 @@ namespace ModuleGint void Gint_fvl_meta::cal_gint() { init_dm_gint_(); - 
transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_fvl_svl_(); } @@ -93,10 +93,10 @@ void Gint_fvl_meta::cal_fvl_svl_() phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_x.data(), dphi_x_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_y.data(), dphi_y_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_z.data(), dphi_z_vldr3.data()); - phi_op.phi_mul_dm(phi_vldr3.data(), *dm_gint_vec_[is], false, phi_vldr3_dm.data()); - phi_op.phi_mul_dm(dphi_x_vldr3.data(), *dm_gint_vec_[is], false, dphi_x_vldr3_dm.data()); - phi_op.phi_mul_dm(dphi_y_vldr3.data(), *dm_gint_vec_[is], false, dphi_y_vldr3_dm.data()); - phi_op.phi_mul_dm(dphi_z_vldr3.data(), *dm_gint_vec_[is], false, dphi_z_vldr3_dm.data()); + phi_op.phi_mul_dm(phi_vldr3.data(), dm_gint_vec_[is], false, phi_vldr3_dm.data()); + phi_op.phi_mul_dm(dphi_x_vldr3.data(), dm_gint_vec_[is], false, dphi_x_vldr3_dm.data()); + phi_op.phi_mul_dm(dphi_y_vldr3.data(), dm_gint_vec_[is], false, dphi_y_vldr3_dm.data()); + phi_op.phi_mul_dm(dphi_z_vldr3.data(), dm_gint_vec_[is], false, dphi_z_vldr3_dm.data()); if(isforce_) { phi_op.phi_dot_dphi(phi_vldr3_dm.data(), dphi_x.data(), dphi_y.data(), dphi_z.data(), fvl_thread); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h index 0062f3d923..974b77b5e8 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h @@ -45,7 +45,7 @@ class Gint_fvl_meta : public Gint ModuleBase::matrix* svl_; // intermediate variables - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; double dr3_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp index 293390fbbc..a129ea4872 100644 --- 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp @@ -10,7 +10,7 @@ namespace ModuleGint void Gint_fvl_meta_gpu::cal_gint() { init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); transfer_cpu_to_gpu_(); cal_fvl_svl_(); transfer_gpu_to_cpu_(); @@ -32,9 +32,9 @@ void Gint_fvl_meta_gpu::transfer_cpu_to_gpu_() vofk_d_vec_.resize(nspin_); for (int is = 0; is < nspin_; is++) { - dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is]->get_nnr(), 0, false); - checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is]->get_wrapper(), - dm_gint_vec_[is]->get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); vr_eff_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); checkCuda(cudaMemcpy(vr_eff_d_vec_[is].get_device_ptr(), vr_eff_[is], gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); @@ -132,13 +132,13 @@ void Gint_fvl_meta_gpu::cal_fvl_svl_() phi_op.phi_mul_vldr3(vofk_d_vec_[is].get_device_ptr(), dr3_, dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); phi_op.phi_mul_dm(phi_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), - *dm_gint_vec_[is], is_symm, phi_vldr3_dm.get_device_ptr()); + dm_gint_vec_[is], is_symm, phi_vldr3_dm.get_device_ptr()); phi_op.phi_mul_dm(dphi_x_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), - *dm_gint_vec_[is], is_symm, dphi_x_vldr3_dm.get_device_ptr()); + dm_gint_vec_[is], is_symm, dphi_x_vldr3_dm.get_device_ptr()); phi_op.phi_mul_dm(dphi_y_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), - *dm_gint_vec_[is], is_symm, 
dphi_y_vldr3_dm.get_device_ptr()); + dm_gint_vec_[is], is_symm, dphi_y_vldr3_dm.get_device_ptr()); phi_op.phi_mul_dm(dphi_z_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), - *dm_gint_vec_[is], is_symm, dphi_z_vldr3_dm.get_device_ptr()); + dm_gint_vec_[is], is_symm, dphi_z_vldr3_dm.get_device_ptr()); if (isforce_) { phi_op.phi_dot_dphi(phi_vldr3_dm.get_device_ptr(), diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h index 6acf87a20a..9e5fd08bc1 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h @@ -50,7 +50,7 @@ class Gint_fvl_meta_gpu : public Gint ModuleBase::matrix* svl_; // intermediate variables - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; double dr3_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp index 8511de750d..0f46930048 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp @@ -63,16 +63,16 @@ GintInfo::GintInfo( } template -std::shared_ptr> GintInfo::get_hr(int npol) const +HContainer GintInfo::get_hr(int npol) const { - auto p_hr = std::make_shared>(ucell_->nat); + auto hr = HContainer(ucell_->nat); if(PARAM.inp.gamma_only) { - p_hr->fix_gamma(); + hr.fix_gamma(); } - p_hr->insert_ijrs(&ijr_info_, *ucell_, npol); - p_hr->allocate(nullptr, true); - return p_hr; + hr.insert_ijrs(&ijr_info_, *ucell_, npol); + hr.allocate(nullptr, true); + return std::move(hr); } void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital* Phi) @@ -277,6 +277,6 @@ void GintInfo::init_bgrid_batches_(int batch_size) } #endif -template std::shared_ptr> GintInfo::get_hr(int npol) const; -template std::shared_ptr>> GintInfo::get_hr>(int npol) const; 
+template HContainer GintInfo::get_hr(int npol) const; +template HContainer> GintInfo::get_hr>(int npol) const; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h index ba2313657a..5071c0f1a0 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h @@ -45,7 +45,7 @@ class GintInfo // functions about hcontainer //========================================= template - std::shared_ptr> get_hr(int npol = 1) const; + HContainer get_hr(int npol = 1) const; private: // initialize the atoms diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp index 2924487c7e..c811908a3d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp @@ -9,7 +9,7 @@ namespace ModuleGint void Gint_rho::cal_gint() { init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_rho_(); } @@ -43,7 +43,7 @@ void Gint_rho::cal_rho_() phi_op.set_phi(phi.data()); for (int is = 0; is < nspin_; is++) { - phi_op.phi_mul_dm(phi.data(), *dm_gint_vec_[is], true, phi_dm.data()); + phi_op.phi_mul_dm(phi.data(), dm_gint_vec_[is], true, phi_dm.data()); phi_op.phi_dot_phi(phi.data(), phi_dm.data(), rho_[is]); } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h index 1af05e72cd..d46ed15e37 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h @@ -35,7 +35,7 @@ class Gint_rho : public Gint //======================== // Intermediate variables //======================== - std::vector>> dm_gint_vec_; + std::vector> 
dm_gint_vec_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp index a5ef6a335a..c007df9baa 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp @@ -10,7 +10,7 @@ namespace ModuleGint void Gint_rho_gpu::cal_gint() { init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); transfer_cpu_to_gpu_(); cal_rho_(); transfer_gpu_to_cpu_(); @@ -31,10 +31,10 @@ void Gint_rho_gpu::transfer_cpu_to_gpu_() rho_d_vec_.resize(nspin_); for (int is = 0; is < nspin_; is++) { - dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is]->get_nnr(), 0, false); + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); rho_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); - checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is]->get_wrapper(), - dm_gint_vec_[is]->get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); } } @@ -71,7 +71,7 @@ void Gint_rho_gpu::cal_rho_() for(int is = 0; is < nspin_; is++) { const bool is_symm = true; - phi_op.phi_mul_dm(phi.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), *dm_gint_vec_[is], + phi_op.phi_mul_dm(phi.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], is_symm, phi_dm.get_device_ptr()); phi_op.phi_dot_phi(phi.get_device_ptr(), phi_dm.get_device_ptr(), rho_d_vec_[is].get_device_ptr()); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h index d411e46518..97071a3085 100644 --- 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h @@ -40,7 +40,7 @@ class Gint_rho_gpu: public Gint //======================== // Intermediate variables //======================== - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; std::vector> dm_gint_d_vec_; std::vector> rho_d_vec_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp index f5d0b70a0c..4edfe459ab 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp @@ -9,7 +9,7 @@ namespace ModuleGint void Gint_tau::cal_gint() { init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_tau_(); } @@ -51,9 +51,9 @@ void Gint_tau::cal_tau_() phi_op.set_phi_dphi(nullptr, dphi_x.data(), dphi_y.data(), dphi_z.data()); for (int is = 0; is < nspin_; is++) { - phi_op.phi_mul_dm(dphi_x.data(), *dm_gint_vec_[is], true, dphi_x_dm.data()); - phi_op.phi_mul_dm(dphi_y.data(), *dm_gint_vec_[is], true, dphi_y_dm.data()); - phi_op.phi_mul_dm(dphi_z.data(), *dm_gint_vec_[is], true, dphi_z_dm.data()); + phi_op.phi_mul_dm(dphi_x.data(), dm_gint_vec_[is], true, dphi_x_dm.data()); + phi_op.phi_mul_dm(dphi_y.data(), dm_gint_vec_[is], true, dphi_y_dm.data()); + phi_op.phi_mul_dm(dphi_z.data(), dm_gint_vec_[is], true, dphi_z_dm.data()); phi_op.phi_dot_phi(dphi_x.data(), dphi_x_dm.data(), kin_[is]); phi_op.phi_dot_phi(dphi_y.data(), dphi_y_dm.data(), kin_[is]); phi_op.phi_dot_phi(dphi_z.data(), dphi_z_dm.data(), kin_[is]); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h index 641cdb1bec..fae12f524b 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h @@ -34,7 +34,7 @@ class Gint_tau : public Gint //======================== // Intermediate variables //======================== - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; }; } // namespace ModuleGint diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp index 6ed4b19c05..7660552d1c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp @@ -10,7 +10,7 @@ namespace ModuleGint void Gint_tau_gpu::cal_gint() { init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); transfer_cpu_to_gpu_(); cal_tau_(); transfer_gpu_to_cpu_(); @@ -31,10 +31,10 @@ void Gint_tau_gpu::transfer_cpu_to_gpu_() kin_d_vec_.resize(nspin_); for (int is = 0; is < nspin_; is++) { - dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is]->get_nnr(), 0, false); + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); kin_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); - checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is]->get_wrapper(), - dm_gint_vec_[is]->get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); } } @@ -76,11 +76,11 @@ void Gint_tau_gpu::cal_tau_() for(int is = 0; is < nspin_; is++) { const bool is_symm = true; - phi_op.phi_mul_dm(dphi_x.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), *dm_gint_vec_[is], + phi_op.phi_mul_dm(dphi_x.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], is_symm, dphi_x_dm.get_device_ptr()); - phi_op.phi_mul_dm(dphi_y.get_device_ptr(), 
dm_gint_d_vec_[is].get_device_ptr(), *dm_gint_vec_[is], + phi_op.phi_mul_dm(dphi_y.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], is_symm, dphi_y_dm.get_device_ptr()); - phi_op.phi_mul_dm(dphi_z.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), *dm_gint_vec_[is], + phi_op.phi_mul_dm(dphi_z.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], is_symm, dphi_z_dm.get_device_ptr()); phi_op.phi_dot_phi(dphi_x.get_device_ptr(), dphi_x_dm.get_device_ptr(), kin_d_vec_[is].get_device_ptr()); phi_op.phi_dot_phi(dphi_y.get_device_ptr(), dphi_y_dm.get_device_ptr(), kin_d_vec_[is].get_device_ptr()); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h index 4b1245ebbc..f4bd1dc77f 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h @@ -40,7 +40,7 @@ class Gint_tau_gpu : public Gint //======================== // Intermediate variables //======================== - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; std::vector> dm_gint_d_vec_; std::vector> kin_d_vec_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp index 7b8199422e..983d647574 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp @@ -12,7 +12,7 @@ void Gint_vl::cal_gint() init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_), hR_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); } //======================== @@ -44,7 +44,7 @@ void Gint_vl::cal_hr_gint_() phi_vldr3.resize(phi_len); phi_op.set_phi(phi.data()); phi_op.phi_mul_vldr3(vr_eff_, dr3_, phi.data(), phi_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_, 
PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h index d18532add6..742fd8e625 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h @@ -38,7 +38,7 @@ class Gint_vl : public Gint //======================== double dr3_; - std::shared_ptr> hr_gint_; + HContainer hr_gint_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp index 8d953a50c0..60552b46fd 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp @@ -14,7 +14,7 @@ void Gint_vl_gpu::cal_gint() cal_hr_gint_(); transfer_gpu_to_cpu_(); compose_hr_gint(hr_gint_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_), hR_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); } void Gint_vl_gpu::init_hr_gint_() @@ -24,7 +24,7 @@ void Gint_vl_gpu::init_hr_gint_() void Gint_vl_gpu::transfer_cpu_to_gpu_() { - hr_gint_d_ = CudaMemWrapper(hr_gint_->get_nnr(), 0, false); + hr_gint_d_ = CudaMemWrapper(hr_gint_.get_nnr(), 0, false); vr_eff_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); checkCuda(cudaMemcpy(vr_eff_d_.get_device_ptr(), vr_eff_, gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); @@ -32,8 +32,8 @@ void Gint_vl_gpu::transfer_cpu_to_gpu_() void Gint_vl_gpu::transfer_gpu_to_cpu_() { - checkCuda(cudaMemcpy(hr_gint_->get_wrapper(), hr_gint_d_.get_device_ptr(), - hr_gint_->get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); + checkCuda(cudaMemcpy(hr_gint_.get_wrapper(), hr_gint_d_.get_device_ptr(), + hr_gint_.get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); } 
void Gint_vl_gpu::cal_hr_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h index 3b73f740b1..fd7cee190b 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h @@ -42,7 +42,7 @@ class Gint_vl_gpu : public Gint //======================== double dr3_; - std::shared_ptr> hr_gint_; + HContainer hr_gint_; CudaMemWrapper hr_gint_d_; CudaMemWrapper vr_eff_d_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp index b5b5222148..aa1abee188 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp @@ -12,7 +12,7 @@ void Gint_vl_metagga::cal_gint() init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_), hR_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); } //======================== @@ -59,10 +59,10 @@ void Gint_vl_metagga::cal_hr_gint_() phi_op.phi_mul_vldr3(vofk_, dr3_, dphi_x.data(), dphi_x_vldr3.data()); phi_op.phi_mul_vldr3(vofk_, dr3_, dphi_y.data(), dphi_y_vldr3.data()); phi_op.phi_mul_vldr3(vofk_, dr3_, dphi_z.data(), dphi_z_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_y.data(), dphi_y_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), hr_gint_, 
PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_y.data(), dphi_y_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h index 5aa8fd3441..0ddb5b828f 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h @@ -40,7 +40,7 @@ class Gint_vl_metagga : public Gint //======================== double dr3_; - std::shared_ptr> hr_gint_; + HContainer hr_gint_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp index 850fd20770..12bb8a4c8b 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp @@ -14,7 +14,7 @@ void Gint_vl_metagga_gpu::cal_gint() cal_hr_gint_(); transfer_gpu_to_cpu_(); compose_hr_gint(hr_gint_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_), hR_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); } //======================== @@ -28,7 +28,7 @@ void Gint_vl_metagga_gpu::init_hr_gint_() void Gint_vl_metagga_gpu::transfer_cpu_to_gpu_() { - hr_gint_d_ = CudaMemWrapper(hr_gint_->get_nnr(), 0, false); + hr_gint_d_ = CudaMemWrapper(hr_gint_.get_nnr(), 0, false); vr_eff_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); vofk_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); checkCuda(cudaMemcpy(vr_eff_d_.get_device_ptr(), vr_eff_, @@ -39,8 +39,8 @@ void Gint_vl_metagga_gpu::transfer_cpu_to_gpu_() void Gint_vl_metagga_gpu::transfer_gpu_to_cpu_() { - checkCuda(cudaMemcpy(hr_gint_->get_wrapper(), hr_gint_d_.get_device_ptr(), - hr_gint_->get_nnr() * 
sizeof(double), cudaMemcpyDeviceToHost)); + checkCuda(cudaMemcpy(hr_gint_.get_wrapper(), hr_gint_d_.get_device_ptr(), + hr_gint_.get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); } void Gint_vl_metagga_gpu::cal_hr_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h index e6406f4141..65486dcdfd 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h @@ -45,7 +45,7 @@ class Gint_vl_metagga_gpu : public Gint //======================== double dr3_; - std::shared_ptr> hr_gint_; + HContainer hr_gint_; CudaMemWrapper hr_gint_d_; CudaMemWrapper vr_eff_d_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp index 986b182c09..0e81bb35a0 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp @@ -14,7 +14,7 @@ void Gint_vl_metagga_nspin4::cal_gint() init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_full_), hR_); + transfer_hr_gint_to_hR(hr_gint_full_, *hR_); } void Gint_vl_metagga_nspin4::init_hr_gint_() @@ -65,10 +65,10 @@ void Gint_vl_metagga_nspin4::cal_hr_gint_() phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_x.data(), dphi_x_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_y.data(), dphi_y_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_z.data(), dphi_z_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_y.data(), 
dphi_y_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_y.data(), dphi_y_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h index 7b40546854..1505c39af0 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h @@ -38,8 +38,8 @@ class Gint_vl_metagga_nspin4 : public Gint const int nspin_ = 4; - std::vector>> hr_gint_part_; - std::shared_ptr>> hr_gint_full_; + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp index 1a43981e1e..2b5132531d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp @@ -14,7 +14,7 @@ void Gint_vl_metagga_nspin4_gpu::cal_gint() cal_hr_gint_(); transfer_gpu_to_cpu_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_full_), hR_); + transfer_hr_gint_to_hR(hr_gint_full_, *hR_); } void Gint_vl_metagga_nspin4_gpu::init_hr_gint_() @@ -35,7 +35,7 @@ void 
Gint_vl_metagga_nspin4_gpu::transfer_cpu_to_gpu_() hr_gint_part_d_.resize(nspin_); for(int i = 0; i < nspin_; i++) { - hr_gint_part_d_[i] = CudaMemWrapper(hr_gint_part_[i]->get_nnr(), 0, false); + hr_gint_part_d_[i] = CudaMemWrapper(hr_gint_part_[i].get_nnr(), 0, false); vr_eff_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); vofk_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); checkCuda(cudaMemcpy(vr_eff_d_[i].get_device_ptr(), vr_eff_[i], @@ -49,8 +49,8 @@ void Gint_vl_metagga_nspin4_gpu::transfer_gpu_to_cpu_() { for(int i = 0; i < nspin_; i++) { - checkCuda(cudaMemcpy(hr_gint_part_[i]->get_wrapper(), hr_gint_part_d_[i].get_device_ptr(), - hr_gint_part_[i]->get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); + checkCuda(cudaMemcpy(hr_gint_part_[i].get_wrapper(), hr_gint_part_d_[i].get_device_ptr(), + hr_gint_part_[i].get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h index 47d1e34425..1c2722decd 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h @@ -43,8 +43,8 @@ class Gint_vl_metagga_nspin4_gpu : public Gint const int nspin_ = 4; - std::vector>> hr_gint_part_; - std::shared_ptr>> hr_gint_full_; + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; std::vector> vr_eff_d_; std::vector> vofk_d_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp index db211570ca..3cbab74273 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp @@ -13,7 +13,7 @@ void Gint_vl_nspin4::cal_gint() init_hr_gint_(); cal_hr_gint_(); 
compose_hr_gint(hr_gint_part_, hr_gint_full_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_full_), hR_); + transfer_hr_gint_to_hR(hr_gint_full_, *hR_); } void Gint_vl_nspin4::init_hr_gint_() @@ -49,7 +49,7 @@ void Gint_vl_nspin4::cal_hr_gint_() for(int is = 0; is < nspin_; is++) { phi_op.phi_mul_vldr3(vr_eff_[is], dr3_, phi.data(), phi_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h index f5e23532bc..6371257823 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h @@ -15,7 +15,7 @@ class Gint_vl_nspin4 : public Gint Gint_vl_nspin4( std::vector vr_eff, HContainer>* hR) - : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){} void cal_gint(); @@ -40,9 +40,8 @@ class Gint_vl_nspin4 : public Gint const int nspin_ = 4; - std::vector>> hr_gint_part_; - std::shared_ptr>> hr_gint_full_; - + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; }; } // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp index e6ae47dfcf..fabce6649b 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp @@ -14,7 +14,7 @@ void Gint_vl_nspin4_gpu::cal_gint() cal_hr_gint_(); transfer_gpu_to_cpu_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_full_), hR_); + 
transfer_hr_gint_to_hR(hr_gint_full_, *hR_); } void Gint_vl_nspin4_gpu::init_hr_gint_() @@ -34,7 +34,7 @@ void Gint_vl_nspin4_gpu::transfer_cpu_to_gpu_() hr_gint_part_d_.resize(nspin_); for(int i = 0; i < nspin_; i++) { - hr_gint_part_d_[i] = CudaMemWrapper(hr_gint_part_[i]->get_nnr(), 0, false); + hr_gint_part_d_[i] = CudaMemWrapper(hr_gint_part_[i].get_nnr(), 0, false); vr_eff_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); checkCuda(cudaMemcpy(vr_eff_d_[i].get_device_ptr(), vr_eff_[i], gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); @@ -45,8 +45,8 @@ void Gint_vl_nspin4_gpu::transfer_gpu_to_cpu_() { for(int i = 0; i < nspin_; i++) { - checkCuda(cudaMemcpy(hr_gint_part_[i]->get_wrapper(), hr_gint_part_d_[i].get_device_ptr(), - hr_gint_part_[i]->get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); + checkCuda(cudaMemcpy(hr_gint_part_[i].get_wrapper(), hr_gint_part_d_[i].get_device_ptr(), + hr_gint_part_[i].get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h index bd9f059a1a..4a12b2bc69 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h @@ -45,8 +45,8 @@ class Gint_vl_nspin4_gpu : public Gint const int nspin_ = 4; - std::vector>> hr_gint_part_; - std::shared_ptr>> hr_gint_full_; + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; std::vector> vr_eff_d_; std::vector> hr_gint_part_d_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu index 00f1307a8f..e5c616a53a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu @@ 
-209,7 +209,7 @@ void PhiOperatorGpu::phi_mul_vldr3( void PhiOperatorGpu::phi_mul_phi_vldr3( const double* phi_d, const double* phi_vldr3_d, - std::shared_ptr> hRGint, + HContainer& hRGint, double* hr_d) const { // ap_num means number of atom pairs @@ -242,7 +242,7 @@ void PhiOperatorGpu::phi_mul_phi_vldr3( if(iat_1 > iat_2) { continue; } - int hr_offset = hRGint->find_matrix_offset(iat_1, iat_2, r_1 - r_2); + int hr_offset = hRGint.find_matrix_offset(iat_1, iat_2, r_1 - r_2); if (hr_offset == -1) { continue; } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h index 3997c5cc28..8f20a7a054 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h @@ -35,7 +35,7 @@ class PhiOperatorGpu void phi_mul_phi_vldr3( const double* phi_d, const double* phi_vldr3_d, - std::shared_ptr> hRGint, + HContainer& hRGint, double* hr_d) const; void phi_mul_dm( diff --git a/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp b/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp index 54250a2ce2..89b71306a1 100644 --- a/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp +++ b/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp @@ -16,6 +16,9 @@ HContainer::~HContainer() } } +template +HContainer::HContainer() {} + // copy constructor template HContainer::HContainer(const HContainer& HR_in, T* data_array) @@ -35,17 +38,38 @@ HContainer::HContainer(const HContainer& HR_in, T* data_array) // move constructor template -HContainer::HContainer(HContainer&& HR_in) +HContainer::HContainer(HContainer&& HR_in) noexcept { this->atom_pairs = std::move(HR_in.atom_pairs); this->sparse_ap = std::move(HR_in.sparse_ap); this->sparse_ap_index = std::move(HR_in.sparse_ap_index); + this->wrapper_pointer = HR_in.wrapper_pointer; this->gamma_only = 
HR_in.gamma_only; this->paraV = HR_in.paraV; this->current_R = -1; + HR_in.wrapper_pointer = nullptr; // tmp terms not moved } +// move assignment +template +HContainer& HContainer::operator=(HContainer&& HR_in) noexcept +{ + if (this != &HR_in) + { + this->atom_pairs = std::move(HR_in.atom_pairs); + this->sparse_ap = std::move(HR_in.sparse_ap); + this->sparse_ap_index = std::move(HR_in.sparse_ap_index); + this->wrapper_pointer = HR_in.wrapper_pointer; + this->gamma_only = HR_in.gamma_only; + this->paraV = HR_in.paraV; + this->current_R = -1; + + HR_in.wrapper_pointer = nullptr; + } + return *this; +} + // simple constructor template HContainer::HContainer(int natom) diff --git a/source/module_hamilt_lcao/module_hcontainer/hcontainer.h b/source/module_hamilt_lcao/module_hcontainer/hcontainer.h index cf50e7c263..edaca9577e 100644 --- a/source/module_hamilt_lcao/module_hcontainer/hcontainer.h +++ b/source/module_hamilt_lcao/module_hcontainer/hcontainer.h @@ -146,6 +146,8 @@ class HContainer // Destructor of class HContainer ~HContainer(); + HContainer(); + /** * @brief copy constructor * when data_array is not nullptr, new HContainer will be wrapper for data_array @@ -154,7 +156,9 @@ class HContainer HContainer(const HContainer& HR_in, T* data_array = nullptr); // move constructor - HContainer(HContainer&& HR_in); + HContainer(HContainer&& HR_in) noexcept; + // move assignment + HContainer& operator=(HContainer&& HR_in) noexcept; // simple constructor HContainer(int natom); From d0d98b0f35de024d23554560bb15113ba6e7733a Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 13 Jun 2025 10:15:49 +0800 Subject: [PATCH 29/63] rename phi_mul_phi_vldr3 --- .../module_gint/temp_gint/gint_vl_gpu.cpp | 2 +- .../temp_gint/gint_vl_metagga_gpu.cpp | 16 ++++++++-------- .../temp_gint/gint_vl_metagga_nspin4_gpu.cpp | 16 ++++++++-------- .../module_gint/temp_gint/gint_vl_nspin4_gpu.cpp | 4 ++-- .../temp_gint/kernel/phi_operator_gpu.cu | 2 +- .../temp_gint/kernel/phi_operator_gpu.h | 2 
+- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp index 60552b46fd..e3a22946e7 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp @@ -59,7 +59,7 @@ void Gint_vl_gpu::cal_hr_gint_() phi_op.set_phi(phi.get_device_ptr()); phi_op.phi_mul_vldr3(vr_eff_d_.get_device_ptr(), dr3_, phi.get_device_ptr(), phi_vldr3.get_device_ptr()); - phi_op.phi_mul_phi_vldr3(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), hr_gint_, hr_gint_d_.get_device_ptr()); } checkCuda(cudaStreamSynchronize(stream)); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp index 12bb8a4c8b..50f6d52f39 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp @@ -79,14 +79,14 @@ void Gint_vl_metagga_gpu::cal_hr_gint_() dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr()); phi_op.phi_mul_vldr3(vofk_d_.get_device_ptr(), dr3_, dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); - phi_op.phi_mul_phi_vldr3(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), - hr_gint_, hr_gint_d_.get_device_ptr()); - phi_op.phi_mul_phi_vldr3(dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr(), - hr_gint_, hr_gint_d_.get_device_ptr()); - phi_op.phi_mul_phi_vldr3(dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr(), - hr_gint_, hr_gint_d_.get_device_ptr()); - phi_op.phi_mul_phi_vldr3(dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr(), - hr_gint_, hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + 
phi_op.phi_mul_phi(dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi(dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi(dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); } checkCuda(cudaStreamSynchronize(stream)); checkCuda(cudaStreamDestroy(stream)); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp index 2b5132531d..6ab6fb23cf 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp @@ -92,14 +92,14 @@ void Gint_vl_metagga_nspin4_gpu::cal_hr_gint_() dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr()); phi_op.phi_mul_vldr3(vofk_d_[is].get_device_ptr(), dr3_, dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); - phi_op.phi_mul_phi_vldr3(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), - hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); - phi_op.phi_mul_phi_vldr3(dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr(), - hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); - phi_op.phi_mul_phi_vldr3(dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr(), - hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); - phi_op.phi_mul_phi_vldr3(dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr(), - hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi(dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi(dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr(), + hr_gint_part_[is], 
hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi(dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); } } checkCuda(cudaStreamSynchronize(stream)); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp index fabce6649b..be4e3a7de9 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp @@ -76,8 +76,8 @@ void Gint_vl_nspin4_gpu::cal_hr_gint_() { phi_op.phi_mul_vldr3(vr_eff_d_[is].get_device_ptr(), dr3_, phi.get_device_ptr(), phi_vldr3.get_device_ptr()); - phi_op.phi_mul_phi_vldr3(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), - hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); } } checkCuda(cudaStreamSynchronize(stream)); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu index e5c616a53a..3eefeb09f5 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu @@ -206,7 +206,7 @@ void PhiOperatorGpu::phi_mul_vldr3( result_d); } -void PhiOperatorGpu::phi_mul_phi_vldr3( +void PhiOperatorGpu::phi_mul_phi( const double* phi_d, const double* phi_vldr3_d, HContainer& hRGint, diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h index 8f20a7a054..fd746e011e 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h @@ -32,7 
+32,7 @@ class PhiOperatorGpu const double* phi_d, double* result_d) const; - void phi_mul_phi_vldr3( + void phi_mul_phi( const double* phi_d, const double* phi_vldr3_d, HContainer& hRGint, From 479b0adb36c3309b180006cf21f141f3120ea8da Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 13 Jun 2025 10:41:56 +0800 Subject: [PATCH 30/63] small modification --- .../module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp | 4 ++-- .../module_gint/temp_gint/gint_fvl_meta_gpu.cpp | 4 ++-- .../module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp | 4 ++-- .../module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp | 4 ++-- .../module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp | 4 ++-- .../module_gint/temp_gint/gint_vl_metagga_gpu.cpp | 4 ++-- .../module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp | 4 ++-- .../module_gint/temp_gint/gint_vl_nspin4_gpu.cpp | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp index 305b677a3c..2cb20049cf 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp @@ -11,9 +11,7 @@ void Gint_fvl_gpu::cal_gint() { init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); - transfer_cpu_to_gpu_(); cal_fvl_svl_(); - transfer_gpu_to_cpu_(); } void Gint_fvl_gpu::init_dm_gint_() @@ -75,6 +73,7 @@ void Gint_fvl_gpu::transfer_gpu_to_cpu_() void Gint_fvl_gpu::cal_fvl_svl_() { + transfer_cpu_to_gpu_(); #pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because @@ -126,6 +125,7 @@ void Gint_fvl_gpu::cal_fvl_svl_() checkCuda(cudaStreamSynchronize(stream)); checkCuda(cudaStreamDestroy(stream)); } + transfer_gpu_to_cpu_(); } } \ No newline at end of file diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp index a129ea4872..a0756eb90d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp @@ -11,9 +11,7 @@ void Gint_fvl_meta_gpu::cal_gint() { init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); - transfer_cpu_to_gpu_(); cal_fvl_svl_(); - transfer_gpu_to_cpu_(); } void Gint_fvl_meta_gpu::init_dm_gint_() @@ -79,6 +77,7 @@ void Gint_fvl_meta_gpu::transfer_gpu_to_cpu_() void Gint_fvl_meta_gpu::cal_fvl_svl_() { + transfer_cpu_to_gpu_(); #pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because @@ -174,6 +173,7 @@ void Gint_fvl_meta_gpu::cal_fvl_svl_() checkCuda(cudaStreamSynchronize(stream)); checkCuda(cudaStreamDestroy(stream)); } + transfer_gpu_to_cpu_(); } } // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp index c007df9baa..ad6f08d195 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp @@ -11,9 +11,7 @@ void Gint_rho_gpu::cal_gint() { init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); - transfer_cpu_to_gpu_(); cal_rho_(); - transfer_gpu_to_cpu_(); } void Gint_rho_gpu::init_dm_gint_() @@ -49,6 +47,7 @@ void Gint_rho_gpu::transfer_gpu_to_cpu_() void Gint_rho_gpu::cal_rho_() { + transfer_cpu_to_gpu_(); #pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because @@ -79,6 +78,7 @@ void Gint_rho_gpu::cal_rho_() checkCuda(cudaStreamSynchronize(stream)); checkCuda(cudaStreamDestroy(stream)); } + transfer_gpu_to_cpu_(); } 
} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp index 7660552d1c..6d02e0cce4 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp @@ -11,9 +11,7 @@ void Gint_tau_gpu::cal_gint() { init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); - transfer_cpu_to_gpu_(); cal_tau_(); - transfer_gpu_to_cpu_(); } void Gint_tau_gpu::init_dm_gint_() @@ -49,6 +47,7 @@ void Gint_tau_gpu::transfer_gpu_to_cpu_() void Gint_tau_gpu::cal_tau_() { + transfer_cpu_to_gpu_(); #pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because @@ -90,6 +89,7 @@ void Gint_tau_gpu::cal_tau_() checkCuda(cudaStreamSynchronize(stream)); checkCuda(cudaStreamDestroy(stream)); } + transfer_gpu_to_cpu_(); } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp index e3a22946e7..5a38e93036 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp @@ -10,9 +10,7 @@ namespace ModuleGint void Gint_vl_gpu::cal_gint() { init_hr_gint_(); - transfer_cpu_to_gpu_(); cal_hr_gint_(); - transfer_gpu_to_cpu_(); compose_hr_gint(hr_gint_); transfer_hr_gint_to_hR(hr_gint_, *hR_); } @@ -38,6 +36,7 @@ void Gint_vl_gpu::transfer_gpu_to_cpu_() void Gint_vl_gpu::cal_hr_gint_() { + transfer_cpu_to_gpu_(); #pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because @@ -65,6 +64,7 @@ void Gint_vl_gpu::cal_hr_gint_() checkCuda(cudaStreamSynchronize(stream)); checkCuda(cudaStreamDestroy(stream)); } + transfer_gpu_to_cpu_(); } } \ No newline at end of file diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp index 50f6d52f39..bd54d42c6a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp @@ -10,9 +10,7 @@ namespace ModuleGint void Gint_vl_metagga_gpu::cal_gint() { init_hr_gint_(); - transfer_cpu_to_gpu_(); cal_hr_gint_(); - transfer_gpu_to_cpu_(); compose_hr_gint(hr_gint_); transfer_hr_gint_to_hR(hr_gint_, *hR_); } @@ -45,6 +43,7 @@ void Gint_vl_metagga_gpu::transfer_gpu_to_cpu_() void Gint_vl_metagga_gpu::cal_hr_gint_() { + transfer_cpu_to_gpu_(); #pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because @@ -91,5 +90,6 @@ void Gint_vl_metagga_gpu::cal_hr_gint_() checkCuda(cudaStreamSynchronize(stream)); checkCuda(cudaStreamDestroy(stream)); } + transfer_gpu_to_cpu_(); } } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp index 6ab6fb23cf..de6a85d9da 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp @@ -10,9 +10,7 @@ namespace ModuleGint void Gint_vl_metagga_nspin4_gpu::cal_gint() { init_hr_gint_(); - transfer_cpu_to_gpu_(); cal_hr_gint_(); - transfer_gpu_to_cpu_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); transfer_hr_gint_to_hR(hr_gint_full_, *hR_); } @@ -56,6 +54,7 @@ void Gint_vl_metagga_nspin4_gpu::transfer_gpu_to_cpu_() void Gint_vl_metagga_nspin4_gpu::cal_hr_gint_() { + transfer_cpu_to_gpu_(); #pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because @@ -105,6 +104,7 @@ void 
Gint_vl_metagga_nspin4_gpu::cal_hr_gint_() checkCuda(cudaStreamSynchronize(stream)); checkCuda(cudaStreamDestroy(stream)); } + transfer_gpu_to_cpu_(); } } // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp index be4e3a7de9..178892c2b8 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp @@ -10,9 +10,7 @@ namespace ModuleGint void Gint_vl_nspin4_gpu::cal_gint() { init_hr_gint_(); - transfer_cpu_to_gpu_(); cal_hr_gint_(); - transfer_gpu_to_cpu_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); transfer_hr_gint_to_hR(hr_gint_full_, *hR_); } @@ -53,6 +51,7 @@ void Gint_vl_nspin4_gpu::transfer_gpu_to_cpu_() void Gint_vl_nspin4_gpu::cal_hr_gint_() { + transfer_cpu_to_gpu_(); #pragma omp parallel num_threads(gint_info_->get_streams_num()) { // 20240620 Note that it must be set again here because @@ -83,6 +82,7 @@ void Gint_vl_nspin4_gpu::cal_hr_gint_() checkCuda(cudaStreamSynchronize(stream)); checkCuda(cudaStreamDestroy(stream)); } + transfer_gpu_to_cpu_(); } } // namespace ModuleGint \ No newline at end of file From d52f745e0d576076ead9e5059f461d217ee66b23 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 13 Jun 2025 11:00:43 +0800 Subject: [PATCH 31/63] remove some outdated comments --- .../module_gint/temp_gint/kernel/dgemm_vbatch.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu index e7e212cc2c..b35e0669b6 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu @@ -2,8 +2,6 @@ #include "gemm_nn_vbatch.cuh" #include "dgemm_vbatch.h" -// The template 
parameter settings for the function are based on the MAGMA source code settings. -// Specifically, they refer to the settings for the "nn" shape in dgemm_vbatched_core. void dgemm_nn_vbatch( int max_m, int max_n, int max_k, const int* m_d, const int* n_d, const int* k_d, @@ -13,7 +11,6 @@ void dgemm_nn_vbatch( int batchCount, cudaStream_t stream, const double* alpha) { - vbatched_gemm_nn_impl (max_m, max_n, m_d, n_d, k_d, A_array_d, lda_d, @@ -23,7 +20,6 @@ void dgemm_nn_vbatch( } -// the template parameters refer to the settings for the "nt" shape in dgemm_vbatched_core. void dgemm_tn_vbatch( int max_m, int max_n, int max_k, const int* m_d, const int* n_d, const int* k_d, From 5feaea5e32f382a3453ef8929c4213f546441a6f Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Fri, 13 Jun 2025 14:24:42 +0800 Subject: [PATCH 32/63] small modification --- .../module_gint/temp_gint/kernel/phi_operator_gpu.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu index 3eefeb09f5..f4bb2667fb 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu @@ -291,7 +291,6 @@ void PhiOperatorGpu::phi_mul_phi( ap_num, stream_, nullptr); - checkCudaLastError(); } void PhiOperatorGpu::phi_mul_dm( @@ -386,7 +385,6 @@ void PhiOperatorGpu::phi_mul_dm( ap_num, stream_, alpha_ptr); - checkCudaLastError(); } void PhiOperatorGpu::phi_dot_phi( From 8a38edec3d31ded5a45f36682f24557308f4d164 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 14 Jun 2025 11:21:02 +0800 Subject: [PATCH 33/63] small modification --- .../hamilt_lcaodft/operator_lcao/veff_lcao.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp 
b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp index 08bd703c3f..8c5850fdb1 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp @@ -187,19 +187,15 @@ void Veff, std::complex>>::contributeH if(XC_Functional::get_ked_flag()) { vofk_eff[is] = this->pot->get_effective_vofk(is); - if(is == 3) - { - ModuleGint::cal_gint_vl_metagga(vr_eff, vofk_eff, this->hR); - } - } - else - { - if(is == 3) - { - ModuleGint::cal_gint_vl(vr_eff, this->hR); - } } } + if(XC_Functional::get_ked_flag()) + { + ModuleGint::cal_gint_vl_metagga(vr_eff, vofk_eff, this->hR); + } else + { + ModuleGint::cal_gint_vl(vr_eff, this->hR); + } #endif ModuleBase::timer::tick("Veff", "contributeHR"); From 94b522c60daa4a30d9364313cdd45b614b4b85cd Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sun, 15 Jun 2025 22:58:08 +0800 Subject: [PATCH 34/63] renew gint interface in rdmft related code --- source/module_io/read_input_item_other.cpp | 6 ++++ source/module_rdmft/rdmft_tools.cpp | 41 ++++++++++++++++++++-- source/module_rdmft/update_state_rdmft.cpp | 10 +++++- 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/source/module_io/read_input_item_other.cpp b/source/module_io/read_input_item_other.cpp index 74b1e25221..5694c7b33b 100644 --- a/source/module_io/read_input_item_other.cpp +++ b/source/module_io/read_input_item_other.cpp @@ -501,6 +501,12 @@ void ReadInput::item_others() item.annotation = "whether to perform rdmft calculation, default is false"; read_sync_bool(input.rdmft); this->add_item(item); + item.check_value = [](const Input_Item& item, const Parameter& para) { + if (para.input.rdmft && para.input.nspin == 4) + { + ModuleBase::WARNING_QUIT("ReadInput", "rdmft is not available for nspin = 4"); + } + }; } { Input_Item item("rdmft_power_alpha"); diff --git a/source/module_rdmft/rdmft_tools.cpp b/source/module_rdmft/rdmft_tools.cpp index d9fa7cf2e0..460d4668bc 
100644 --- a/source/module_rdmft/rdmft_tools.cpp +++ b/source/module_rdmft/rdmft_tools.cpp @@ -12,6 +12,7 @@ #include "module_elecstate/module_pot/pot_local.h" #include "module_elecstate/module_pot/pot_xc.h" #include "module_hamilt_pw/hamilt_pwdft/structure_factor.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" #include #include @@ -240,8 +241,9 @@ void Veff_rdmft::initialize_HR(const UnitCell* ucell_in, const Grid_Driv // this part of the code is copying from class Veff and do some modifications. -template -void Veff_rdmft::contributeHR() +// nspin == 1 or 2 case +template<> +void Veff_rdmft, double>::contributeHR() { ModuleBase::TITLE("Veff", "contributeHR"); ModuleBase::timer::tick("Veff", "contributeHR"); @@ -261,8 +263,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_hartree(is, 0); // do grid integral calculation to get HR +#ifndef __NEW_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else if( potential_ == "local" ) @@ -276,8 +282,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_local(0, 0); // do grid integral calculation to get HR +#ifndef __NEW_GINT Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } else if( potential_ == "xc" ) { @@ -296,8 +306,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_XC(is, 0); // do grid integral calculation to get HR +#ifndef __NEW_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else @@ -307,7 +321,9 @@ void Veff_rdmft::contributeHR() // get HR for 2D-block parallel format // this->GK->transfer_pvpR(this->hR); +#ifndef __NEW_GINT this->GK->transfer_pvpR(this->hR,this->ucell,this->gd); +#endif if(this->nspin == 2) { 
@@ -318,6 +334,12 @@ void Veff_rdmft::contributeHR() return; } +template<> +void Veff_rdmft, std::complex>::contributeHR() +{ + // nspin = 4 case not implemented currently. +} + // this part of the code is copying from class Veff and do some modifications. // special case of gamma-only template<> @@ -343,8 +365,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_hartree(is, 0); // do grid integral calculation to get HR +#ifndef __NEW_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GG->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else if( potential_ == "local" ) @@ -358,12 +384,16 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_local(0, 0); // do grid integral calculation to get HR +#ifndef __NEW_GINT Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal); // because in gamma_only, cal_gint would not set hRGint zero first // so must use cal_vlocal(), and in rdmft_test.h, calculate V_hartree->contributeHR() first this->GG->cal_vlocal(&inout, false); // cal_gint ??? 
+#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } else if( potential_ == "xc" ) { @@ -381,8 +411,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_XC(is, 0); // do grid integral calculation to get HR +#ifndef __NEW_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GG->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else @@ -390,9 +424,10 @@ void Veff_rdmft::contributeHR() std::cout << "\n\n!!!!!!\n there may be something wrong when use class Veff_rdmft\n\n!!!!!!\n"; } +#ifndef __NEW_GINT // get HR for 2D-block parallel format this->GG->transfer_pvpR(this->hR,this->ucell); - +#endif this->new_e_iteration = false; if(this->nspin == 2) diff --git a/source/module_rdmft/update_state_rdmft.cpp b/source/module_rdmft/update_state_rdmft.cpp index 2a22b18864..5bca6cdd67 100644 --- a/source/module_rdmft/update_state_rdmft.cpp +++ b/source/module_rdmft/update_state_rdmft.cpp @@ -8,6 +8,7 @@ #include "module_elecstate/module_dm/cal_dm_psi.h" #include "module_elecstate/module_dm/density_matrix.h" #include "module_elecstate/module_charge/symmetry_rho.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" namespace rdmft @@ -105,10 +106,13 @@ void RDMFT::update_charge(UnitCell& ucell) { ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx); } - +#ifndef __NEW_GINT GG->transfer_DM2DtoGrid(DM_gamma_only.get_DMR_vector()); Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin); GG->cal_gint(&inout); +#else + ModuleGint::cal_gint_rho(DM_gamma_only.get_DMR_vector(), nspin, charge->rho); +#endif if (XC_Functional::get_ked_flag()) { @@ -136,9 +140,13 @@ void RDMFT::update_charge(UnitCell& ucell) ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx); } +#ifndef __NEW_GINT GK->transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin); GK->cal_gint(&inout); +#else + 
ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, charge->rho); +#endif if (XC_Functional::get_ked_flag()) { From cf73ad5558458bed488e6053cdc024be20d40b96 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sun, 15 Jun 2025 23:02:33 +0800 Subject: [PATCH 35/63] renew gint interface in get_pchg_lcao.cpp --- source/module_io/get_pchg_lcao.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/source/module_io/get_pchg_lcao.cpp b/source/module_io/get_pchg_lcao.cpp index 721f9ea199..25512cac1b 100644 --- a/source/module_io/get_pchg_lcao.cpp +++ b/source/module_io/get_pchg_lcao.cpp @@ -9,6 +9,7 @@ #include "module_elecstate/module_dm/cal_dm_psi.h" #include "module_elecstate/module_dm/density_matrix.h" #include "module_hamilt_lcao/module_gint/gint.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" #include "module_hamilt_pw/hamilt_pwdft/global.h" #include "module_io/cube_io.h" @@ -105,10 +106,14 @@ void IState_Charge::begin(Gint_Gamma& gg, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(); +#ifndef __NEW_GINT gg.initialize_pvpR(*ucell_in, GridD_in, PARAM.inp.nspin); gg.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); gg.cal_gint(&inout); +#else + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), PARAM.inp.nspin, rho); +#endif // A solution to replace the original implementation of the following code: // pelec->charge->save_rho_before_sum_band(); @@ -233,10 +238,15 @@ void IState_Charge::begin(Gint_k& gk, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(ik); +#ifndef __NEW_GINT gk.initialize_pvpR(*ucell_in, GridD_in, PARAM.inp.nspin); gk.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); gk.cal_gint(&inout); +#else + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), PARAM.inp.nspin, rho); +#endif + // Using std::vector to replace the original double** rho_save std::vector> rho_save(nspin, std::vector(rhopw_nrxx)); @@ 
-279,11 +289,14 @@ void IState_Charge::begin(Gint_k& gk, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(); +#ifndef __NEW_GINT gk.initialize_pvpR(*ucell_in, GridD_in, PARAM.inp.nspin); gk.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); gk.cal_gint(&inout); - +#else + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), PARAM.inp.nspin, rho); +#endif // Using std::vector to replace the original double** rho_save std::vector> rho_save(nspin, std::vector(rhopw_nrxx)); From 389c774b4e02e42c6eac6c6f892743d536a603e2 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 01:33:35 +0800 Subject: [PATCH 36/63] fully support new gint module --- .../hamilt_lcaodft/operator_lcao/veff_lcao.h | 5 ++- source/module_lr/esolver_lrtd_lcao.cpp | 21 +++++++++- source/module_lr/esolver_lrtd_lcao.h | 4 ++ source/module_lr/lr_spectrum.cpp | 27 +++++++----- .../operator_casida/operator_lr_hxc.cpp | 28 ++++++++++--- source/module_rdmft/rdmft_tools.h | 6 ++- source/source_esolver/lcao_before_scf.cpp | 41 +++++++++---------- source/source_esolver/lcao_others.cpp | 25 ++++++----- 8 files changed, 103 insertions(+), 54 deletions(-) diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h index ee9c03149a..c74f65f67c 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h +++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h @@ -50,7 +50,9 @@ class Veff> : public OperatorLCAO this->cal_type = calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); +#ifndef __NEW_GINT GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin); +#endif } /** * @brief Construct a new Veff object for Gamma-only calculation @@ -69,8 +71,9 @@ class Veff> : public OperatorLCAO { this->cal_type = calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); - +#ifndef __NEW_GINT GG_in->initialize_pvpR(*ucell_in, 
GridD_in, nspin); +#endif } ~Veff>(){}; diff --git a/source/module_lr/esolver_lrtd_lcao.cpp b/source/module_lr/esolver_lrtd_lcao.cpp index 1e09216303..75ccc669d0 100644 --- a/source/module_lr/esolver_lrtd_lcao.cpp +++ b/source/module_lr/esolver_lrtd_lcao.cpp @@ -397,6 +397,7 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu this->gint_->gridt = &this->gt_; // (3) Periodic condition search for each grid. +#ifndef __NEW_GINT double dr_uniform = 0.001; std::vector rcuts; std::vector> psi_u; @@ -451,7 +452,25 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu &ucell, &orb); this->gint_->initialize_pvpR(ucell, &this->gd, 1); // always use nspin=1 for transition density - +#else + auto gint_info = std::make_shared( + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbz, + this->pw_rho->nx, + this->pw_rho->ny, + this->pw_rho->nz, + 0, + 0, + this->pw_big->nbzp_start, + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbzp, + orb.Phi, + ucell, + this->gd); + ModuleGint::Gint::set_gint_info(gint_info); +#endif // if EXX from scratch, init 2-center integral and calculate Cs, Vs #ifdef __EXX if ((xc_kernel == "hf" || xc_kernel == "hse") && this->input.lr_solver != "spectrum") diff --git a/source/module_lr/esolver_lrtd_lcao.h b/source/module_lr/esolver_lrtd_lcao.h index dc2e7b2784..9f6375aaac 100644 --- a/source/module_lr/esolver_lrtd_lcao.h +++ b/source/module_lr/esolver_lrtd_lcao.h @@ -17,6 +17,7 @@ #include "module_elecstate/module_dm/density_matrix.h" #include "module_lr/potentials/pot_hxc_lrtd.h" #include "module_lr/hamilt_casida.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint.h" #ifdef __EXX // #include #include "module_ri/Exx_LRI.h" @@ -34,6 +35,9 @@ namespace LR ESolver_LR(const Input_para& inp, UnitCell& ucell); ~ESolver_LR() { delete this->psi_ks; +#ifdef __NEW_GINT + ModuleGint::Gint::set_gint_info(nullptr); +#endif } ///input: input, call, basis(LCAO), psi(ground state), elecstate diff 
--git a/source/module_lr/lr_spectrum.cpp b/source/module_lr/lr_spectrum.cpp index d2fb10ab02..94b9474f73 100644 --- a/source/module_lr/lr_spectrum.cpp +++ b/source/module_lr/lr_spectrum.cpp @@ -6,6 +6,7 @@ #include "module_lr/utils/lr_util.h" #include "module_lr/utils/lr_util_hcontainer.h" #include "module_lr/utils/lr_util_print.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" template elecstate::DensityMatrix LR::LR_Spectrum::cal_transition_density_matrix(const int istate, const T* X_in, const bool need_R) @@ -34,14 +35,6 @@ elecstate::DensityMatrix LR::LR_Spectrum::cal_transition_density_matrix return DM_trans; } -template -void LR::LR_Spectrum::cal_gint_rho(double** rho, const int& nrxx) -{ - ModuleBase::GlobalFunc::ZEROS(rho[0], nrxx); - Gint_inout inout_rho(rho, Gint_Tools::job_type::rho, 1, false); - this->gint->cal_gint(&inout_rho); -} - inline void check_sum_rule(const double& osc_tot) { if (std::abs(osc_tot - 1.0) > 1e-3) { @@ -59,12 +52,16 @@ ModuleBase::Vector3 LR::LR_Spectrum::cal_transition_dipole_istat const elecstate::DensityMatrix& DM_trans = this->cal_transition_density_matrix(istate); for (int is = 0;is < this->nspin_x;++is) { - this->gint->transfer_DM2DtoGrid({ DM_trans.get_DMR_vector().at(is) }); - // 2. transition density double** rho_trans; LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, this->rho_basis.nrxx); +#ifndef __NEW_GINT + this->gint->transfer_DM2DtoGrid({ DM_trans.get_DMR_vector().at(is) }); this->cal_gint_rho(rho_trans, this->rho_basis.nrxx); +#else + ModuleBase::GlobalFunc::ZEROS(rho_trans[0], this->rho_basis.nrxx); + ModuleGint::cal_gint_rho({ DM_trans.get_DMR_vector().at(is) }, 1, rho_trans); +#endif // 3. 
transition dipole moment for (int ir = 0; ir < rho_basis.nrxx; ++ir) @@ -108,14 +105,24 @@ ModuleBase::Vector3> LR::LR_Spectrum>: // real part LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'R'); +#ifndef __NEW_GINT this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); this->cal_gint_rho(rho_trans_real, this->rho_basis.nrxx); +#else + ModuleBase::GlobalFunc::ZEROS(rho_trans_real[0], this->rho_basis.nrxx); + ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_real); +#endif // LR_Util::print_grid_nonzero(rho_trans_real[0], this->rho_basis.nrxx, 10, "rho_trans"); // imag part LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'I'); +#ifndef __NEW_GINT this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); this->cal_gint_rho(rho_trans_imag, this->rho_basis.nrxx); +#else + ModuleBase::GlobalFunc::ZEROS(rho_trans_imag[0], this->rho_basis.nrxx); + ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_imag); +#endif // LR_Util::print_grid_nonzero(rho_trans_imag[0], this->rho_basis.nrxx, 10, "rho_trans"); // 3. 
transition dipole moment diff --git a/source/module_lr/operator_casida/operator_lr_hxc.cpp b/source/module_lr/operator_casida/operator_lr_hxc.cpp index afb929685b..71dd803210 100644 --- a/source/module_lr/operator_casida/operator_lr_hxc.cpp +++ b/source/module_lr/operator_casida/operator_lr_hxc.cpp @@ -10,6 +10,7 @@ #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h" #include "module_lr/ao_to_mo_transformer/ao_to_mo.h" #include "module_hamilt_pw/hamilt_pwdft/global.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" inline double conj(double a) { return a; } inline std::complex conj(std::complex a) { return std::conj(a); } @@ -55,7 +56,6 @@ namespace LR { ModuleBase::TITLE("OperatorLRHxc", "grid_calculation(real)"); ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation"); - this->gint->transfer_DM2DtoGrid(this->DM_trans->get_DMR_vector()); // 2d block to grid // 2. transition electron density // \f[ \tilde{\rho}(r)=\sum_{\mu_j, \mu_b}\tilde{\rho}_{\mu_j,\mu_b}\phi_{\mu_b}(r)\phi_{\mu_j}(r) \f] @@ -63,8 +63,13 @@ namespace LR const int& nrxx = this->pot.lock()->nrxx; LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // currently gint_kernel_rho uses PARAM.inp.nspin, it needs refactor ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx); +#ifndef __NEW_GINT + this->gint->transfer_DM2DtoGrid(this->DM_trans->get_DMR_vector()); // 2d block to grid Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); this->gint->cal_gint(&inout_rho); +#else + ModuleGint::cal_gint_rho(this->DM_trans->get_DMR_vector(), 1, rho_trans); +#endif // 3. v_hxc = f_hxc * rho_trans ModuleBase::matrix vr_hxc(1, nrxx); //grid @@ -72,11 +77,15 @@ namespace LR LR_Util::_deallocate_2order_nested_ptr(rho_trans, 1); // 4. 
V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r) + this->hR->set_zero(); // clear hR for each bands +#ifndef __NEW_GINT Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal); this->gint->get_hRGint()->set_zero(); this->gint->cal_gint(&inout_vlocal); - this->hR->set_zero(); // clear hR for each bands this->gint->transfer_pvpR(&*this->hR, &ucell); //grid to 2d block +#else + ModuleGint::cal_gint_vl(vr_hxc.c, &*this->hR); +#endif ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation"); } @@ -96,8 +105,6 @@ namespace LR LR_Util::get_DMR_real_imag_part(*this->DM_trans, DM_trans_real_imag, ucell.nat, type); // if (this->first_print)LR_Util::print_DMR(DM_trans_real_imag, ucell.nat, "DMR(2d, real)"); - this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); - // LR_Util::print_HR(*this->gint->get_DMRGint()[0], this->ucell.nat, "DMR(grid, real)"); // 2. transition electron density double** rho_trans; @@ -105,8 +112,14 @@ namespace LR LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // nspin=1 for transition density ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx); +#ifndef __NEW_GINT + this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); + // LR_Util::print_HR(*this->gint->get_DMRGint()[0], this->ucell.nat, "DMR(grid, real)"); Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); this->gint->cal_gint(&inout_rho); +#else + ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans); +#endif // print_grid_nonzero(rho_trans[0], nrxx, 10, "rho_trans"); // 3. v_hxc = f_hxc * rho_trans @@ -117,13 +130,16 @@ namespace LR LR_Util::_deallocate_2order_nested_ptr(rho_trans, 1); // 4. 
V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r) + HR_real_imag.set_zero(); +#ifndef __NEW_GINT Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal); this->gint->get_hRGint()->set_zero(); this->gint->cal_gint(&inout_vlocal); - // LR_Util::print_HR(*this->gint->get_hRGint(), this->ucell.nat, "VR(grid)"); - HR_real_imag.set_zero(); this->gint->transfer_pvpR(&HR_real_imag, &ucell, &this->gd); +#else + ModuleGint::cal_gint_vl(vr_hxc.c, &HR_real_imag); +#endif // LR_Util::print_HR(HR_real_imag, this->ucell.nat, "VR(real, 2d)"); LR_Util::set_HR_real_imag_part(HR_real_imag, *this->hR, ucell.nat, type); }; diff --git a/source/module_rdmft/rdmft_tools.h b/source/module_rdmft/rdmft_tools.h index bed7ab9b4d..5a631c34ba 100644 --- a/source/module_rdmft/rdmft_tools.h +++ b/source/module_rdmft/rdmft_tools.h @@ -284,8 +284,9 @@ class Veff_rdmft : public hamilt::OperatorLCAO this->cal_type = hamilt::calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); - +#ifndef __NEW_GINT GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin); +#endif } Veff_rdmft(Gint_Gamma* GG_in, hamilt::HS_Matrix_K* hsk_in, @@ -310,8 +311,9 @@ class Veff_rdmft : public hamilt::OperatorLCAO this->cal_type = hamilt::calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); - +#ifndef __NEW_GINT GG_in->initialize_pvpR(*ucell_in, GridD_in, nspin); +#endif } ~Veff_rdmft(){}; diff --git a/source/source_esolver/lcao_before_scf.cpp b/source/source_esolver/lcao_before_scf.cpp index 7a9ddc7a49..8e4884066c 100644 --- a/source/source_esolver/lcao_before_scf.cpp +++ b/source/source_esolver/lcao_before_scf.cpp @@ -69,6 +69,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) Gint_Tools::init_orb(dr_uniform, rcuts, ucell, orb_, psi_u, dpsi_u, d2psi_u); //! 
5) set periodic boundary conditions +#ifndef __NEW_GINT this->GridT.set_pbc_grid(this->pw_rho->nx, this->pw_rho->ny, this->pw_rho->nz, @@ -92,9 +93,17 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) dpsi_u, d2psi_u, PARAM.inp.nstream); + + psi_u.clear(); + psi_u.shrink_to_fit(); + dpsi_u.clear(); + dpsi_u.shrink_to_fit(); + d2psi_u.clear(); + d2psi_u.shrink_to_fit(); + LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); //! 6) prepare grid integral -#ifdef __NEW_GINT +#else auto gint_info = std::make_shared( this->pw_big->nbx, this->pw_big->nby, @@ -114,22 +123,12 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) ModuleGint::Gint::set_gint_info(gint_info); #endif - psi_u.clear(); - psi_u.shrink_to_fit(); - dpsi_u.clear(); - dpsi_u.shrink_to_fit(); - d2psi_u.clear(); - d2psi_u.shrink_to_fit(); - // 7) For each atom, calculate the adjacent atoms in different cells // and allocate the space for H(R) and S(R). // If k point is used here, allocate HlocR after atom_arrange. 
this->RA.for_2d(ucell, this->gd, this->pv, PARAM.globalv.gamma_only_local, orb_.cutoffs()); - // 8) after ions move, prepare grid in Gint - LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); - - // 9) initialize the Hamiltonian operators + // 8) initialize the Hamiltonian operators // if atom moves, then delete old pointer and add a new one if (this->p_hamilt != nullptr) { @@ -169,7 +168,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) #ifdef __MLALGO - // 10) for each ionic step, the overlap must be rebuilt + // 9) for each ionic step, the overlap must be rebuilt // since it depends on ionic positions if (PARAM.globalv.deepks_setorb) { @@ -198,7 +197,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) } #endif - // 11) prepare sc calculation + // 10) prepare sc calculation if (PARAM.inp.sc_mag_switch) { spinconstrain::SpinConstrain& sc = spinconstrain::SpinConstrain::getScInstance(); @@ -217,7 +216,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) this->pelec); } - // 12) set xc type before the first cal of xc in pelec->init_scf + // 11) set xc type before the first cal of xc in pelec->init_scf // Peize Lin add 2016-12-03 #ifdef __EXX if (PARAM.inp.calculation != "nscf") @@ -233,10 +232,10 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) } #endif - // 13) init_scf, should be before_scf? mohan add 2025-03-10 + // 12) init_scf, should be before_scf? mohan add 2025-03-10 this->pelec->init_scf(istep, ucell, this->Pgrid, this->sf.strucFac, this->locpp.numeric, ucell.symm); - // 14) initalize DMR + // 13) initalize DMR // DMR should be same size with Hamiltonian(R) dynamic_cast*>(this->pelec) ->get_DM() @@ -247,7 +246,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) this->ld.init_DMR(ucell, orb_, this->pv, this->gd); #endif - // 15) two cases are considered: + // 14) two cases are considered: // 1. 
DMK in DensityMatrix is not empty (istep > 0), then DMR is initialized by DMK // 2. DMK in DensityMatrix is empty (istep == 0), then DMR is initialized by zeros if (istep > 0) @@ -255,7 +254,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) dynamic_cast*>(this->pelec)->get_DM()->cal_DMR(); } - // 16) the electron charge density should be symmetrized, + // 15) the electron charge density should be symmetrized, // here is the initialization Symmetry_rho srho; for (int is = 0; is < PARAM.inp.nspin; is++) @@ -263,10 +262,10 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) srho.begin(is, this->chr, this->pw_rho, ucell.symm); } - // 17) why we need to set this sentence? mohan add 2025-03-10 + // 16) why we need to set this sentence? mohan add 2025-03-10 this->p_hamilt->non_first_scf = istep; - // 18) update of RDMFT, added by jghan + // 17) update of RDMFT, added by jghan if (PARAM.inp.rdmft == true) { // necessary operation of these parameters have be done with p_esolver->Init() in source/source_main/driver_run.cpp diff --git a/source/source_esolver/lcao_others.cpp b/source/source_esolver/lcao_others.cpp index beed83eaec..f4adc3617a 100644 --- a/source/source_esolver/lcao_others.cpp +++ b/source/source_esolver/lcao_others.cpp @@ -93,6 +93,7 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) PARAM.inp.test_atom_input); // (3) Periodic condition search for each grid. 
+#ifndef __NEW_GINT double dr_uniform = 0.001; std::vector rcuts; std::vector> psi_u; @@ -100,7 +101,6 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) std::vector> d2psi_u; Gint_Tools::init_orb(dr_uniform, rcuts, ucell, orb_, psi_u, dpsi_u, d2psi_u); - this->GridT.set_pbc_grid(this->pw_rho->nx, this->pw_rho->ny, this->pw_rho->nz, @@ -124,7 +124,16 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) dpsi_u, d2psi_u, PARAM.inp.nstream); - #ifdef __NEW_GINT + + psi_u.clear(); + psi_u.shrink_to_fit(); + dpsi_u.clear(); + dpsi_u.shrink_to_fit(); + d2psi_u.clear(); + d2psi_u.shrink_to_fit(); + // prepare grid in Gint + LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); +#else auto gint_info = std::make_shared( this->pw_big->nbx, this->pw_big->nby, @@ -142,14 +151,7 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) ucell, this->gd); ModuleGint::Gint::set_gint_info(gint_info); - #endif - - psi_u.clear(); - psi_u.shrink_to_fit(); - dpsi_u.clear(); - dpsi_u.shrink_to_fit(); - d2psi_u.clear(); - d2psi_u.shrink_to_fit(); +#endif // (2)For each atom, calculate the adjacent atoms in different cells // and allocate the space for H(R) and S(R). 
@@ -206,9 +208,6 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) } } - // prepare grid in Gint - LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); - // init Hamiltonian if (this->p_hamilt != nullptr) { From 0d3b292889afea75265b64558e20ed3e7ee36d27 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 02:24:59 +0800 Subject: [PATCH 37/63] reduce header dependency --- .../module_gint/kernels/cuda/cuda_tools.cu | 20 --------- .../module_gint/kernels/cuda/cuda_tools.cuh | 29 +++++++++++-- .../module_gint/temp_gint/gint_helper.h | 17 +++----- .../temp_gint/kernel/cuda_mem_wrapper.h | 2 +- .../temp_gint/kernel/gemm_nn_vbatch.cuh | 6 +-- .../temp_gint/kernel/gemm_tn_vbatch.cuh | 6 +-- .../temp_gint/kernel/gint_gpu_vars.h | 2 +- .../temp_gint/kernel/gint_helper.cuh | 41 ++++++++++++++++++- .../temp_gint/kernel/phi_operator_gpu.h | 2 +- .../temp_gint/kernel/set_const_mem.cu | 2 +- 10 files changed, 79 insertions(+), 48 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu index fd0e6039b0..6666c90f4b 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu @@ -4,26 +4,6 @@ #include "cuda_tools.cuh" -cudaError_t check(cudaError_t result, const char *const func, const char *const file, const int line) -{ - if (result != cudaSuccess) - { - fprintf(stderr, "CUDA Runtime Error at %s:%d code=%s \"%s\" \n", file, line, cudaGetErrorString(result), func); - exit(EXIT_FAILURE); - } - return result; -} -cudaError_t __checkCudaLastError(const char *file, const int line) -{ - cudaError_t result = cudaGetLastError(); - if (result != cudaSuccess) - { - fprintf(stderr, "%s(%i) : getLastCudaError():%s\n", file, line, cudaGetErrorString(result)); - assert(result == cudaSuccess); - } - return result; -} - void 
dump_cuda_array_to_file(const double* cuda_array, int width, int hight, diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh index 02f9a1d4ca..929191d12a 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh @@ -9,11 +9,32 @@ #include #include -#define checkCuda(val) check(val, #val, __FILE__, __LINE__) -#define checkCudaLastError() __checkCudaLastError(__FILE__, __LINE__) +#define checkCuda(val) check((val), #val, __FILE__, __LINE__) +#define checkCudaLastError() __getLastCudaError(__FILE__, __LINE__) -cudaError_t check(cudaError_t result, const char *const func, const char *const file, const int line); -cudaError_t __checkCudaLastError(const char *file, const int line); +inline void check(cudaError_t result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, + static_cast(result), func); + exit(EXIT_FAILURE); + } +} + +inline void __getLastCudaError(const char *file, + const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " (%d) %s.\n", + file, line, static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} static inline int ceildiv(int x, int y) { diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h index 8288028691..a017f81ba0 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h @@ -7,18 +7,11 @@ namespace ModuleGint { - -template -std::shared_ptr toConstSharedPtr(std::shared_ptr ptr) { - return std::static_pointer_cast(ptr); -} - - inline int index3Dto1D(const int id_x, const int 
id_y, const int id_z, const int dim_x, const int dim_y, const int dim_z) { return id_z + id_y * dim_z + id_x * dim_y * dim_z; -}; +} inline Vec3i index1Dto3D(const int index_1d, const int dim_x, const int dim_y, const int dim_z) @@ -27,7 +20,7 @@ inline Vec3i index1Dto3D(const int index_1d, int id_y = (index_1d - id_x * dim_y * dim_z) / dim_z; int id_z = index_1d % dim_z; return Vec3i(id_x, id_y, id_z); -}; +} // if exponent is an integer between 0 and 5 (the most common cases in gint) and // and exp is a variable that cannot be determined at compile time (which means the compiler cannot optimize the code), @@ -52,17 +45,17 @@ inline double pow_int(const double base, const int exp) double result = std::pow(base, exp); return result; } -}; +} inline int floor_div(const int a, const int b) { // a ^ b < 0 means a and b have different signs return a / b - (a % b != 0 && (a ^ b) < 0); -}; +} inline int ceil_div(const int a, const int b) { return a / b + (a % b != 0 && (a ^ b) > 0); -}; +} } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h index e5aa721b7f..0cf033e040 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h @@ -1,7 +1,7 @@ #pragma once #include #include "module_base/tool_quit.h" -#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" +#include "gint_helper.cuh" template class CudaMemWrapper diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh index 2ada532854..5ad934e305 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh @@ -6,7 +6,7 @@ #include #include // for fprintf 
and stderr -#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" +#include "gint_helper.cuh" #include @@ -395,8 +395,8 @@ void vbatched_gemm_nn_impl(int max_m, for (int i = 0; i < batchCount; i += max_batch_count) { const int ibatch = min(max_batch_count, batchCount - i); - dim3 dimGrid(ceildiv(max_n, BLK_M), - ceildiv(max_m, BLK_N), + dim3 dimGrid(ceil_div(max_n, BLK_M), + ceil_div(max_m, BLK_N), ibatch); const T* alpha_tmp = nullptr; if (alpha != nullptr) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh index 5b00a0f6ca..701e93e81f 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh @@ -6,7 +6,7 @@ #include #include // for fprintf and stderr -#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" +#include "gint_helper.cuh" #include @@ -420,8 +420,8 @@ void vbatched_gemm_tn_impl(int max_m, for (int i = 0; i < batchCount; i += max_batch_count) { const int ibatch = min(max_batch_count, batchCount - i); - dim3 dimGrid(ceildiv(max_n, BLK_M), - ceildiv(max_m, BLK_N), + dim3 dimGrid(ceil_div(max_n, BLK_M), + ceil_div(max_m, BLK_N), ibatch); const T* alpha_tmp = nullptr; if (alpha != nullptr) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h index 68b591822e..50bb80c02f 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h @@ -6,7 +6,7 @@ #include "module_cell/unitcell.h" #include "module_cell/atom_spec.h" #include "module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h" -#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" +#include "gint_helper.cuh" #include 
"module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cuh" namespace ModuleGint diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh index 4807801466..20fbebef03 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh @@ -1,4 +1,5 @@ #pragma once +#include // if exponent is an integer between 0 and 5 (the most common cases in gint) and // and exp is a variable that cannot be determined at compile time (which means the compiler cannot optimize the code), @@ -24,7 +25,7 @@ __forceinline__ __device__ T pow_int(const T base, const int exp) double result = std::pow(base, exp); return result; } -}; +} template __forceinline__ __device__ T warpReduceSum(T val) @@ -35,4 +36,40 @@ __forceinline__ __device__ T warpReduceSum(T val) val += __shfl_xor_sync(0xffffffff, val, 2, 32); val += __shfl_xor_sync(0xffffffff, val, 1, 32); return val; -} \ No newline at end of file +} + +inline int ceil_div(const int a, const int b) +{ + return a / b + (a % b != 0 && (a ^ b) > 0); +} + +inline void check(cudaError_t result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, + static_cast(result), func); + exit(EXIT_FAILURE); + } +} + +inline void __getLastCudaError(const char *file, + const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " (%d) %s.\n", + file, line, static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCuda(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling 
cudaGetLastError +#define checkCudaLastError() __getLastCudaError(__FILE__, __LINE__) \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h index fd746e011e..4988e265ce 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h @@ -3,7 +3,7 @@ #include #include "module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h" -#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" +#include "gint_helper.cuh" #include "gint_gpu_vars.h" #include "cuda_mem_wrapper.h" diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu index 314020fd45..38fba5de00 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu @@ -1,5 +1,5 @@ #include "set_const_mem.cuh" -#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" +#include "gint_helper.cuh" __constant__ double ylmcoe_d[100]; From f8f6e31264a99ba9e09834acfbc3fbeaa6536238 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 02:33:33 +0800 Subject: [PATCH 38/63] remove redundant ";" --- .../module_gint/temp_gint/batch_biggrid.h | 20 ++++---- .../module_gint/temp_gint/big_grid.h | 22 ++++----- .../module_gint/temp_gint/biggrid_info.h | 24 +++++----- .../module_gint/temp_gint/gint_atom.h | 22 ++++----- .../module_gint/temp_gint/gint_fvl.h | 2 +- .../module_gint/temp_gint/gint_fvl_gpu.h | 2 +- .../module_gint/temp_gint/gint_fvl_meta.h | 2 +- .../module_gint/temp_gint/gint_fvl_meta_gpu.h | 2 +- .../module_gint/temp_gint/gint_info.h | 12 ++--- .../module_gint/temp_gint/gint_rho.h | 2 +- .../module_gint/temp_gint/gint_rho_gpu.h | 2 +- 
.../module_gint/temp_gint/gint_tau.h | 2 +- .../module_gint/temp_gint/gint_tau_gpu.h | 2 +- .../module_gint/temp_gint/gint_vl.h | 2 +- .../module_gint/temp_gint/gint_vl_gpu.h | 2 +- .../module_gint/temp_gint/gint_vl_metagga.h | 2 +- .../temp_gint/gint_vl_metagga_gpu.h | 2 +- .../temp_gint/gint_vl_metagga_nspin4.h | 2 +- .../temp_gint/gint_vl_metagga_nspin4_gpu.h | 2 +- .../module_gint/temp_gint/gint_vl_nspin4.h | 2 +- .../temp_gint/gint_vl_nspin4_gpu.h | 2 +- .../temp_gint/kernel/cuda_mem_wrapper.h | 46 +++++++++---------- .../module_gint/temp_gint/localcell_info.h | 20 ++++---- .../module_gint/temp_gint/meshgrid_info.h | 8 ++-- .../module_gint/temp_gint/phi_operator.h | 6 +-- .../module_gint/temp_gint/unitcell_info.h | 38 +++++++-------- 26 files changed, 125 insertions(+), 125 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h index 4556fac707..d4de77d1db 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h @@ -11,18 +11,18 @@ class BatchBigGrid public: BatchBigGrid(std::vector> biggrids); - const std::vector>& get_bgrids() { return biggrids_; }; + const std::vector>& get_bgrids() { return biggrids_; } - int get_batch_size() const { return biggrids_.size(); }; - int get_atoms_num() const { return atoms_num_; }; + int get_batch_size() const { return biggrids_.size(); } + int get_atoms_num() const { return atoms_num_; } int get_phi_len() const { return phi_len_;} - int get_max_atoms_num_per_bgrid() const { return max_atoms_num_per_bgrid_; }; - bool empty() {return atoms_num_ == 0; }; - static int get_max_batch_size() { return max_batch_size_; }; - static int get_max_atoms_num() { return max_atoms_num_; }; - static int get_max_phi_len() { return max_phi_len_; }; - static int get_max_atom_pairs_num() { return max_atom_pairs_num_; }; - static std::shared_ptr 
get_bgrid_info() { return BigGrid::get_bgrid_info(); }; + int get_max_atoms_num_per_bgrid() const { return max_atoms_num_per_bgrid_; } + bool empty() {return atoms_num_ == 0; } + static int get_max_batch_size() { return max_batch_size_; } + static int get_max_atoms_num() { return max_atoms_num_; } + static int get_max_phi_len() { return max_phi_len_; } + static int get_max_atom_pairs_num() { return max_atom_pairs_num_; } + static std::shared_ptr get_bgrid_info() { return BigGrid::get_bgrid_info(); } private: std::vector> biggrids_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h index b8ea90eeeb..55bed7a251 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h @@ -17,23 +17,23 @@ class BigGrid // constructor BigGrid(int idx); - static void init_localcell_info(std::shared_ptr localcell_info) { localcell_info_ = localcell_info; }; - static void init_unitcell_info(std::shared_ptr unitcell_info) { unitcell_info_ = unitcell_info; }; - static void init_bgrid_info(std::shared_ptr biggrid_info) { biggrid_info_ = biggrid_info; }; + static void init_localcell_info(std::shared_ptr localcell_info) { localcell_info_ = localcell_info; } + static void init_unitcell_info(std::shared_ptr unitcell_info) { unitcell_info_ = unitcell_info; } + static void init_bgrid_info(std::shared_ptr biggrid_info) { biggrid_info_ = biggrid_info; } // getter functions - int get_idx() const { return idx_; }; - static std::shared_ptr get_localcell_info() { return localcell_info_; }; - static std::shared_ptr get_unitcell_info() { return unitcell_info_; }; - static std::shared_ptr get_bgrid_info() { return biggrid_info_; }; - const std::vector& get_atoms() const { return atoms_; }; - const GintAtom* get_atom(int i) const { return atoms_[i]; }; + int get_idx() const { return idx_; } + static std::shared_ptr get_localcell_info() { return 
localcell_info_; } + static std::shared_ptr get_unitcell_info() { return unitcell_info_; } + static std::shared_ptr get_bgrid_info() { return biggrid_info_; } + const std::vector& get_atoms() const { return atoms_; } + const GintAtom* get_atom(int i) const { return atoms_[i]; } // get the number of meshgrids in the big grid - int get_mgrids_num() const { return biggrid_info_->get_mgrids_num(); }; + int get_mgrids_num() const { return biggrid_info_->get_mgrids_num(); } // get the number of atoms that can affect the big grid - int get_atoms_num() const { return atoms_.size(); }; + int get_atoms_num() const { return atoms_.size(); } // add an atom to the big grid void add_atom(const GintAtom* atom); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h index 9c6c96c243..c017f87a3d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h @@ -22,36 +22,36 @@ class BigGridInfo Vec3d biggrid_vec3, int nmx, int nmy, int nmz); - Vec3d get_cartesian_coord(const Vec3d& index_3d) const { return index_3d * biggrid_latvec0_; }; - Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * biggrid_latvec0_; }; - Vec3d get_direct_coord(const Vec3d& cart_coord) const { return cart_coord * biggrid_GT_; }; + Vec3d get_cartesian_coord(const Vec3d& index_3d) const { return index_3d * biggrid_latvec0_; } + Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * biggrid_latvec0_; } + Vec3d get_direct_coord(const Vec3d& cart_coord) const { return cart_coord * biggrid_GT_; } // Return the maximum number of big grids that can fit inside a sphere of radius r, // along the three lattice vector directions. 
Vec3i max_ext_bgrid_num(double r) const; // get number of meshgrids along three lattice directions - int get_nmx() const { return nmx_; }; - int get_nmy() const { return nmy_; }; - int get_nmz() const { return nmz_; }; - int get_mgrids_num() const { return nmxyz_; }; + int get_nmx() const { return nmx_; } + int get_nmy() const { return nmy_; } + int get_nmz() const { return nmz_; } + int get_mgrids_num() const { return nmxyz_; } - const std::vector& get_mgrids_coord() const { return meshgrid_coords_; }; - const Vec3d& get_mgrid_coord(int index_1d) const { return meshgrid_coords_[index_1d]; }; + const std::vector& get_mgrids_coord() const { return meshgrid_coords_; } + const Vec3d& get_mgrid_coord(int index_1d) const { return meshgrid_coords_[index_1d]; } - std::shared_ptr get_mgrid_info() const { return meshgrid_info_; }; + std::shared_ptr get_mgrid_info() const { return meshgrid_info_; } // get the 3D index of a meshgrid in the big grid from the 1D index Vec3i mgrid_idx_1Dto3D(int index_1d) const { return index1Dto3D(index_1d, nmx_, nmy_, nmz_); - }; + } // get the 1D index of a meshgrid in the big grid from the 3D index int mgrid_idx_3Dto1D(const Vec3i index_3d) const { return index3Dto1D(index_3d.x, index_3d.y, index_3d.z, nmx_, nmy_, nmz_); - }; + } private: // basis vectors of the big grid diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h index 3a0e874e4e..aff8aae5b9 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h @@ -21,18 +21,18 @@ class GintAtom const UnitCell* ucell); // getter functions - const Atom* get_atom() const { return atom_; }; - int get_ia() const { return ia_; }; - int get_iat() const { return iat_; }; - int get_start_iw() const { return ucell_->itiaiw2iwt(it_, ia_, 0); }; // get the start index of global atomic orbitals - const Vec3i& get_bgrid_idx() const { return 
biggrid_idx_; }; - const Vec3i& get_unitcell_idx() const { return unitcell_idx_; }; - const Vec3i& get_R() const { return unitcell_idx_; }; - const Vec3d& get_tau_in_bgrid() const { return tau_in_biggrid_; }; - const Numerical_Orbital* get_orb() const { return orb_; }; + const Atom* get_atom() const { return atom_; } + int get_ia() const { return ia_; } + int get_iat() const { return iat_; } + int get_start_iw() const { return ucell_->itiaiw2iwt(it_, ia_, 0); } // get the start index of global atomic orbitals + const Vec3i& get_bgrid_idx() const { return biggrid_idx_; } + const Vec3i& get_unitcell_idx() const { return unitcell_idx_; } + const Vec3i& get_R() const { return unitcell_idx_; } + const Vec3d& get_tau_in_bgrid() const { return tau_in_biggrid_; } + const Numerical_Orbital* get_orb() const { return orb_; } - int get_nw() const { return atom_->nw; }; - double get_rcut() const { return orb_->getRcut(); }; + int get_nw() const { return atom_->nw; } + double get_rcut() const { return orb_->getRcut(); } /** * @brief Get the wave function values of the atom at a meshgrid. 
diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h index 031a6c2cc3..9e225fed0f 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h @@ -23,7 +23,7 @@ class Gint_fvl : public Gint ModuleBase::matrix* svl) : nspin_(nspin), vr_eff_(vr_eff), dm_vec_(dm_vec), isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), - dr3_(gint_info_->get_mgrid_volume()) {}; + dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h index 29a1b7704e..6509968a85 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h @@ -24,7 +24,7 @@ class Gint_fvl_gpu : public Gint ModuleBase::matrix* svl) : nspin_(nspin), vr_eff_(vr_eff), dm_vec_(dm_vec), isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), - dr3_(gint_info_->get_mgrid_volume()) {}; + dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h index 974b77b5e8..1abeac9d11 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h @@ -23,7 +23,7 @@ class Gint_fvl_meta : public Gint ModuleBase::matrix* svl) : nspin_(nspin), vr_eff_(vr_eff), vofk_(vofk), dm_vec_(dm_vec), isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), - dr3_(gint_info_->get_mgrid_volume()) {}; + dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h index 9e5fd08bc1..648bc6877a 
100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h @@ -24,7 +24,7 @@ class Gint_fvl_meta_gpu : public Gint ModuleBase::matrix* svl) : nspin_(nspin), vr_eff_(vr_eff), vofk_(vofk), dm_vec_(dm_vec), isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), - dr3_(gint_info_->get_mgrid_volume()) {}; + dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h index 5071c0f1a0..08bc0088f5 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h @@ -34,12 +34,12 @@ class GintInfo const UnitCell& ucell, Grid_Driver& gd); // getter functions - const std::vector>& get_biggrids() { return biggrids_; }; - const std::vector& get_trace_lo() const{ return trace_lo_; }; - int get_lgd() const { return lgd_; }; - int get_nat() const { return ucell_->nat; }; // return the number of atoms in the unitcell - int get_local_mgrid_num() const { return localcell_info_->get_mgrids_num(); }; - double get_mgrid_volume() const { return meshgrid_info_->get_volume(); }; + const std::vector>& get_biggrids() { return biggrids_; } + const std::vector& get_trace_lo() const{ return trace_lo_; } + int get_lgd() const { return lgd_; } + int get_nat() const { return ucell_->nat; } // return the number of atoms in the unitcell + int get_local_mgrid_num() const { return localcell_info_->get_mgrids_num(); } + double get_mgrid_volume() const { return meshgrid_info_->get_volume(); } //========================================= // functions about hcontainer diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h index d46ed15e37..9499b3e7de 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h @@ -16,7 +16,7 @@ class Gint_rho : public Gint const std::vector*>& dm_vec, const int nspin, double **rho) - : dm_vec_(dm_vec), nspin_(nspin), rho_(rho) {}; + : dm_vec_(dm_vec), nspin_(nspin), rho_(rho) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h index 97071a3085..7831af136e 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h @@ -17,7 +17,7 @@ class Gint_rho_gpu: public Gint const std::vector*>& dm_vec, const int nspin, double **rho) - : dm_vec_(dm_vec), nspin_(nspin), rho_(rho) {}; + : dm_vec_(dm_vec), nspin_(nspin), rho_(rho) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h index fae12f524b..e0fd997fb9 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h @@ -15,7 +15,7 @@ class Gint_tau : public Gint const std::vector*>& dm_vec, const int nspin, double** tau) - : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {}; + : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h index f4bd1dc77f..392682ef43 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h @@ -17,7 +17,7 @@ class Gint_tau_gpu : public Gint const std::vector*>& dm_vec, const int nspin, double** tau) - : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {}; + : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h index 742fd8e625..e5239c1dc3 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h @@ -15,7 +15,7 @@ class Gint_vl : public Gint Gint_vl( const double* vr_eff, HContainer* hR) - : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h index fd7cee190b..b9f9c23272 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h @@ -16,7 +16,7 @@ class Gint_vl_gpu : public Gint Gint_vl_gpu( const double* vr_eff, HContainer* hR) - : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h index 0ddb5b828f..569ddf54f1 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h @@ -16,7 +16,7 @@ class Gint_vl_metagga : public Gint const double* vr_eff, const double* vofk, HContainer* hR) - : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h index 65486dcdfd..6d8b707f3d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h @@ -17,7 +17,7 @@ class Gint_vl_metagga_gpu : public Gint const double* vr_eff, const double* vofk, HContainer* hR) - : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h index 1505c39af0..d9167e5d5f 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h @@ -16,7 +16,7 @@ class Gint_vl_metagga_nspin4 : public Gint std::vector vr_eff, std::vector vofk, HContainer>* hR) - : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h index 1c2722decd..81119ce11d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h @@ -17,7 +17,7 @@ class Gint_vl_metagga_nspin4_gpu : public Gint std::vector vr_eff, std::vector vofk, HContainer>* hR) - : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h index 6371257823..ebc07b4901 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h @@ -15,7 +15,7 @@ class Gint_vl_nspin4 : public Gint Gint_vl_nspin4( std::vector vr_eff, HContainer>* hR) - : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){} + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h index 4a12b2bc69..df9993ce6c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h @@ -16,7 +16,7 @@ class Gint_vl_nspin4_gpu : public Gint Gint_vl_nspin4_gpu( std::vector vr_eff, HContainer>* hR) - : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} void cal_gint(); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h index 0cf033e040..b23fa15792 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h @@ -24,7 +24,7 @@ class CudaMemWrapper other.size_ = 0; other.malloc_host_ = false; other.stream_ = 0; - }; + } CudaMemWrapper& operator=(CudaMemWrapper&& other) noexcept { @@ -43,7 +43,7 @@ class CudaMemWrapper other.stream_ = 0; } return *this; - }; + } CudaMemWrapper(size_t size, cudaStream_t stream = 0, @@ -63,92 +63,92 @@ class CudaMemWrapper checkCuda(cudaMalloc((void**)&device_ptr_, size_ * sizeof(T))); checkCuda(cudaMemset(device_ptr_, 0, size_ * sizeof(T))); - }; + } ~CudaMemWrapper() { free(); - }; + } void copy_host_to_device_sync(size_t size) { if (host_ptr_ == nullptr) { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to device."); } 
checkCuda(cudaMemcpy(device_ptr_, host_ptr_, size * sizeof(T), cudaMemcpyHostToDevice)); - }; + } void copy_host_to_device_sync() { copy_host_to_device_sync(size_); - }; + } void copy_host_to_device_async(size_t size) { if (host_ptr_ == nullptr) { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to device."); } checkCuda(cudaMemcpyAsync(device_ptr_, host_ptr_, size * sizeof(T), cudaMemcpyHostToDevice, stream_)); - }; + } void copy_host_to_device_async() { copy_host_to_device_async(size_); - }; + } void copy_device_to_host_sync(size_t size) { if (host_ptr_ == nullptr) { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to host."); } checkCuda(cudaMemcpy(host_ptr_, device_ptr_, size * sizeof(T), cudaMemcpyDeviceToHost)); - }; + } void copy_device_to_host_sync() { copy_device_to_host_sync(size_); - }; + } void copy_device_to_host_async(size_t size) { if (host_ptr_ == nullptr) { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to host."); } checkCuda(cudaMemcpyAsync(host_ptr_, device_ptr_, size * sizeof(T), cudaMemcpyDeviceToHost, stream_)); - }; + } void copy_device_to_host_async() { copy_device_to_host_async(size_); - }; + } void memset_device_sync(const size_t size, const int value = 0) { checkCuda(cudaMemset(device_ptr_, value, size * sizeof(T))); - }; + } void memset_device_sync(const int value = 0) { memset_device_sync(size_, value); - }; + } void memset_device_async(const size_t size, const int value = 0) { checkCuda(cudaMemsetAsync(device_ptr_, value, size * sizeof(T), stream_)); - }; + } void memset_device_async(const int value = 0) { memset_device_async(size_, value); - }; + } void memset_host(const size_t size, const int value = 0) { if (host_ptr_ == nullptr) { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot memset host."); } checkCuda(cudaMemset(host_ptr_, value, size * sizeof(T))); - }; + } void memset_host(const int value = 0) { 
memset_host(size_, value); - }; + } void free() { @@ -156,11 +156,11 @@ class CudaMemWrapper checkCuda(cudaFreeHost(host_ptr_)); } - T* get_device_ptr() { return device_ptr_; }; - T* get_host_ptr() { return host_ptr_; }; - const T* get_device_ptr() const { return device_ptr_; }; - const T* get_host_ptr() const { return host_ptr_; }; - size_t get_size() const { return size_; }; + T* get_device_ptr() { return device_ptr_; } + T* get_host_ptr() { return host_ptr_; } + const T* get_device_ptr() const { return device_ptr_; } + const T* get_host_ptr() const { return host_ptr_; } + size_t get_size() const { return size_; } private: T* device_ptr_ = nullptr; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h index f24d1194b4..0c146b86ab 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h @@ -17,16 +17,16 @@ class LocalCellInfo std::shared_ptr unitcell_info); // getter functions - int get_startidx_bx() const { return startidx_bx_; }; - int get_startidx_by() const { return startidx_by_; }; - int get_startidx_bz() const { return startidx_bz_; }; - int get_nbx() const { return nbx_; }; - int get_nby() const { return nby_; }; - int get_nbz() const { return nbz_; }; - int get_bgrids_num() const { return nbxyz_; }; - int get_mgrids_num() const { return nmxyz_; }; - std::shared_ptr get_unitcell_info() const { return unitcell_info_; }; - std::shared_ptr get_bgrid_info() const { return unitcell_info_->get_bgrid_info(); }; + int get_startidx_bx() const { return startidx_bx_; } + int get_startidx_by() const { return startidx_by_; } + int get_startidx_bz() const { return startidx_bz_; } + int get_nbx() const { return nbx_; } + int get_nby() const { return nby_; } + int get_nbz() const { return nbz_; } + int get_bgrids_num() const { return nbxyz_; } + int get_mgrids_num() const { return nmxyz_; } + 
std::shared_ptr get_unitcell_info() const { return unitcell_info_; } + std::shared_ptr get_bgrid_info() const { return unitcell_info_->get_bgrid_info(); } //==================================================================== // functions related to the big grid diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h index 99376c9a20..a8307b1048 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h @@ -35,10 +35,10 @@ class MeshGridInfo meshgrid_GT_ = meshgrid_latvec0_.Inverse(); meshgrid_volume_ = std::abs(meshgrid_latvec0_.Det()); - }; - - double get_volume() const { return meshgrid_volume_; }; - Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * meshgrid_latvec0_; }; + } + + double get_volume() const { return meshgrid_volume_; } + Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * meshgrid_latvec0_; } Vec3d get_direct_coord(const Vec3d& cart_coord) const { return cart_coord * meshgrid_GT_; } private: diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h index e25c3641d8..48044e0014 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h @@ -28,8 +28,8 @@ class PhiOperator void set_bgrid(std::shared_ptr biggrid); // getter - int get_rows() const {return rows_;}; - int get_cols() const {return cols_;}; + int get_rows() const {return rows_;} + int get_cols() const {return cols_;} // get phi of the big grid // the dimension of phi is num_mgrids * (\sum_{i=0}^{atoms_->size()} atoms_[i]->nw) @@ -121,7 +121,7 @@ class PhiOperator int x = std::min(a, b); int y = std::abs(a - b); return atom_pair_start_end_idx_[(2 * biggrid_->get_atoms_num() - x + 1) * x / 2 + y]; - }; + } 
bool is_atom_on_mgrid(int atom_idx, int mgrid_idx) const { diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h index b75806fa2a..df1e88b38c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h @@ -21,16 +21,16 @@ class UnitCellInfo int nmx, int nmy, int nmz); // getter functions - int get_nbx() const { return nbx_; }; - int get_nby() const { return nby_; }; - int get_nbz() const { return nbz_; }; - int get_bgrids_num() const { return nbxyz_; }; - int get_nmx() const { return nmx_; }; - int get_nmy() const { return nmy_; }; - int get_nmz() const { return nmz_; }; - int get_mgrids_num() const { return nmxyz_; }; - std::shared_ptr get_bgrid_info() const { return biggrid_info_; }; - std::shared_ptr get_mgrid_info() const { return meshgrid_info_; }; + int get_nbx() const { return nbx_; } + int get_nby() const { return nby_; } + int get_nbz() const { return nbz_; } + int get_bgrids_num() const { return nbxyz_; } + int get_nmx() const { return nmx_; } + int get_nmy() const { return nmy_; } + int get_nmz() const { return nmz_; } + int get_mgrids_num() const { return nmxyz_; } + std::shared_ptr get_bgrid_info() const { return biggrid_info_; } + std::shared_ptr get_mgrid_info() const { return meshgrid_info_; } //==================================================================== // functions related to the big grid @@ -40,25 +40,25 @@ class UnitCellInfo Vec3i bgrid_idx_1Dto3D(const int index_1d) const { return index1Dto3D(index_1d, nbx_, nby_, nbz_); - }; + } // transform the 3D index of a biggrid in the unit cell to the 1D index int bgrid_idx_3Dto1D(const Vec3i index_3d) const { return index3Dto1D(index_3d.x, index_3d.y, index_3d.z, nbx_, nby_, nbz_); - }; + } // get the cartesian coordinate of a big grid in the unit cell from the 3D index Vec3d get_bgrid_coord(Vec3i index_3d) const { return 
biggrid_info_->get_cartesian_coord(index_3d); - }; + } // get the cartesian coordinate of a big grid in the unit cell from the 1D index Vec3d get_bgrid_coord(int index_1d) const { return get_bgrid_coord(bgrid_idx_1Dto3D(index_1d)); - }; + } // get the 3D index of a big grid in the unit cell from the cartesian coordinate Vec3i get_bgrid_idx_3d(const Vec3d coord) const @@ -68,7 +68,7 @@ class UnitCellInfo static_cast(floor(direct_coord.x)), static_cast(floor(direct_coord.y)), static_cast(floor(direct_coord.z))); - }; + } // Get the relative Cartesian coordinates of big grid A relative to big grid B // returned vector = coordinates of point A - coordinates of point B @@ -77,7 +77,7 @@ class UnitCellInfo Vec3d get_relative_coord(Vec3i index_3d_a, Vec3i index_3d_b) const { return get_bgrid_coord(index_3d_a - index_3d_b); - }; + } // get the extended unitcell index of a big grid Vec3i get_unitcell_idx(const Vec3i index_3d) const @@ -85,7 +85,7 @@ class UnitCellInfo return Vec3i(floor_div(index_3d.x, nbx_), floor_div(index_3d.y, nby_), floor_div(index_3d.z, nbz_)); - }; + } // map the extended big grid index to the big grid index in unitcell Vec3i map_ext_idx_to_ucell(const Vec3i index_3d) const @@ -93,7 +93,7 @@ class UnitCellInfo return Vec3i(index_3d.x - floor_div(index_3d.x, nbx_) * nbx_, index_3d.y - floor_div(index_3d.y, nby_) * nby_, index_3d.z - floor_div(index_3d.z, nbz_) * nbz_); - }; + } //==================================================================== @@ -116,7 +116,7 @@ class UnitCellInfo Vec3d get_mgrid_coord(Vec3i index_3d) const { return meshgrid_info_->get_cartesian_coord(index_3d); - }; + } // get the cartesian coordinate of a meshgrid in the unit cell from the 1D index Vec3d get_mgrid_coord(int index_1d) const From 86b63b861e5b8fb6076ca4b95f91880e35bad5f0 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 10:51:18 +0800 Subject: [PATCH 39/63] fix cmakelist --- source/module_hamilt_lcao/module_gint/CMakeLists.txt | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 5e03b11cf5..65afa13e5e 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -61,7 +61,7 @@ if(NEW_GINT) temp_gint/kernel/phi_operator_gpu.cu temp_gint/kernel/phi_operator_kernel.cu temp_gint/kernel/set_const_mem.cu - temp_gint/batch_biggrid + temp_gint/batch_biggrid.cpp temp_gint/gint_vl_gpu.cpp temp_gint/gint_rho_gpu.cpp temp_gint/gint_fvl_gpu.cpp From 34b9564a37c25b29106464d443bcc070c8770383 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 10:55:17 +0800 Subject: [PATCH 40/63] add checkcudalasterror --- .../module_gint/temp_gint/kernel/phi_operator_gpu.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu index f4bb2667fb..edc07959d4 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu @@ -112,6 +112,7 @@ void PhiOperatorGpu::set_phi(double* phi_d) const atoms_phi_start_.get_device_ptr(), bgrids_phi_len_.get_device_ptr(), phi_d); + checkCudaLastError(); } void PhiOperatorGpu::set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_y_d, double* dphi_z_d) const @@ -143,6 +144,7 @@ void PhiOperatorGpu::set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_ dphi_x_d, dphi_y_d, dphi_z_d); + checkCudaLastError(); } void PhiOperatorGpu::set_ddphi(double* ddphi_xx_d, double* ddphi_xy_d, double* ddphi_xz_d, @@ -185,6 +187,7 @@ void PhiOperatorGpu::set_ddphi(double* ddphi_xx_d, double* ddphi_xy_d, double* d ddphi_yy_d, ddphi_yz_d, ddphi_zz_d); + checkCudaLastError(); } void PhiOperatorGpu::phi_mul_vldr3( @@ -204,6 +207,7 @@ void 
PhiOperatorGpu::phi_mul_vldr3( bgrids_phi_len_.get_device_ptr(), bgrids_phi_start_.get_device_ptr(), result_d); + checkCudaLastError(); } void PhiOperatorGpu::phi_mul_phi( @@ -402,6 +406,7 @@ void PhiOperatorGpu::phi_dot_phi( bgrids_phi_len_.get_device_ptr(), bgrids_phi_start_.get_device_ptr(), rho_d); + checkCudaLastError(); } void PhiOperatorGpu::phi_dot_dphi( @@ -427,6 +432,7 @@ void PhiOperatorGpu::phi_dot_dphi( gint_gpu_vars_->iat2it_d, gint_gpu_vars_->atom_nw_d, fvl_d); + checkCudaLastError(); } void PhiOperatorGpu::phi_dot_dphi_r( @@ -454,6 +460,7 @@ void PhiOperatorGpu::phi_dot_dphi_r( gint_gpu_vars_->iat2it_d, gint_gpu_vars_->atom_nw_d, svl_d); + checkCudaLastError(); } } \ No newline at end of file From d86c0e373f509b8af0ec06eeccf894ff57b1c482 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 11:08:47 +0800 Subject: [PATCH 41/63] small modification --- source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h | 2 -- source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h | 2 -- source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h | 2 -- source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h | 2 -- source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h | 2 -- source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h | 2 -- .../module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h | 2 -- .../module_gint/temp_gint/gint_vl_metagga_gpu.h | 2 -- .../module_gint/temp_gint/gint_vl_metagga_nspin4.h | 2 -- .../module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h | 2 -- .../module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h | 2 -- .../module_gint/temp_gint/gint_vl_nspin4_gpu.h | 2 -- 12 files changed, 24 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h index 9499b3e7de..7e4816b729 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h @@ 
-32,9 +32,7 @@ class Gint_rho : public Gint // output double **rho_; - //======================== // Intermediate variables - //======================== std::vector> dm_gint_vec_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h index 7831af136e..f5a172ea33 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h @@ -37,9 +37,7 @@ class Gint_rho_gpu: public Gint // output double **rho_; - //======================== // Intermediate variables - //======================== std::vector> dm_gint_vec_; std::vector> dm_gint_d_vec_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h index e0fd997fb9..b1d3b0664a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h @@ -31,9 +31,7 @@ class Gint_tau : public Gint // output double **kin_; - //======================== // Intermediate variables - //======================== std::vector> dm_gint_vec_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h index 392682ef43..bfac5a48a3 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h @@ -37,9 +37,7 @@ class Gint_tau_gpu : public Gint // output double **kin_; - //======================== // Intermediate variables - //======================== std::vector> dm_gint_vec_; std::vector> dm_gint_d_vec_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h index e5239c1dc3..fc717629c5 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h @@ -33,9 +33,7 @@ class Gint_vl : public Gint // output HContainer* hR_; - //======================== // Intermediate variables - //======================== double dr3_; HContainer hr_gint_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h index b9f9c23272..53290658c8 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h @@ -37,9 +37,7 @@ class Gint_vl_gpu : public Gint // output HContainer* hR_; - //======================== // Intermediate variables - //======================== double dr3_; HContainer hr_gint_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h index 569ddf54f1..01bef660a2 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h @@ -35,9 +35,7 @@ class Gint_vl_metagga : public Gint // output HContainer* hR_; - //======================== // Intermediate variables - //======================== double dr3_; HContainer hr_gint_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h index 6d8b707f3d..efdc01762a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h @@ -40,9 +40,7 @@ class Gint_vl_metagga_gpu : public Gint // output HContainer* hR_; - //======================== // Intermediate variables - //======================== double dr3_; HContainer hr_gint_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h index 
d9167e5d5f..abdbde3f08 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h @@ -31,9 +31,7 @@ class Gint_vl_metagga_nspin4 : public Gint // output HContainer>* hR_; - //======================== // Intermediate variables - //======================== const double dr3_; const int nspin_ = 4; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h index 81119ce11d..d38665dffa 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h @@ -36,9 +36,7 @@ class Gint_vl_metagga_nspin4_gpu : public Gint // output HContainer>* hR_; - //======================== // Intermediate variables - //======================== const double dr3_; const int nspin_ = 4; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h index ebc07b4901..9436b5c397 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h @@ -33,9 +33,7 @@ class Gint_vl_nspin4 : public Gint // output HContainer>* hR_; - //======================== // Intermediate variables - //======================== const double dr3_; const int nspin_ = 4; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h index df9993ce6c..a7decea9e8 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h @@ -38,9 +38,7 @@ class Gint_vl_nspin4_gpu : public Gint // output HContainer>* hR_; - //======================== // Intermediate variables 
- //======================== const double dr3_; const int nspin_ = 4; From c0bc2a6f105502adee9509471a0c179b3723d41e Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 11:46:28 +0800 Subject: [PATCH 42/63] add some timer --- .../module_gint/temp_gint/gint_common.cpp | 21 ++++++++++++++----- .../module_gint/temp_gint/gint_dvlocal.cpp | 6 ++++++ .../module_gint/temp_gint/gint_env_gamma.cpp | 3 +++ .../module_gint/temp_gint/gint_env_k.cpp | 3 +++ .../module_gint/temp_gint/gint_fvl.cpp | 3 +++ .../module_gint/temp_gint/gint_fvl_gpu.cpp | 3 +++ .../module_gint/temp_gint/gint_fvl_meta.cpp | 3 +++ .../temp_gint/gint_fvl_meta_gpu.cpp | 3 +++ .../module_gint/temp_gint/gint_interface.cpp | 16 -------------- .../module_gint/temp_gint/gint_rho.cpp | 3 +++ .../module_gint/temp_gint/gint_rho_gpu.cpp | 3 +++ .../module_gint/temp_gint/gint_tau.cpp | 3 +++ .../module_gint/temp_gint/gint_tau_gpu.cpp | 3 +++ .../module_gint/temp_gint/gint_vl.cpp | 3 +++ .../module_gint/temp_gint/gint_vl_gpu.cpp | 3 +++ .../module_gint/temp_gint/gint_vl_metagga.cpp | 3 +++ .../temp_gint/gint_vl_metagga_gpu.cpp | 3 +++ .../temp_gint/gint_vl_metagga_nspin4.cpp | 3 +++ .../temp_gint/gint_vl_metagga_nspin4_gpu.cpp | 3 +++ .../module_gint/temp_gint/gint_vl_nspin4.cpp | 3 +++ .../temp_gint/gint_vl_nspin4_gpu.cpp | 3 +++ 21 files changed, 76 insertions(+), 21 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp index 2e2faf2ab1..74c726c2db 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp @@ -13,6 +13,8 @@ namespace ModuleGint void compose_hr_gint(HContainer& hr_gint) { + ModuleBase::TITLE("Gint", "compose_hr_gint"); + ModuleBase::timer::tick("Gint", "compose_hr_gint"); for (int iap = 0; iap < hr_gint.size_atom_pairs(); iap++) { auto& ap = hr_gint.get_atom_pair(iap); @@ -42,11 +44,14 @@ void 
compose_hr_gint(HContainer& hr_gint) } } } + ModuleBase::timer::tick("Gint", "compose_hr_gint"); } void compose_hr_gint(const std::vector>& hr_gint_part, HContainer>& hr_gint_full) { + ModuleBase::TITLE("Gint", "compose_hr_gint"); + ModuleBase::timer::tick("Gint", "compose_hr_gint"); for (int iap = 0; iap < hr_gint_full.size_atom_pairs(); iap++) { auto* ap = &(hr_gint_full.get_atom_pair(iap)); @@ -106,11 +111,14 @@ void compose_hr_gint(const std::vector>& hr_gint_part, } } } + ModuleBase::timer::tick("Gint", "compose_hr_gint"); } template void transfer_hr_gint_to_hR(const HContainer& hr_gint, HContainer& hR) { + ModuleBase::TITLE("Gint", "transfer_hr_gint_to_hR"); + ModuleBase::timer::tick("Gint", "transfer_hr_gint_to_hR"); #ifdef __MPI int size = 0; MPI_Comm_size(MPI_COMM_WORLD, &size); @@ -125,6 +133,7 @@ void transfer_hr_gint_to_hR(const HContainer& hr_gint, HContainer& hR) #else hR.add(hr_gint); #endif + ModuleBase::timer::tick("Gint", "transfer_hr_gint_to_hR"); } // gint_info should not have been a parameter, but it was added to initialize dm_gint_full @@ -135,9 +144,10 @@ void transfer_dm_2d_to_gint( std::vector*> dm, std::vector>& dm_gint) { - // To check whether input parameter dm_2d has been initialized + ModuleBase::TITLE("Gint", "transfer_dm_2d_to_gint"); + ModuleBase::timer::tick("Gint", "transfer_dm_2d_to_gint"); assert(PARAM.inp.nspin == dm_gint.size() - && "The size of dm should be equal to the number of spins!"); + && "The size of dm_gint should be equal to the number of spins!"); if(PARAM.inp.nspin != 4) { assert(dm.size() == PARAM.inp.nspin); @@ -199,6 +209,7 @@ void transfer_dm_2d_to_gint( } } } + ModuleBase::timer::tick("Gint", "transfer_dm_2d_to_gint"); } int globalIndex(int localindex, int nblk, int nprocs, int myproc) @@ -220,8 +231,8 @@ void wfc_2d_to_gint(const T* wfc_2d, T* wfc_gint, const GintInfo& gint_info) { - ModuleBase::TITLE("Module_gint", "wfc_2d_to_gint"); - ModuleBase::timer::tick("Module_gint", "wfc_2d_to_gint"); + 
ModuleBase::TITLE("Gint", "wfc_2d_to_gint"); + ModuleBase::timer::tick("Gint", "wfc_2d_to_gint"); // dimension related const int nlocal = pv.desc_wfc[2]; @@ -303,7 +314,7 @@ void wfc_2d_to_gint(const T* wfc_2d, } } #endif - ModuleBase::timer::tick("Module_gint", "wfc_2d_to_gint"); + ModuleBase::timer::tick("Gint", "wfc_2d_to_gint"); } template void transfer_hr_gint_to_hR( diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp index 092735c10d..24021a870c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp @@ -8,8 +8,11 @@ namespace ModuleGint void Gint_dvlocal::cal_dvlocal() { + ModuleBase::TITLE("Gint", "cal_gint_dvlocal"); + ModuleBase::timer::tick("Gint", "cal_gint_dvlocal"); init_hr_gint_(); cal_hr_gint_(); + ModuleBase::timer::tick("Gint", "cal_gint_dvlocal"); } void Gint_dvlocal::init_hr_gint_() @@ -62,6 +65,8 @@ void Gint_dvlocal::cal_dvlocal_R_sparseMatrix( const Grid_Driver& gdriver, LCAO_HS_Arrays& hs_arrays) { + ModuleBase::TITLE("Gint", "cal_dvlocal_R_sparseMatrix"); + ModuleBase::timer::tick("Gint", "cal_dvlocal_R_sparseMatrix"); std::map, std::map>> pvdpRx_sparseMatrix; std::map, std::map>> pvdpRy_sparseMatrix; std::map, std::map>> pvdpRz_sparseMatrix; @@ -120,6 +125,7 @@ void Gint_dvlocal::cal_dvlocal_R_sparseMatrix( distribute_pvdpR_sparseMatrix(cspin, 0, nlocal, sparse_thr, pvdpRx_sparseMatrix, pv, hs_arrays); distribute_pvdpR_sparseMatrix(cspin, 1, nlocal, sparse_thr, pvdpRy_sparseMatrix, pv, hs_arrays); distribute_pvdpR_sparseMatrix(cspin, 2, nlocal, sparse_thr, pvdpRz_sparseMatrix, pv, hs_arrays); + ModuleBase::timer::tick("Gint", "cal_dvlocal_R_sparseMatrix"); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp index 5a251b6d08..2b856df887 100644 --- 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp @@ -18,6 +18,8 @@ Gint_env_gamma::Gint_env_gamma( void Gint_env_gamma::cal_env_band(const int iband) { + ModuleBase::TITLE("Gint", "cal_gint_env"); + ModuleBase::timer::tick("Gint", "cal_gint_env"); ModuleBase::GlobalFunc::ZEROS(rho_, gint_info_->get_local_mgrid_num()); const double* wfc_gint_band = &wfc_gint_[iband * gint_info_->get_lgd()]; #pragma omp parallel @@ -38,6 +40,7 @@ void Gint_env_gamma::cal_env_band(const int iband) phi_op.cal_env_gamma(phi.data(), wfc_gint_band, gint_info_->get_trace_lo(), rho_); } } + ModuleBase::timer::tick("Gint", "cal_gint_env"); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp index 941b1af9c8..9813710e3a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp @@ -23,6 +23,8 @@ Gint_env_k::Gint_env_k( void Gint_env_k::cal_env_band(const int iband) { + ModuleBase::TITLE("Gint", "cal_gint_env"); + ModuleBase::timer::tick("Gint", "cal_gint_env"); ModuleBase::GlobalFunc::ZEROS(rho_, gint_info_->get_local_mgrid_num()); const std::complex* wfc_gint_band = &wfc_gint_[iband * gint_info_->get_lgd()]; #pragma omp parallel @@ -44,6 +46,7 @@ void Gint_env_k::cal_env_band(const int iband) npol_, gint_info_->get_lgd(), kvec_c_, kvec_d_, rho_); } } + ModuleBase::timer::tick("Gint", "cal_gint_env"); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp index 227b7906ac..3fc9bde005 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void Gint_fvl::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_fvl"); + 
ModuleBase::timer::tick("Gint", "cal_gint_fvl"); init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); } void Gint_fvl::init_dm_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp index 2cb20049cf..1d90304d2c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp @@ -9,9 +9,12 @@ namespace ModuleGint void Gint_fvl_gpu::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_fvl"); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); } void Gint_fvl_gpu::init_dm_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp index 5a718fac9b..3299600c99 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void Gint_fvl_meta::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_fvl"); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); } void Gint_fvl_meta::init_dm_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp index a0756eb90d..fa19925d04 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp @@ -9,9 +9,12 @@ namespace ModuleGint void Gint_fvl_meta_gpu::cal_gint() { + 
ModuleBase::TITLE("Gint", "cal_gint_fvl"); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); } void Gint_fvl_meta_gpu::init_dm_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index 091110c6c4..ff2a711ed0 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -27,7 +27,6 @@ void cal_gint_vl( const double* vr_eff, HContainer* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl"); #ifdef __CUDA if(PARAM.inp.device == "gpu") { @@ -39,7 +38,6 @@ void cal_gint_vl( Gint_vl gint_vl(vr_eff, hR); gint_vl.cal_gint(); } - ModuleBase::timer::tick("Gint", "cal_gint_vl"); } // nspin == 4 case @@ -47,7 +45,6 @@ void cal_gint_vl( std::vector vr_eff, HContainer>* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl"); #ifdef __CUDA if(PARAM.inp.device == "gpu") { @@ -59,7 +56,6 @@ void cal_gint_vl( Gint_vl_nspin4 gint_vl_nspin4(vr_eff, hR); gint_vl_nspin4.cal_gint(); } - ModuleBase::timer::tick("Gint", "cal_gint_vl"); } void cal_gint_vl_metagga( @@ -67,7 +63,6 @@ void cal_gint_vl_metagga( const double* vfork, HContainer* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); #ifdef __CUDA if(PARAM.inp.device == "gpu") { @@ -79,7 +74,6 @@ void cal_gint_vl_metagga( Gint_vl_metagga gint_vl_metagga(vr_eff, vfork, hR); gint_vl_metagga.cal_gint(); } - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); } // nspin == 4 case @@ -88,7 +82,6 @@ void cal_gint_vl_metagga( std::vector vofk, HContainer>* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); #ifdef __CUDA if(PARAM.inp.device == "gpu") { @@ -100,7 +93,6 @@ void cal_gint_vl_metagga( Gint_vl_metagga_nspin4 gint_vl_metagga_nspin4(vr_eff, vofk, hR); 
gint_vl_metagga_nspin4.cal_gint(); } - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); } void cal_gint_rho( @@ -108,7 +100,6 @@ void cal_gint_rho( const int nspin, double **rho) { - ModuleBase::timer::tick("Gint", "cal_gint_rho"); #ifdef __CUDA if(PARAM.inp.device == "gpu") { @@ -120,7 +111,6 @@ void cal_gint_rho( Gint_rho gint_rho(dm_vec, nspin, rho); gint_rho.cal_gint(); } - ModuleBase::timer::tick("Gint", "cal_gint_rho"); } void cal_gint_tau( @@ -128,7 +118,6 @@ void cal_gint_tau( const int nspin, double** tau) { - ModuleBase::timer::tick("Gint", "cal_gint_tau"); #ifdef __CUDA if(PARAM.inp.device == "gpu") { @@ -140,7 +129,6 @@ void cal_gint_tau( Gint_tau gint_tau(dm_vec, nspin, tau); gint_tau.cal_gint(); } - ModuleBase::timer::tick("Gint", "cal_gint_tau"); } void cal_gint_fvl( @@ -152,7 +140,6 @@ void cal_gint_fvl( ModuleBase::matrix* fvl, ModuleBase::matrix* svl) { - ModuleBase::timer::tick("Gint", "cal_gint_fvl"); #ifdef __CUDA if(PARAM.inp.device == "gpu") { @@ -164,7 +151,6 @@ void cal_gint_fvl( Gint_fvl gint_fvl(nspin, vr_eff, dm_vec, isforce, isstress, fvl, svl); gint_fvl.cal_gint(); } - ModuleBase::timer::tick("Gint", "cal_gint_fvl"); } void cal_gint_fvl_meta( @@ -177,7 +163,6 @@ void cal_gint_fvl_meta( ModuleBase::matrix* fvl, ModuleBase::matrix* svl) { - ModuleBase::timer::tick("Gint", "cal_gint_fvl_meta"); #ifdef __CUDA if(PARAM.inp.device == "gpu") { @@ -189,7 +174,6 @@ void cal_gint_fvl_meta( Gint_fvl_meta gint_fvl_meta(nspin, vr_eff, vofk, dm_vec, isforce, isstress, fvl, svl); gint_fvl_meta.cal_gint(); } - ModuleBase::timer::tick("Gint", "cal_gint_fvl_meta"); } void cal_dvlocal_R_sparseMatrix( diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp index c811908a3d..aed99af47c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void 
Gint_rho::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_rho"); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_rho_(); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); } void Gint_rho::init_dm_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp index ad6f08d195..24490b8736 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp @@ -9,9 +9,12 @@ namespace ModuleGint void Gint_rho_gpu::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_rho"); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_rho_(); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); } void Gint_rho_gpu::init_dm_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp index 4edfe459ab..1b5e282384 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void Gint_tau::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_tau"); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_tau_(); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); } void Gint_tau::init_dm_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp index 6d02e0cce4..cbeeead322 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp @@ -9,9 +9,12 @@ namespace ModuleGint void Gint_tau_gpu::cal_gint() { + 
ModuleBase::TITLE("Gint", "cal_gint_tau"); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); init_dm_gint_(); transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_tau_(); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); } void Gint_tau_gpu::init_dm_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp index 983d647574..ee40327d72 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp @@ -9,10 +9,13 @@ namespace ModuleGint void Gint_vl::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_); transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } //======================== diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp index 5a38e93036..fe9162bc4e 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp @@ -9,10 +9,13 @@ namespace ModuleGint void Gint_vl_gpu::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_); transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } void Gint_vl_gpu::init_hr_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp index aa1abee188..2c885adca2 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp @@ -9,10 +9,13 @@ namespace ModuleGint void Gint_vl_metagga::cal_gint() { + 
ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_); transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } //======================== diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp index bd54d42c6a..9c2dad8421 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp @@ -9,10 +9,13 @@ namespace ModuleGint void Gint_vl_metagga_gpu::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_); transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } //======================== diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp index 0e81bb35a0..5c6086031c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp @@ -11,10 +11,13 @@ namespace ModuleGint void Gint_vl_metagga_nspin4::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } void Gint_vl_metagga_nspin4::init_hr_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp index de6a85d9da..9adc4cb137 100644 --- 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp @@ -9,10 +9,13 @@ namespace ModuleGint void Gint_vl_metagga_nspin4_gpu::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } void Gint_vl_metagga_nspin4_gpu::init_hr_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp index 3cbab74273..27db0a7db3 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp @@ -10,10 +10,13 @@ namespace ModuleGint { void Gint_vl_nspin4::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } void Gint_vl_nspin4::init_hr_gint_() diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp index 178892c2b8..c070258db5 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp @@ -9,10 +9,13 @@ namespace ModuleGint void Gint_vl_nspin4_gpu::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } void 
Gint_vl_nspin4_gpu::init_hr_gint_() From ee8a933dce80181aa4b208e2a34cb7294faf7c84 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 12:01:17 +0800 Subject: [PATCH 43/63] Revert "change gemm function" This reverts commit ab61eb44b119b25d4a11f3370190ea6469be3e4d. --- .../module_gint/kernels/cuda/code_gen.cuh | 466 +++++++++--------- .../module_gint/kernels/cuda/code_gen_00.cu | 46 +- .../module_gint/kernels/cuda/code_gen_01.cu | 46 +- .../module_gint/kernels/cuda/code_gen_02.cu | 46 +- .../module_gint/kernels/cuda/code_gen_03.cu | 46 +- .../module_gint/kernels/cuda/code_gen_04.cu | 46 +- .../module_gint/kernels/cuda/code_gen_05.cu | 46 +- .../module_gint/kernels/cuda/code_gen_06.cu | 46 +- .../module_gint/kernels/cuda/code_gen_07.cu | 46 +- .../module_gint/kernels/cuda/code_gen_08.cu | 46 +- .../module_gint/kernels/cuda/code_gen_09.cu | 52 +- .../module_gint/kernels/cuda/gemm_selector.cu | 12 +- .../kernels/cuda/vbatch_matrix_mul.cuh | 98 ++-- 13 files changed, 521 insertions(+), 521 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cuh index b6e01e62d6..a4b1a75916 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cuh @@ -4,470 +4,470 @@ #include "gemm_selector.cuh" #include -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int, int, int*, int*, int*, double**, int*, double**, int*, double**, int*, int, cudaStream_t, float&, matrix_multiple_func_type&, double*, double*, double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const 
int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template 
void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -extern template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +extern template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); #endif \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_00.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_00.cu index 194e7bb863..a07c411485 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_00.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_00.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int, int, int*, int*, int*, double**, int*, double**, int*, 
double**, int*, int, cudaStream_t, float&, matrix_multiple_func_type&, double*, double*, double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_01.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_01.cu index 9d5b2dc664..9f725c23c6 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_01.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_01.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_02.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_02.cu index 7d0996d49a..090eab0709 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_02.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_02.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_03.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_03.cu index ab271c8783..046d0e5063 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_03.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_03.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_04.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_04.cu index 22d2fa85a2..f74209d829 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_04.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_04.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_05.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_05.cu index 33ad2ac892..c9cb81bd7c 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_05.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_05.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_06.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_06.cu index 91933f8bf6..f5fac39df2 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_06.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_06.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_07.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_07.cu index 08a47c3a9b..971c6eb0c0 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_07.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_07.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_08.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_08.cu index 89834ab470..8643faae70 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_08.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_08.cu @@ -1,48 +1,48 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_09.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_09.cu index 277e163712..8cf333bf6f 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_09.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen_09.cu @@ -1,53 +1,53 @@ #include "vbatch_matrix_mul.cuh" -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void 
gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const 
int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const 
int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const 
int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); +template void 
gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); -template void gemm_time_measure(int,int,const int*,const int*,const int*,const double* const*,const int*,const double* const*,const int*,double**,const int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); \ No newline at end of file +template void gemm_time_measure(int,int,int*,int*,int*,double**,int*,double**,int*,double**,int*,int,cudaStream_t,float&,matrix_multiple_func_type&,double*,double*,double*); \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cu index 48b0a9bf7f..b8dda451f4 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cu @@ -59,8 +59,8 @@ void gemm_algo_selector(int matrix_k, matrix_multiple_func_type& fastest_algo,co n.get_host_pointer()[index] = ucell.atoms[l].nw; k.get_host_pointer()[index] = matrix_k; - lda.get_host_pointer()[index] = ucell.atoms[j].nw; - ldb.get_host_pointer()[index] = ucell.atoms[l].nw; + lda.get_host_pointer()[index] = matrix_k; + ldb.get_host_pointer()[index] = matrix_k; ldc.get_host_pointer()[index] = ucell.atoms[l].nw; A_array.get_host_pointer()[index] @@ -71,19 +71,19 @@ void gemm_algo_selector(int matrix_k, matrix_multiple_func_type& fastest_algo,co = &C.get_device_pointer()[index * max_n * max_m]; // test atom add BlasConnector::gemm( - 'T', 'N', + 'T', m.get_host_pointer()[index], n.get_host_pointer()[index], matrix_k, 1.0, &A.get_host_pointer()[index * max_m * matrix_k], - lda.get_host_pointer()[index], + matrix_k, &B.get_host_pointer()[index * max_n * matrix_k], - ldb.get_host_pointer()[index], + matrix_k, 1.0, &cpu_result[index * max_m * max_n], - ldc.get_host_pointer()[index]); + 
n.get_host_pointer()[index]); index++; } } diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh index fbe12b318e..b450d06f7b 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh @@ -31,15 +31,15 @@ template shmem copy - T ra[BLK_K / DIM_XA][BLK_M / DIM_YA]; - T rb[BLK_K / DIM_XB][BLK_N / DIM_YB]; + T ra[BLK_M / DIM_YA][BLK_K / DIM_XA]; + T rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; // bound is the correction to offs_d in order to not get out of memory bound // so bound could be negative value since offs_d could be out of bound @@ -89,22 +89,22 @@ static __device__ void vbatched_gemm_device(int M, // Load A dev->shmem #pragma unroll - for (n = 0; n < BLK_K; n += DIM_XA) + for (n = 0; n < BLK_M; n += DIM_YA) { #pragma unroll - for (m = 0; m < BLK_M; m += DIM_YA) + for (m = 0; m < BLK_K; m += DIM_XA) { - sA(m + idyA, n + idxA) = fetch(A, m, n, boundA); + sA(n + idyA, m + idxA) = fetch(A, m, n, boundA); } } #pragma unroll - for (n = 0; n < BLK_K; n += DIM_XB) + for (n = 0; n < BLK_N; n += DIM_YB) { #pragma unroll - for (m = 0; m < BLK_N; m += DIM_YB) + for (m = 0; m < BLK_K; m += DIM_XB) { - sB(n + idxB, m + idyB) = fetch(B, m, n, boundB); + sB(m + idxB, n + idyB) = fetch(B, m, n, boundB); } } @@ -112,31 +112,31 @@ static __device__ void vbatched_gemm_device(int M, for (kk = 0; kk < K - BLK_K; kk += BLK_K) { - offs_dA += BLK_K * LDA; - boundA -= BLK_K * LDA; + offs_dA += BLK_K; + boundA -= BLK_K; - offs_dB += BLK_K * LDB; - boundB -= BLK_K * LDB; + offs_dB += BLK_K; + boundB -= BLK_K; // Load A dev->regs #pragma unroll - for (n = 0; n < BLK_K / DIM_XA; n++) + for (n = 0; n < BLK_M / DIM_YA; n++) { #pragma unroll - for (m = 0; m < BLK_M / DIM_YA; m++) + for (m = 0; m < BLK_K / DIM_XA; m++) { - ra[n][m] = fetch(A, m * DIM_YA, n * DIM_XA, boundA); + ra[n][m] = 
fetch(A, m * DIM_XA, n * DIM_YA, boundA); } } // Load B dev->regs #pragma unroll - for (n = 0; n < BLK_K / DIM_XB; n++) + for (n = 0; n < BLK_N / DIM_YB; n++) { #pragma unroll - for (m = 0; m < BLK_N / DIM_YB; m++) + for (m = 0; m < BLK_K / DIM_XB; m++) { - rb[n][m] = fetch(B, m * DIM_YB, n * DIM_XB, boundB); + rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB); } } @@ -174,23 +174,23 @@ static __device__ void vbatched_gemm_device(int M, // Load A regs->shmem #pragma unroll - for (n = 0; n < BLK_K / DIM_XA; n++) + for (n = 0; n < BLK_M / DIM_YA; n++) { #pragma unroll - for (m = 0; m < BLK_M / DIM_YA; m++) + for (m = 0; m < BLK_K / DIM_XA; m++) { - sA(m * DIM_YA + idyA, n * DIM_XA + idxA) = ra[n][m]; + sA(n * DIM_YA + idyA, m * DIM_XA + idxA) = ra[n][m]; } } // Load B regs->shmem #pragma unroll - for (n = 0; n < BLK_K / DIM_XB; n++) + for (n = 0; n < BLK_N / DIM_YB; n++) { #pragma unroll - for (m = 0; m < BLK_N / DIM_YB; m++) + for (m = 0; m < BLK_K / DIM_XB; m++) { - sB(n * DIM_XB + idxB, m * DIM_YB + idyB) = rb[n][m]; + sB(m * DIM_XB + idxB, n * DIM_YB + idyB) = rb[n][m]; } } __syncthreads(); @@ -260,16 +260,16 @@ template -static __global__ void vbatched_gemm_kernel(const int* M, - const int* N, - const int* K, - const T* const* global_A_array, - const int* global_lda, - const T* const* global_B_array, - const int* global_ldb, +static __global__ void vbatched_gemm_kernel(int* M, + int* N, + int* K, + T** global_A_array, + int* global_lda, + T** global_B_array, + int* global_ldb, T** global_C_array, - const int* global_ldc, - const T* alpha) + int* global_ldc, + T* alpha) { extern __shared__ __align__(sizeof(T)) unsigned char smem[]; T* shared_mem = reinterpret_cast(smem); @@ -376,7 +376,7 @@ static __global__ void vbatched_gemm_kernel(const int* M, * especially the fact that we can relatively easily control the arrangement of * the matrix elements, we have only implemented one type of requirement for * matrix transposition. 
That is, we have implemented the operation C = alpha * - * A * trans(B) + C under the constraint of column-major order. + * trans(A) * B + C under the constraint of column-major order. * * Finally, we would like to thank Magma for its contributions to the field of * scientific computing. @@ -410,8 +410,8 @@ void vbatched_gemm_impl(int max_m, // The positions of A and B have been swapped here. // This is because the original code is for column-major matrices. // We use row-major matrices, so we need to swap A and B. - // The vbatched_gemm_impl is for C = A * trans(B) + C, but we need trans(C). - // Which means: trans(C) = trans(A * trans(B) + C) = B * trans(A) + trans(C) + // The vbatched_gemm_impl is for C = trans(A) * B + C, but we need trans(C). + // Which means: trans(C) = trans(trans(A)*B + C) = trans(B) * A + trans(C) // Then, ldc should be N, lda and ldb should be K size_t shared_mem_size = 0; @@ -447,7 +447,7 @@ void vbatched_gemm_impl(int max_m, if (remain_num > 0) { dim3 dimGrid(ceildiv(max_n, BLK_M), ceildiv(max_m, BLK_N), remain_num); - const T* alpha_tmp = nullptr; + T* alpha_tmp = nullptr; if (alpha != nullptr) { alpha_tmp = alpha + loop_num * max_batch_count; @@ -479,15 +479,15 @@ template void gemm_time_measure(int max_m, int max_n, - const int* m, - const int* n, - const int* k, - const T* const* global_A_array, - const int* global_lda, - const T* const* global_B_array, - const int* global_ldb, + int* m, + int* n, + int* k, + T** global_A_array, + int* global_lda, + T** global_B_array, + int* global_ldb, T** global_C_array, - const int* global_ldc, + int* global_ldc, int batchCount, cudaStream_t stream, float& fast_time, From cabc7a0848426599d8f605dc273bebc50e11820b Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 12:07:02 +0800 Subject: [PATCH 44/63] fix a bug --- source/module_lr/lr_spectrum.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/source/module_lr/lr_spectrum.cpp b/source/module_lr/lr_spectrum.cpp index 
94b9474f73..95e380b6ed 100644 --- a/source/module_lr/lr_spectrum.cpp +++ b/source/module_lr/lr_spectrum.cpp @@ -35,6 +35,16 @@ elecstate::DensityMatrix LR::LR_Spectrum::cal_transition_density_matrix return DM_trans; } +#ifndef __NEW_GINT +template +void LR::LR_Spectrum::cal_gint_rho(double** rho, const int& nrxx) +{ + ModuleBase::GlobalFunc::ZEROS(rho[0], nrxx); + Gint_inout inout_rho(rho, Gint_Tools::job_type::rho, 1, false); + this->gint->cal_gint(&inout_rho); +} +#endif + inline void check_sum_rule(const double& osc_tot) { if (std::abs(osc_tot - 1.0) > 1e-3) { From 28b1dad74674e5b741105239b6b1a9f1fe4cf36f Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 12:27:36 +0800 Subject: [PATCH 45/63] fix bugs --- source/module_hamilt_lcao/module_gint/gint_k.h | 2 +- source/module_hamilt_lcao/module_gint/grid_technique.cpp | 2 +- source/source_esolver/lcao_before_scf.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/gint_k.h b/source/module_hamilt_lcao/module_gint/gint_k.h index aafa2ef5ba..4aeebefc27 100644 --- a/source/module_hamilt_lcao/module_gint/gint_k.h +++ b/source/module_hamilt_lcao/module_gint/gint_k.h @@ -67,7 +67,7 @@ class Gint_k : public Gint { Abfs::Vector3_Order, std::map>>>& pvdpR_soc_sparseMatrix, - LCAO_HS_Arrays& HS_arrays, + LCAO_HS_Arrays& HS_Arrays, const Parallel_Orbitals* pv); void cal_dvlocal_R_sparseMatrix(const int& current_spin, diff --git a/source/module_hamilt_lcao/module_gint/grid_technique.cpp b/source/module_hamilt_lcao/module_gint/grid_technique.cpp index 50dd92da6c..b343d5ea2c 100644 --- a/source/module_hamilt_lcao/module_gint/grid_technique.cpp +++ b/source/module_hamilt_lcao/module_gint/grid_technique.cpp @@ -122,7 +122,7 @@ void Grid_Technique::set_pbc_grid(const int& ncx_in, this->cal_trace_lo(ucell); #if ((defined __CUDA) /* || (defined __ROCM) */) if (PARAM.inp.device == "gpu") { - // this->init_gpu_gint_variables(ucell, num_stream); + 
this->init_gpu_gint_variables(ucell, num_stream); } #endif diff --git a/source/source_esolver/lcao_before_scf.cpp b/source/source_esolver/lcao_before_scf.cpp index 8e4884066c..69141a4690 100644 --- a/source/source_esolver/lcao_before_scf.cpp +++ b/source/source_esolver/lcao_before_scf.cpp @@ -60,6 +60,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) PARAM.inp.test_atom_input); //! 4) initialize NAO basis set +#ifndef __NEW_GINT double dr_uniform = 0.001; std::vector rcuts; std::vector> psi_u; @@ -69,7 +70,6 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) Gint_Tools::init_orb(dr_uniform, rcuts, ucell, orb_, psi_u, dpsi_u, d2psi_u); //! 5) set periodic boundary conditions -#ifndef __NEW_GINT this->GridT.set_pbc_grid(this->pw_rho->nx, this->pw_rho->ny, this->pw_rho->nz, From 4fecbe918748fc3a026c8b4d6e81ea7ba79b470c Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 12:31:48 +0800 Subject: [PATCH 46/63] delete unused declaration --- source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu index 6666c90f4b..c9bf122628 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu @@ -285,7 +285,6 @@ inline T* Cuda_Mem_Wrapper::get_host_pointer(const int stream_id) return this->host_pointer + stream_id * this->one_stream_size_aligned; } template class Cuda_Mem_Wrapper; -template class Cuda_Mem_Wrapper; template class Cuda_Mem_Wrapper; template class Cuda_Mem_Wrapper; template class Cuda_Mem_Wrapper; From e25e601c46178db5fe241470d220f306756231b9 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 16 Jun 2025 15:26:12 +0800 Subject: [PATCH 47/63] make new gint module the default compilation option --- CMakeLists.txt | 4 ++-- 
source/module_elecstate/elecstate_lcao.cpp | 8 +++---- .../elecstate_lcao_cal_tau.cpp | 4 ++-- .../operator_lcao/veff_lcao.cpp | 6 ++--- .../hamilt_lcaodft/operator_lcao/veff_lcao.h | 4 ++-- .../pulay_force_stress_gint.hpp | 2 +- .../hamilt_lcaodft/spar_dh.cpp | 2 +- .../module_gint/CMakeLists.txt | 2 +- source/module_io/cal_ldos.cpp | 2 +- source/module_io/get_pchg_lcao.cpp | 6 ++--- source/module_io/get_wf_lcao.cpp | 24 +++++++++---------- source/module_lr/esolver_lrtd_lcao.cpp | 2 +- source/module_lr/esolver_lrtd_lcao.h | 2 +- source/module_lr/lr_spectrum.cpp | 8 +++---- .../operator_casida/operator_lr_hxc.cpp | 8 +++---- source/module_rdmft/rdmft_tools.cpp | 16 ++++++------- source/module_rdmft/rdmft_tools.h | 4 ++-- source/module_rdmft/update_state_rdmft.cpp | 4 ++-- source/source_esolver/esolver_ks_lcao.cpp | 2 +- source/source_esolver/lcao_before_scf.cpp | 2 +- source/source_esolver/lcao_others.cpp | 2 +- 21 files changed, 57 insertions(+), 57 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d4e265adbe..f529aa075a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,8 +252,8 @@ if(ENABLE_LCAO) add_compile_definitions(__PEXSI) set(CMAKE_CXX_STANDARD 14) endif() - if(NEW_GINT) - add_compile_definitions(__NEW_GINT) + if(OLD_GINT) + add_compile_definitions(__OLD_GINT) endif() else() set(ENABLE_MLALGO OFF) diff --git a/source/module_elecstate/elecstate_lcao.cpp b/source/module_elecstate/elecstate_lcao.cpp index 8836775fac..b45606e503 100644 --- a/source/module_elecstate/elecstate_lcao.cpp +++ b/source/module_elecstate/elecstate_lcao.cpp @@ -34,7 +34,7 @@ void ElecStateLCAO>::psiToRho(const psi::Psigint_k->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer DM2D to DM_grid in gint Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); this->gint_k->cal_gint(&inout); @@ -71,7 +71,7 @@ void ElecStateLCAO::psiToRho(const psi::Psi& psi) //------------------------------------------------------------ 
ModuleBase::GlobalFunc::NOTE("Calculate the charge on real space grid!"); -#ifndef __NEW_GINT +#ifdef __OLD_GINT this->gint_gamma->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer DM2D to DM_grid in gint Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); this->gint_gamma->cal_gint(&inout); @@ -139,7 +139,7 @@ void ElecStateLCAO::dmToRho(std::vector pexsi_DM, std::vectorgint_gamma->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer DM2D to DM_grid in gint Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); this->gint_gamma->cal_gint(&inout); @@ -152,7 +152,7 @@ void ElecStateLCAO::dmToRho(std::vector pexsi_DM, std::vectorcharge->kin_r[0], this->charge->nrxx); } -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau); this->gint_gamma->cal_gint(&inout1); #else diff --git a/source/module_elecstate/elecstate_lcao_cal_tau.cpp b/source/module_elecstate/elecstate_lcao_cal_tau.cpp index d07aeba678..2b611f4c17 100644 --- a/source/module_elecstate/elecstate_lcao_cal_tau.cpp +++ b/source/module_elecstate/elecstate_lcao_cal_tau.cpp @@ -16,7 +16,7 @@ void ElecStateLCAO>::cal_tau(const psi::Psicharge->kin_r[is], this->charge->nrxx); } -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau, PARAM.inp.nspin); this->gint_k->cal_gint(&inout1); #else @@ -36,7 +36,7 @@ void ElecStateLCAO::cal_tau(const psi::Psi& psi) { ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[is], this->charge->nrxx); } -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau, PARAM.inp.nspin); this->gint_gamma->cal_gint(&inout1); #else diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp index 8c5850fdb1..f1df7f441e 100644 --- 
a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp @@ -68,7 +68,7 @@ void Veff>::contributeHR() double* vr_eff1 = this->pot->get_effective_v(this->current_spin); double* vofk_eff1 = this->pot->get_effective_vofk(this->current_spin); -#ifndef __NEW_GINT +#ifdef __OLD_GINT if(XC_Functional::get_ked_flag()) { Gint_inout inout(vr_eff1, vofk_eff1, Gint_Tools::job_type::vlocal_meta); @@ -113,7 +113,7 @@ void Veff, double>>::contributeHR() double* vr_eff1 = this->pot->get_effective_v(this->current_spin); double* vofk_eff1 = this->pot->get_effective_vofk(this->current_spin); -#ifndef __NEW_GINT +#ifdef __OLD_GINT // if you change the place of the following code, // rememeber to delete the #include if(XC_Functional::get_ked_flag()) @@ -155,7 +155,7 @@ void Veff, std::complex>>::contributeH ModuleBase::TITLE("Veff", "contributeHR"); ModuleBase::timer::tick("Veff", "contributeHR"); -#ifndef __NEW_GINT +#ifdef __OLD_GINT double* vr_eff1 = nullptr; double* vofk_eff1 = nullptr; for (int is = 0; is < 4; is++) diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h index c74f65f67c..817ae11a20 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h +++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h @@ -50,7 +50,7 @@ class Veff> : public OperatorLCAO this->cal_type = calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); -#ifndef __NEW_GINT +#ifdef __OLD_GINT GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin); #endif } @@ -71,7 +71,7 @@ class Veff> : public OperatorLCAO { this->cal_type = calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); -#ifndef __NEW_GINT +#ifdef __OLD_GINT GG_in->initialize_pvpR(*ucell_in, GridD_in, nspin); #endif } diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp 
b/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp index c1e00aa3c1..0183ebc14b 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp @@ -20,7 +20,7 @@ namespace PulayForceStress { const int nspin = PARAM.inp.nspin; -#ifndef __NEW_GINT +#ifdef __OLD_GINT if (set_dmr_gint) { gint.transfer_DM2DtoGrid(dm.get_DMR_vector()); } // 2d block to grid for (int is = 0; is < nspin; ++is) { diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp index dba1f28393..381c61ec87 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp @@ -109,7 +109,7 @@ void sparse_format::cal_dH(const UnitCell& ucell, if(PARAM.inp.nspin==2) { -#ifndef __NEW_GINT +#ifdef __OLD_GINT gint_k.allocate_pvdpR(); // note: some MPI process will not have grids when MPI cores are too // many, v_eff in these processes are empty diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 65afa13e5e..69db75605c 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -29,7 +29,7 @@ list(APPEND objects init_orb.cpp ) -if(NEW_GINT) +if(NOT DEFINED OLD_GINT) list(APPEND objects temp_gint/biggrid_info.cpp temp_gint/big_grid.cpp diff --git a/source/module_io/cal_ldos.cpp b/source/module_io/cal_ldos.cpp index 49b58f120a..0597461fe2 100644 --- a/source/module_io/cal_ldos.cpp +++ b/source/module_io/cal_ldos.cpp @@ -60,7 +60,7 @@ void Cal_ldos::cal_ldos_lcao(const elecstate::ElecStateLCAO* pelec, } // calculate ldos -#ifndef __NEW_GINT +#ifdef __OLD_GINT ModuleBase::WARNING_QUIT("Cal_ldos::dm2ldos", "do not support old grid integral, please recompile with __NEW_GINT"); #else diff --git a/source/module_io/get_pchg_lcao.cpp 
b/source/module_io/get_pchg_lcao.cpp index 25512cac1b..4d7dfa3027 100644 --- a/source/module_io/get_pchg_lcao.cpp +++ b/source/module_io/get_pchg_lcao.cpp @@ -106,7 +106,7 @@ void IState_Charge::begin(Gint_Gamma& gg, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(); -#ifndef __NEW_GINT +#ifdef __OLD_GINT gg.initialize_pvpR(*ucell_in, GridD_in, PARAM.inp.nspin); gg.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); @@ -238,7 +238,7 @@ void IState_Charge::begin(Gint_k& gk, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(ik); -#ifndef __NEW_GINT +#ifdef __OLD_GINT gk.initialize_pvpR(*ucell_in, GridD_in, PARAM.inp.nspin); gk.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); @@ -289,7 +289,7 @@ void IState_Charge::begin(Gint_k& gk, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(); -#ifndef __NEW_GINT +#ifdef __OLD_GINT gk.initialize_pvpR(*ucell_in, GridD_in, PARAM.inp.nspin); gk.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); diff --git a/source/module_io/get_wf_lcao.cpp b/source/module_io/get_wf_lcao.cpp index 5f8307c269..1af323e6e5 100644 --- a/source/module_io/get_wf_lcao.cpp +++ b/source/module_io/get_wf_lcao.cpp @@ -10,7 +10,7 @@ #include "module_io/write_wfc_r.h" #include "module_parameter/parameter.h" -#ifdef __NEW_GINT +#ifndef __OLD_GINT #include "module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h" #include "module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h" #endif @@ -49,7 +49,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, int fermi_band = 0; prepare_get_wf(GlobalV::ofs_running, nelec, fermi_band); -#ifndef __NEW_GINT +#ifdef __OLD_GINT // allocate grid wave functions for gamma_only std::vector wfc_gamma_grid(nspin); for (int is = 0; is < nspin; ++is) @@ -68,7 +68,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // if (out_wfc_pw || out_wfc_r) psi_g.resize(nspin, nbands, 
kv.ngk[0]); -#ifndef __NEW_GINT +#ifdef __OLD_GINT const double mem_size = sizeof(double) * double(gg.gridt->lgd) * double(nbands) * double(nspin) / 1024.0 / 1024.0; ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size); ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "On-the-fly memory consumption (MB)", mem_size); @@ -95,7 +95,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, for (int is = 0; is < nspin; ++is) { psid->fix_k(is); -#ifndef __NEW_GINT +#ifdef __OLD_GINT #ifdef __MPI wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); #else @@ -118,7 +118,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, { if (bands_picked_[ib]) { - #ifndef __NEW_GINT + #ifdef __OLD_GINT ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); #else @@ -173,7 +173,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, for (int is = 0; is < nspin; ++is) { psid->fix_k(is); -#ifndef __NEW_GINT +#ifdef __OLD_GINT #ifdef __MPI wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); #else @@ -195,7 +195,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, { if (bands_picked_[ib]) { -#ifndef __NEW_GINT +#ifdef __OLD_GINT ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); #else @@ -241,7 +241,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL, out_wfc_pw, PARAM.inp.ecutwfc, global_out_dir,psi_g, kv, pw_wfc, GlobalV::ofs_running); -#ifndef __NEW_GINT +#ifdef __OLD_GINT for (int is = 0; is < nspin; ++is) { for (int ib = 0; ib < nbands; ++ib) @@ -282,7 +282,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // allocate grid wave functions for multi-k const int nks = kv.get_nks(); std::vector**> wfc_k_grid(nks); -#ifndef __NEW_GINT +#ifdef __OLD_GINT for (int ik = 0; ik < nks; ++ik) { wfc_k_grid[ik] = new 
std::complex*[nbands]; @@ -327,7 +327,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // 2d-to-grid conversion is unified into `wfc_2d_to_grid`. psi->fix_k(ik); -#ifndef __NEW_GINT +#ifdef __OLD_GINT #ifdef __MPI // need to deal with NSPIN=4 !!!! wfc_2d_to_grid(psi->get_pointer(), para_orb, wfc_k_grid[ik], gk.gridt->trace_lo); #else @@ -348,7 +348,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, { if (bands_picked_[ib]) { -#ifndef __NEW_GINT +#ifdef __OLD_GINT ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[ispin], pw_wfc->nrxx); // terrible, you make changes on another instance's data??? @@ -458,7 +458,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } } } -#ifndef __NEW_GINT +#ifdef __OLD_GINT for (int ik = 0; ik < nks; ++ik) { for (int ib = 0; ib < nbands; ++ib) diff --git a/source/module_lr/esolver_lrtd_lcao.cpp b/source/module_lr/esolver_lrtd_lcao.cpp index 75ccc669d0..33d3ef46ad 100644 --- a/source/module_lr/esolver_lrtd_lcao.cpp +++ b/source/module_lr/esolver_lrtd_lcao.cpp @@ -397,7 +397,7 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu this->gint_->gridt = &this->gt_; // (3) Periodic condition search for each grid. 
-#ifndef __NEW_GINT +#ifdef __OLD_GINT double dr_uniform = 0.001; std::vector rcuts; std::vector> psi_u; diff --git a/source/module_lr/esolver_lrtd_lcao.h b/source/module_lr/esolver_lrtd_lcao.h index 9f6375aaac..0c4b6dea42 100644 --- a/source/module_lr/esolver_lrtd_lcao.h +++ b/source/module_lr/esolver_lrtd_lcao.h @@ -35,7 +35,7 @@ namespace LR ESolver_LR(const Input_para& inp, UnitCell& ucell); ~ESolver_LR() { delete this->psi_ks; -#ifdef __NEW_GINT +#ifndef __OLD_GINT ModuleGint::Gint::set_gint_info(nullptr); #endif } diff --git a/source/module_lr/lr_spectrum.cpp b/source/module_lr/lr_spectrum.cpp index 95e380b6ed..100f5bdece 100644 --- a/source/module_lr/lr_spectrum.cpp +++ b/source/module_lr/lr_spectrum.cpp @@ -35,7 +35,7 @@ elecstate::DensityMatrix LR::LR_Spectrum::cal_transition_density_matrix return DM_trans; } -#ifndef __NEW_GINT +#ifdef __OLD_GINT template void LR::LR_Spectrum::cal_gint_rho(double** rho, const int& nrxx) { @@ -65,7 +65,7 @@ ModuleBase::Vector3 LR::LR_Spectrum::cal_transition_dipole_istat // 2. 
transition density double** rho_trans; LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, this->rho_basis.nrxx); -#ifndef __NEW_GINT +#ifdef __OLD_GINT this->gint->transfer_DM2DtoGrid({ DM_trans.get_DMR_vector().at(is) }); this->cal_gint_rho(rho_trans, this->rho_basis.nrxx); #else @@ -115,7 +115,7 @@ ModuleBase::Vector3> LR::LR_Spectrum>: // real part LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'R'); -#ifndef __NEW_GINT +#ifdef __OLD_GINT this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); this->cal_gint_rho(rho_trans_real, this->rho_basis.nrxx); #else @@ -126,7 +126,7 @@ ModuleBase::Vector3> LR::LR_Spectrum>: // imag part LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'I'); -#ifndef __NEW_GINT +#ifdef __OLD_GINT this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); this->cal_gint_rho(rho_trans_imag, this->rho_basis.nrxx); #else diff --git a/source/module_lr/operator_casida/operator_lr_hxc.cpp b/source/module_lr/operator_casida/operator_lr_hxc.cpp index 71dd803210..d5b75a48d8 100644 --- a/source/module_lr/operator_casida/operator_lr_hxc.cpp +++ b/source/module_lr/operator_casida/operator_lr_hxc.cpp @@ -63,7 +63,7 @@ namespace LR const int& nrxx = this->pot.lock()->nrxx; LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // currently gint_kernel_rho uses PARAM.inp.nspin, it needs refactor ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx); -#ifndef __NEW_GINT +#ifdef __OLD_GINT this->gint->transfer_DM2DtoGrid(this->DM_trans->get_DMR_vector()); // 2d block to grid Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); this->gint->cal_gint(&inout_rho); @@ -78,7 +78,7 @@ namespace LR // 4. 
V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r) this->hR->set_zero(); // clear hR for each bands -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal); this->gint->get_hRGint()->set_zero(); this->gint->cal_gint(&inout_vlocal); @@ -112,7 +112,7 @@ namespace LR LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // nspin=1 for transition density ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx); -#ifndef __NEW_GINT +#ifdef __OLD_GINT this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); // LR_Util::print_HR(*this->gint->get_DMRGint()[0], this->ucell.nat, "DMR(grid, real)"); Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); @@ -131,7 +131,7 @@ namespace LR // 4. V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r) HR_real_imag.set_zero(); -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal); this->gint->get_hRGint()->set_zero(); this->gint->cal_gint(&inout_vlocal); diff --git a/source/module_rdmft/rdmft_tools.cpp b/source/module_rdmft/rdmft_tools.cpp index 460d4668bc..abbcbed5f4 100644 --- a/source/module_rdmft/rdmft_tools.cpp +++ b/source/module_rdmft/rdmft_tools.cpp @@ -263,7 +263,7 @@ void Veff_rdmft, double>::contributeHR() vr_eff_rdmft = &v_matrix_hartree(is, 0); // do grid integral calculation to get HR -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); #else @@ -282,7 +282,7 @@ void Veff_rdmft, double>::contributeHR() vr_eff_rdmft = &v_matrix_local(0, 0); // do grid integral calculation to get HR -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); #else @@ -306,7 +306,7 @@ void Veff_rdmft, double>::contributeHR() vr_eff_rdmft = &v_matrix_XC(is, 0); // do grid integral calculation to get HR -#ifndef __NEW_GINT +#ifdef __OLD_GINT 
Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); #else @@ -321,7 +321,7 @@ void Veff_rdmft, double>::contributeHR() // get HR for 2D-block parallel format // this->GK->transfer_pvpR(this->hR); -#ifndef __NEW_GINT +#ifdef __OLD_GINT this->GK->transfer_pvpR(this->hR,this->ucell,this->gd); #endif @@ -365,7 +365,7 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_hartree(is, 0); // do grid integral calculation to get HR -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GG->cal_gint(&inout); #else @@ -384,7 +384,7 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_local(0, 0); // do grid integral calculation to get HR -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal); // because in gamma_only, cal_gint would not set hRGint zero first @@ -411,7 +411,7 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_XC(is, 0); // do grid integral calculation to get HR -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GG->cal_gint(&inout); #else @@ -424,7 +424,7 @@ void Veff_rdmft::contributeHR() std::cout << "\n\n!!!!!!\n there may be something wrong when use class Veff_rdmft\n\n!!!!!!\n"; } -#ifndef __NEW_GINT +#ifdef __OLD_GINT // get HR for 2D-block parallel format this->GG->transfer_pvpR(this->hR,this->ucell); #endif diff --git a/source/module_rdmft/rdmft_tools.h b/source/module_rdmft/rdmft_tools.h index 5a631c34ba..faeca1470e 100644 --- a/source/module_rdmft/rdmft_tools.h +++ b/source/module_rdmft/rdmft_tools.h @@ -284,7 +284,7 @@ class Veff_rdmft : public hamilt::OperatorLCAO this->cal_type = hamilt::calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); -#ifndef __NEW_GINT +#ifdef __OLD_GINT GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin); #endif } @@ -311,7 +311,7 @@ class Veff_rdmft : public hamilt::OperatorLCAO 
this->cal_type = hamilt::calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); -#ifndef __NEW_GINT +#ifdef __OLD_GINT GG_in->initialize_pvpR(*ucell_in, GridD_in, nspin); #endif } diff --git a/source/module_rdmft/update_state_rdmft.cpp b/source/module_rdmft/update_state_rdmft.cpp index 5bca6cdd67..eadce11f5e 100644 --- a/source/module_rdmft/update_state_rdmft.cpp +++ b/source/module_rdmft/update_state_rdmft.cpp @@ -106,7 +106,7 @@ void RDMFT::update_charge(UnitCell& ucell) { ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx); } -#ifndef __NEW_GINT +#ifdef __OLD_GINT GG->transfer_DM2DtoGrid(DM_gamma_only.get_DMR_vector()); Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin); GG->cal_gint(&inout); @@ -140,7 +140,7 @@ void RDMFT::update_charge(UnitCell& ucell) ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx); } -#ifndef __NEW_GINT +#ifdef __OLD_GINT GK->transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin); GK->cal_gint(&inout); diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp index 15022436ea..15fb6777d3 100644 --- a/source/source_esolver/esolver_ks_lcao.cpp +++ b/source/source_esolver/esolver_ks_lcao.cpp @@ -94,7 +94,7 @@ ESolver_KS_LCAO::ESolver_KS_LCAO() template ESolver_KS_LCAO::~ESolver_KS_LCAO() { -#ifdef __NEW_GINT +#ifndef __OLD_GINT // release gint_info ModuleGint::Gint::set_gint_info(nullptr); #endif diff --git a/source/source_esolver/lcao_before_scf.cpp b/source/source_esolver/lcao_before_scf.cpp index 69141a4690..04f0322b98 100644 --- a/source/source_esolver/lcao_before_scf.cpp +++ b/source/source_esolver/lcao_before_scf.cpp @@ -60,7 +60,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) PARAM.inp.test_atom_input); //! 
4) initialize NAO basis set -#ifndef __NEW_GINT +#ifdef __OLD_GINT double dr_uniform = 0.001; std::vector rcuts; std::vector> psi_u; diff --git a/source/source_esolver/lcao_others.cpp b/source/source_esolver/lcao_others.cpp index f4adc3617a..9c8cdb7362 100644 --- a/source/source_esolver/lcao_others.cpp +++ b/source/source_esolver/lcao_others.cpp @@ -93,7 +93,7 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) PARAM.inp.test_atom_input); // (3) Periodic condition search for each grid. -#ifndef __NEW_GINT +#ifdef __OLD_GINT double dr_uniform = 0.001; std::vector rcuts; std::vector> psi_u; From abed0818295f18e0ed405d4d858e6ca68a5a4875 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 17 Jun 2025 15:13:21 +0800 Subject: [PATCH 48/63] add more specific debug info --- .../module_gint/kernels/cuda/cuda_tools.cuh | 4 ++-- .../module_gint/temp_gint/kernel/gint_helper.cuh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh index 929191d12a..dab697df8c 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh @@ -15,8 +15,8 @@ inline void check(cudaError_t result, char const *const func, const char *const file, int const line) { if (result) { - fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, - static_cast(result), func); + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), cudaGetErrorString(result), func); exit(EXIT_FAILURE); } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh index 20fbebef03..7a6e925531 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh @@ -46,8 +46,8 @@ inline int ceil_div(const int a, const int b) inline void check(cudaError_t result, char const *const func, const char *const file, int const line) { if (result) { - fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, - static_cast(result), func); + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), cudaGetErrorString(result), func); exit(EXIT_FAILURE); } } From 2c172e5f83d8d28cdb8a223397acacb8aedf168a Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sun, 22 Jun 2025 21:28:28 +0800 Subject: [PATCH 49/63] fix bug of gint_env_k.cpp --- .../module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp index 51a8bf0496..5df52f9453 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp @@ -192,7 +192,7 @@ void PhiOperator::cal_env_k( iw_lo = trace_lo[iw_start] / npol + lgd / npol * is; for (int iw = 0; iw < atom->get_nw(); ++iw, ++iw_lo) { - tmp += std::complex(phi[iw], 0.0) * wfc[iw_lo] * kphase; + tmp += std::complex(phi[phi_start_idx + iw], 0.0) * wfc[iw_lo] * kphase; } } } @@ -201,7 +201,7 @@ void PhiOperator::cal_env_k( iw_lo = trace_lo[iw_start]; for (int iw = 0; iw < atom->get_nw(); ++iw, ++iw_lo) { - tmp += std::complex(phi[iw], 0.0) * wfc[iw_lo] * kphase; + tmp += std::complex(phi[phi_start_idx + iw], 0.0) * wfc[iw_lo] * kphase; } } rho[meshgrids_local_idx_[j]] += tmp.real(); From 435046f5b96beb069f3a1af09363fffa6694c99d Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sun, 22 Jun 2025 22:57:11 +0800 Subject: [PATCH 50/63] fix a bug about mixing_dmr --- source/source_esolver/esolver_ks_lcao.cpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp index 15fb6777d3..246b04a03b 100644 --- a/source/source_esolver/esolver_ks_lcao.cpp +++ b/source/source_esolver/esolver_ks_lcao.cpp @@ -828,7 +828,7 @@ void ESolver_KS_LCAO::iter_finish(UnitCell& ucell, const int istep, int& // 5) mix density matrix if mixing_restart + mixing_dmr + not first // mixing_restart at every iter - if (PARAM.inp.mixing_restart > 0 && this->p_chgmix->mixing_restart_count > 0 && PARAM.inp.mixing_dmr) + if (!conv_esolver && PARAM.inp.mixing_restart > 0 && this->p_chgmix->mixing_restart_count > 0 && PARAM.inp.mixing_dmr) { elecstate::DensityMatrix* dm = dynamic_cast*>(this->pelec)->get_DM(); this->p_chgmix->mix_dmr(dm); From 661ede205ac8a1a61a8a13d52ca9de8d6cb51609 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 23 Jun 2025 02:08:19 +0800 Subject: [PATCH 51/63] small modification --- source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp index 0f46930048..12e60761e7 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp @@ -72,7 +72,7 @@ HContainer GintInfo::get_hr(int npol) const } hr.insert_ijrs(&ijr_info_, *ucell_, npol); hr.allocate(nullptr, true); - return std::move(hr); + return hr; } void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital* Phi) From 6992cb00883697a895d7a39cc3aa9db956957be7 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 23 Jun 2025 16:41:41 +0800 Subject: [PATCH 52/63] fix bug of mixing_dmr --- source/source_esolver/esolver_ks_lcao.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/source/source_esolver/esolver_ks_lcao.cpp 
b/source/source_esolver/esolver_ks_lcao.cpp index 246b04a03b..a4f009c216 100644 --- a/source/source_esolver/esolver_ks_lcao.cpp +++ b/source/source_esolver/esolver_ks_lcao.cpp @@ -827,13 +827,16 @@ void ESolver_KS_LCAO::iter_finish(UnitCell& ucell, const int istep, int& ESolver_KS::iter_finish(ucell, istep, iter, conv_esolver); // 5) mix density matrix if mixing_restart + mixing_dmr + not first + // don't mix density matrix after the last step of iteration // mixing_restart at every iter - if (!conv_esolver && PARAM.inp.mixing_restart > 0 && this->p_chgmix->mixing_restart_count > 0 && PARAM.inp.mixing_dmr) + if(iter != PARAM.inp.scf_nmax && !conv_esolver) { - elecstate::DensityMatrix* dm = dynamic_cast*>(this->pelec)->get_DM(); - this->p_chgmix->mix_dmr(dm); + if (PARAM.inp.mixing_restart > 0 && this->p_chgmix->mixing_restart_count > 0 && PARAM.inp.mixing_dmr) + { + elecstate::DensityMatrix* dm = dynamic_cast*>(this->pelec)->get_DM(); + this->p_chgmix->mix_dmr(dm); + } } - // 6) save charge density // Peize Lin add 2020.04.04 if (GlobalC::restart.info_save.save_charge) From 9f0cd28e18e73c5d49342df54febf733781e910d Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 23 Jun 2025 16:48:55 +0800 Subject: [PATCH 53/63] modify comment --- source/source_esolver/esolver_ks_lcao.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp index a4f009c216..b5974fa11c 100644 --- a/source/source_esolver/esolver_ks_lcao.cpp +++ b/source/source_esolver/esolver_ks_lcao.cpp @@ -827,8 +827,7 @@ void ESolver_KS_LCAO::iter_finish(UnitCell& ucell, const int istep, int& ESolver_KS::iter_finish(ucell, istep, iter, conv_esolver); // 5) mix density matrix if mixing_restart + mixing_dmr + not first - // don't mix density matrix after the last step of iteration - // mixing_restart at every iter + // mixing_restart at every iter except the last iter if(iter != PARAM.inp.scf_nmax && !conv_esolver) 
{ if (PARAM.inp.mixing_restart > 0 && this->p_chgmix->mixing_restart_count > 0 && PARAM.inp.mixing_dmr) From f186829185d6cf0119f0d5d58eb38189d79297ec Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 23 Jun 2025 23:03:57 +0800 Subject: [PATCH 54/63] replace pointer with std::vector in Numerical_Orbital --- source/source_basis/module_ao/ORB_atomic.cpp | 7 +------ source/source_basis/module_ao/ORB_atomic.h | 7 +++---- source/source_basis/module_ao/ORB_read.cpp | 3 +-- source/source_basis/module_nao/radial_set.cpp | 5 ++--- 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/source/source_basis/module_ao/ORB_atomic.cpp b/source/source_basis/module_ao/ORB_atomic.cpp index 9b40d923ef..99f5953bda 100644 --- a/source/source_basis/module_ao/ORB_atomic.cpp +++ b/source/source_basis/module_ao/ORB_atomic.cpp @@ -7,8 +7,6 @@ Numerical_Orbital::Numerical_Orbital() { // make std::pair of new and delete // question remains - this->nchi = nullptr; - this->phiLN = new Numerical_Orbital_Lm[1]; this->rcut = 0.0; this->max_nchi = 0; this->type = 0; @@ -16,8 +14,6 @@ Numerical_Orbital::Numerical_Orbital() Numerical_Orbital::~Numerical_Orbital() { - delete[] nchi; - delete[] phiLN; } void Numerical_Orbital::set_orbital_info(const int& type_in, @@ -34,8 +30,7 @@ void Numerical_Orbital::set_orbital_info(const int& type_in, this->lmax = lmax_in; // (2) set nchi and total nchi. 
- delete[] this->nchi; - this->nchi = new int[this->lmax + 1]; + this->nchi.resize(this->lmax + 1); for (int i = 0; i < this->lmax + 1; i++) { this->nchi[i] = nchi_in[i]; diff --git a/source/source_basis/module_ao/ORB_atomic.h b/source/source_basis/module_ao/ORB_atomic.h index 71212f8b28..e71c0958d3 100644 --- a/source/source_basis/module_ao/ORB_atomic.h +++ b/source/source_basis/module_ao/ORB_atomic.h @@ -66,7 +66,6 @@ class Numerical_Orbital const inline Numerical_Orbital_Lm& PhiLN( const int &L, const int &N)const { - assert(this->phiLN != nullptr); return this->phiLN[ this->find_chi(L, N) ]; } @@ -98,7 +97,7 @@ class Numerical_Orbital NOAR.set_position(R1_in, R2_in); } - Numerical_Orbital_Lm*& chi() { return this->phiLN; } + std::vector& chi() { return this->phiLN; } private: @@ -115,13 +114,13 @@ class Numerical_Orbital int type; int lmax; - int* nchi; + std::vector nchi; int total_nchi; int max_nchi; ModuleBase::IntArray find_chi; double rcut; - Numerical_Orbital_Lm* phiLN;// length: total_nchi (only store radial function ) + std::vector phiLN;// length: total_nchi (only store radial function ) //========================================================== // Keep the old interface diff --git a/source/source_basis/module_ao/ORB_read.cpp b/source/source_basis/module_ao/ORB_read.cpp index 36d5d55f35..8cb0e4075f 100644 --- a/source/source_basis/module_ao/ORB_read.cpp +++ b/source/source_basis/module_ao/ORB_read.cpp @@ -419,8 +419,7 @@ void LCAO_Orbitals::read_orb_file(std::ofstream& ofs_in, // GlobalV::ofs_running } // OUT(GlobalV::ofs_running,"Total number of chi(l,n)",total_nchi); - delete[] ao[it].phiLN; - ao[it].phiLN = new Numerical_Orbital_Lm[total_nchi]; + ao[it].phiLN.resize(total_nchi); int meshr = 0; // number of mesh points int meshr_read = 0; diff --git a/source/source_basis/module_nao/radial_set.cpp b/source/source_basis/module_nao/radial_set.cpp index 9c83590926..2570e99806 100644 --- a/source/source_basis/module_nao/radial_set.cpp +++ 
b/source/source_basis/module_nao/radial_set.cpp @@ -87,9 +87,8 @@ RadialSet& RadialSet::operator=(const RadialSet& rhs) void RadialSet::to_numerical_orbital(Numerical_Orbital& no, const int nk_legacy, const double lcao_dk) const { - delete[] no.chi(); - - no.chi() = new Numerical_Orbital_Lm[nchi_]; + no.chi().clear(); + no.chi().resize(nchi_); for (int i = 0; i < nchi_; i++) { chi_[i].to_numerical_orbital_lm(no.chi()[i], nk_legacy, lcao_dk); From 52dff7415399491effe42d4ad3d53e3581fdd211 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 24 Jun 2025 12:38:32 +0800 Subject: [PATCH 55/63] change copy assignment function in intarray --- source/source_base/intarray.cpp | 7 --- source/source_base/intarray.h | 60 ++++++++++++----------- source/source_base/test/intarray_test.cpp | 10 ---- 3 files changed, 31 insertions(+), 46 deletions(-) diff --git a/source/source_base/intarray.cpp b/source/source_base/intarray.cpp index a2e3dcce4d..10c3b7f39d 100644 --- a/source/source_base/intarray.cpp +++ b/source/source_base/intarray.cpp @@ -6,8 +6,6 @@ namespace ModuleBase { -int IntArray::arrayCount = 0; - void IntArrayAlloc() { std::cout << "\n Allocation error for IntArray " << std::endl; @@ -23,7 +21,6 @@ IntArray::IntArray(const int d1,const int d2) size = bound1 * bound2; ptr = new int[size];zero_out(); assert( ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3) @@ -37,7 +34,6 @@ IntArray::IntArray(const int d1,const int d2,const int d3) size = bound1 * bound2 * bound3 ; //* sizeof(float); ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3,const int d4) @@ -52,7 +48,6 @@ IntArray::IntArray(const int d1,const int d2,const int d3,const int d4) size = bound1 * bound2 * bound3 * bound4 ; //* sizeof(float); ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3, @@ -68,7 +63,6 @@ 
IntArray::IntArray(const int d1,const int d2,const int d3, size = bound1 * bound2 * bound3 * bound4 * bound5; ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3, @@ -85,7 +79,6 @@ IntArray::IntArray(const int d1,const int d2,const int d3, size = bound1 * bound2 * bound3 * bound4 * bound5 * bound6; ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } //******************************** diff --git a/source/source_base/intarray.h b/source/source_base/intarray.h index 96996b5b22..9147dc184e 100644 --- a/source/source_base/intarray.h +++ b/source/source_base/intarray.h @@ -48,17 +48,30 @@ class IntArray void create(const int d1, const int d2, const int d3, const int d4, const int d5, const int d6); /** - * @brief Equal an IntArray to another one + * @brief copy assignment * * @param right * @return const IntArray& */ - const IntArray &operator=(const IntArray &right) - { - assert( this->size == right.size ); - for (int i = 0;i < size;i++) ptr[i] = right.ptr[i]; - return *this;// enables x = y = z; - }; + IntArray &operator=(const IntArray &other) + { + if(this != &other) + { + delete[] ptr; + size = other.size; + dim = other.dim; + bound1 = other.bound1; + bound2 = other.bound2; + bound3 = other.bound3; + bound4 = other.bound4; + bound5 = other.bound5; + bound6 = other.bound6; + ptr = new int[size]; + for (int i = 0;i < size;i++) + { ptr[i] = other.ptr[i]; } + } + return *this; + } /** * @brief Equal all elements of an IntArray to an @@ -71,7 +84,7 @@ class IntArray { for (int i = 0;i < size;i++) ptr[i] = right; return *this;// enables x = y = z; - }; + } /** * @brief Access elements by using operator "()" @@ -85,14 +98,14 @@ class IntArray assert( d1 < bound1 ); assert( d2 < bound2 ); return ptr[ d1 * bound2 + d2 ]; - }; + } int &operator()(const int d1, const int d2, const int d3) { assert( d1 < bound1 ); assert( d2 < bound2 ); assert( d3 < bound3 ); return ptr[ (d1 * 
bound2 + d2) * bound3 + d3 ]; - }; + } int &operator()(const int d1, const int d2, const int d3, const int d4) { assert( d1 < bound1 ); @@ -100,7 +113,7 @@ class IntArray assert( d3 < bound3 ); assert( d4 < bound4 ); return ptr[ ((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4 ]; - }; + } int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5) { assert( d1 < bound1 ); @@ -109,7 +122,7 @@ class IntArray assert( d4 < bound4 ); assert( d5 < bound5 ); return ptr[ (((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5 ]; - }; + } int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5, const int d6) { assert( d1 < bound1 ); @@ -119,7 +132,7 @@ class IntArray assert( d5 < bound5 ); assert( d6 < bound6 ); return ptr[ ((((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5) * bound6 + d6 ]; - }; + } /** * @brief Access elements by using "()" through pointer @@ -134,14 +147,14 @@ class IntArray assert( d1 < bound1 ); assert( d2 < bound2 ); return ptr[ d1 * bound2 + d2 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3) const { assert( d1 < bound1 ); assert( d2 < bound2 ); assert( d3 < bound3 ); return ptr[ (d1 * bound2 + d2) * bound3 + d3 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3, const int d4) const { assert( d1 < bound1 ); @@ -149,7 +162,7 @@ class IntArray assert( d3 < bound3 ); assert( d4 < bound4 ); return ptr[ ((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5) const { assert( d1 < bound1 ); @@ -158,7 +171,7 @@ class IntArray assert( d4 < bound4 ); assert( d5 < bound5 ); return ptr[ (((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5, const int d6) const { assert( d1 < bound1 ); @@ -168,7 +181,7 @@ class IntArray assert( 
d5 < bound5 ); assert( d6 < bound6 ); return ptr[ ((((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5) * bound6 + d6 ]; - }; + } /** * @brief Set all elements of an IntArray to zero @@ -209,16 +222,6 @@ class IntArray return bound6; } - /** - * @brief Get the Array Count object - * - * @return int - */ - static int getArrayCount(void) - { - return arrayCount; - } - private: int size=0; int dim=0; @@ -228,7 +231,6 @@ class IntArray int bound4=0; int bound5=0; int bound6=0; - static int arrayCount; void freemem(); }; } // namespace ModuleBase diff --git a/source/source_base/test/intarray_test.cpp b/source/source_base/test/intarray_test.cpp index 6ccfb24452..7372b4e115 100644 --- a/source/source_base/test/intarray_test.cpp +++ b/source/source_base/test/intarray_test.cpp @@ -12,8 +12,6 @@ * - construct an int array (2 to 6 dimensions) * - Creat * - create an int array (2 to 6 dimensions) - * - GetArrayCount - * - get the total number of int array created * - GetSize * - get the total size of an int array * - GetDim @@ -51,14 +49,6 @@ class IntArrayTest : public testing::Test const int zero = 0; }; -TEST_F(IntArrayTest,GetArrayCount) -{ - count0 = ModuleBase::IntArray::getArrayCount(); - ModuleBase::IntArray c3, c4; - count1 = ModuleBase::IntArray::getArrayCount(); - EXPECT_EQ((count1-count0),2); -} - TEST_F(IntArrayTest,Construct) { ModuleBase::IntArray x2(1,5); From f7238583ca63cecf679786b45cc58ed58e1ef662 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 24 Jun 2025 14:00:17 +0800 Subject: [PATCH 56/63] fix error in module_lr --- .../module_gint/temp_gint/gint.cpp | 2 +- .../module_gint/temp_gint/gint.h | 4 ++-- .../module_gint/temp_gint/gint_common.cpp | 13 +++---------- .../module_gint/temp_gint/gint_info.cpp | 6 ++++-- .../module_gint/temp_gint/gint_info.h | 5 ++++- .../module_gint/temp_gint/gint_interface.cpp | 7 ++++--- .../module_gint/temp_gint/gint_interface.h | 3 ++- .../module_gint/temp_gint/gint_rho.cpp | 2 +- 
.../module_gint/temp_gint/gint_rho.h | 9 +++++++-- .../module_gint/temp_gint/gint_rho_gpu.cpp | 3 +-- .../module_gint/temp_gint/gint_rho_gpu.h | 9 +++++++-- source/module_lr/esolver_lrtd_lcao.cpp | 15 +++++++++------ source/module_lr/esolver_lrtd_lcao.h | 8 ++++---- source/module_lr/lr_spectrum.cpp | 10 +++++----- .../module_lr/operator_casida/operator_lr_hxc.cpp | 6 ++---- source/source_esolver/esolver_ks_lcao.cpp | 4 ---- source/source_esolver/esolver_ks_lcao.h | 5 +++++ source/source_esolver/lcao_before_scf.cpp | 7 ++++--- source/source_esolver/lcao_others.cpp | 7 ++++--- 19 files changed, 69 insertions(+), 56 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp index e766c46d9f..d7de110f24 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp @@ -3,6 +3,6 @@ namespace ModuleGint { -std::shared_ptr Gint::gint_info_ = nullptr; +GintInfo* Gint::gint_info_ = nullptr; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint.h index 64e941c380..1255bae971 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint.h @@ -14,13 +14,13 @@ class Gint // note that gint_info_ is a static member variable // it is shared by all instances of Gint - static void set_gint_info(std::shared_ptr gint_info) + static void set_gint_info(GintInfo* gint_info) { gint_info_ = gint_info; } protected: - static std::shared_ptr gint_info_; + static GintInfo* gint_info_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp index 74c726c2db..a34774a142 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp @@ -146,19 +146,12 @@ void transfer_dm_2d_to_gint( { ModuleBase::TITLE("Gint", "transfer_dm_2d_to_gint"); ModuleBase::timer::tick("Gint", "transfer_dm_2d_to_gint"); - assert(PARAM.inp.nspin == dm_gint.size() - && "The size of dm_gint should be equal to the number of spins!"); - if(PARAM.inp.nspin != 4) - { - assert(dm.size() == PARAM.inp.nspin); - } else - { - assert(dm.size() == 1); - } if (PARAM.inp.nspin != 4) { - for (int is = 0; is < PARAM.inp.nspin; is++) + // dm_gint.size() usually equals to PARAM.inp.nspin, + // but there is exception within module_lr + for (int is = 0; is < dm_gint.size(); is++) { #ifdef __MPI hamilt::transferParallels2Serials(*dm[is], &dm_gint[is]); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp index 12e60761e7..b0738e28e4 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp @@ -41,7 +41,7 @@ GintInfo::GintInfo( biggrids_.push_back(std::make_shared(i)); } - // initialize the atoms + // initialize the atoms and the numerical orbital init_atoms_(ucell_->ntype, ucell_->atoms, Phi); // initialize trace_lo_ and lgd_ @@ -81,12 +81,14 @@ void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital int iat = 0; is_atom_in_proc_.resize(ucell_->nat, false); atoms_.resize(ucell_->nat); + orbs_.resize(ntype); // TODO: USE OPENMP TO PARALLELIZE THIS LOOP for(int i = 0; i < ntype; i++) { const auto& atom = atoms[i]; - const auto *orb = &Phi[i]; + orbs_[i] = Phi[i]; + const auto *orb = &orbs_[i]; // rcut extends to the maximum big grids in x, y, z directions Vec3i ext_bgrid = biggrid_info_->max_ext_bgrid_num(atom.Rcut); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h index 08bc0088f5..88f9b7c6bc 100644 
--- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h @@ -92,8 +92,11 @@ class GintInfo // map the global index of atomic orbitals to local index std::vector trace_lo_; + // store the information about Numerical orbitals + std::vector orbs_; + // total num of atomic orbitals on this proc - int lgd_; + int lgd_ = 0; #ifdef __CUDA public: diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index ff2a711ed0..5941c5cf4c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -98,17 +98,18 @@ void cal_gint_vl_metagga( void cal_gint_rho( const std::vector*>& dm_vec, const int nspin, - double **rho) + double **rho, + bool is_dm_symm) { #ifdef __CUDA if(PARAM.inp.device == "gpu") { - Gint_rho_gpu gint_rho(dm_vec, nspin, rho); + Gint_rho_gpu gint_rho(dm_vec, nspin, rho, is_dm_symm); gint_rho.cal_gint(); } else #endif { - Gint_rho gint_rho(dm_vec, nspin, rho); + Gint_rho gint_rho(dm_vec, nspin, rho, is_dm_symm); gint_rho.cal_gint(); } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h index 0d064be2f2..f674e24011 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h @@ -28,7 +28,8 @@ void cal_gint_vl_metagga( void cal_gint_rho( const std::vector*>& dm_vec, const int nspin, - double **rho); + double **rho, + bool is_dm_symm = true); void cal_gint_tau( const std::vector*>& dm_vec, diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp index aed99af47c..c96b10a731 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp @@ -46,7 +46,7 @@ void Gint_rho::cal_rho_() phi_op.set_phi(phi.data()); for (int is = 0; is < nspin_; is++) { - phi_op.phi_mul_dm(phi.data(), dm_gint_vec_[is], true, phi_dm.data()); + phi_op.phi_mul_dm(phi.data(), dm_gint_vec_[is], is_dm_symm_, phi_dm.data()); phi_op.phi_dot_phi(phi.data(), phi_dm.data(), rho_[is]); } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h index 7e4816b729..e0a15edbdc 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h @@ -15,8 +15,9 @@ class Gint_rho : public Gint Gint_rho( const std::vector*>& dm_vec, const int nspin, - double **rho) - : dm_vec_(dm_vec), nspin_(nspin), rho_(rho) {} + double **rho, + bool is_dm_symm = true) + : dm_vec_(dm_vec), nspin_(nspin), rho_(rho), is_dm_symm_(is_dm_symm) {} void cal_gint(); @@ -28,6 +29,10 @@ class Gint_rho : public Gint // input const std::vector*> dm_vec_; const int nspin_; + + // if true, it means the DMR matrix is symmetric, + // which leads to faster computations compared to the asymmetric case. 
+ const bool is_dm_symm_; // output double **rho_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp index 24490b8736..ca24002579 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp @@ -72,9 +72,8 @@ void Gint_rho_gpu::cal_rho_() phi_op.set_phi(phi.get_device_ptr()); for(int is = 0; is < nspin_; is++) { - const bool is_symm = true; phi_op.phi_mul_dm(phi.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], - is_symm, phi_dm.get_device_ptr()); + is_dm_symm_, phi_dm.get_device_ptr()); phi_op.phi_dot_phi(phi.get_device_ptr(), phi_dm.get_device_ptr(), rho_d_vec_[is].get_device_ptr()); } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h index f5a172ea33..13db0f5a85 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h @@ -16,8 +16,9 @@ class Gint_rho_gpu: public Gint Gint_rho_gpu( const std::vector*>& dm_vec, const int nspin, - double **rho) - : dm_vec_(dm_vec), nspin_(nspin), rho_(rho) {} + double **rho, + bool is_dm_symm = true) + : dm_vec_(dm_vec), nspin_(nspin), rho_(rho), is_dm_symm_(is_dm_symm) {} void cal_gint(); @@ -34,6 +35,10 @@ class Gint_rho_gpu: public Gint const std::vector*> dm_vec_; const int nspin_; + // if true, it means the DMR matrix is symmetric, + // which leads to faster computations compared to the asymmetric case. 
+ const bool is_dm_symm_; + // output double **rho_; diff --git a/source/module_lr/esolver_lrtd_lcao.cpp b/source/module_lr/esolver_lrtd_lcao.cpp index 33d3ef46ad..d51fbea6a7 100644 --- a/source/module_lr/esolver_lrtd_lcao.cpp +++ b/source/module_lr/esolver_lrtd_lcao.cpp @@ -241,7 +241,7 @@ LR::ESolver_LR::ESolver_LR(ModuleESolver::ESolver_KS_LCAO&& ks_sol this->nupdown = cal_nupdown_form_occ(ks_sol.pelec->wg); reset_dim_spin2(); } - +#ifdef __OLD_GINT //grid integration this->gt_ = std::move(ks_sol.GridT); @@ -255,7 +255,9 @@ LR::ESolver_LR::ESolver_LR(ModuleESolver::ESolver_KS_LCAO&& ks_sol } this->set_gint(); this->gint_->reset_DMRGint(1); - +#else + this->gint_info_ = std::move(ks_sol.gint_info_); +#endif // move pw basis if (this->pw_rho_flag) { @@ -393,11 +395,11 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu this->ucell, search_radius, PARAM.inp.test_atom_input); +#ifdef __OLD_GINT this->set_gint(); this->gint_->gridt = &this->gt_; // (3) Periodic condition search for each grid. 
-#ifdef __OLD_GINT double dr_uniform = 0.001; std::vector rcuts; std::vector> psi_u; @@ -453,7 +455,8 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu &orb); this->gint_->initialize_pvpR(ucell, &this->gd, 1); // always use nspin=1 for transition density #else - auto gint_info = std::make_shared( + gint_info_.reset( + new ModuleGint::GintInfo( this->pw_big->nbx, this->pw_big->nby, this->pw_big->nbz, @@ -468,8 +471,8 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu this->pw_big->nbzp, orb.Phi, ucell, - this->gd); - ModuleGint::Gint::set_gint_info(gint_info); + this->gd)); + ModuleGint::Gint::set_gint_info(gint_info_.get()); #endif // if EXX from scratch, init 2-center integral and calculate Cs, Vs #ifdef __EXX diff --git a/source/module_lr/esolver_lrtd_lcao.h b/source/module_lr/esolver_lrtd_lcao.h index 0c4b6dea42..7f053d2d04 100644 --- a/source/module_lr/esolver_lrtd_lcao.h +++ b/source/module_lr/esolver_lrtd_lcao.h @@ -17,7 +17,7 @@ #include "module_elecstate/module_dm/density_matrix.h" #include "module_lr/potentials/pot_hxc_lrtd.h" #include "module_lr/hamilt_casida.h" -#include "module_hamilt_lcao/module_gint/temp_gint/gint.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_info.h" #ifdef __EXX // #include #include "module_ri/Exx_LRI.h" @@ -35,9 +35,6 @@ namespace LR ESolver_LR(const Input_para& inp, UnitCell& ucell); ~ESolver_LR() { delete this->psi_ks; -#ifndef __OLD_GINT - ModuleGint::Gint::set_gint_info(nullptr); -#endif } ///input: input, call, basis(LCAO), psi(ground state), elecstate @@ -97,6 +94,9 @@ namespace LR Gint_Gamma gint_g_; Gint_k gint_k_; typename TGint::type* gint_ = nullptr; + #ifndef __OLD_GINT + std::unique_ptr gint_info_ = nullptr; + #endif void set_gint(); /// @brief variables for parallel distribution of KS orbitals diff --git a/source/module_lr/lr_spectrum.cpp b/source/module_lr/lr_spectrum.cpp index 100f5bdece..28c23270e7 100644 --- a/source/module_lr/lr_spectrum.cpp +++ 
b/source/module_lr/lr_spectrum.cpp @@ -70,7 +70,7 @@ ModuleBase::Vector3 LR::LR_Spectrum::cal_transition_dipole_istat this->cal_gint_rho(rho_trans, this->rho_basis.nrxx); #else ModuleBase::GlobalFunc::ZEROS(rho_trans[0], this->rho_basis.nrxx); - ModuleGint::cal_gint_rho({ DM_trans.get_DMR_vector().at(is) }, 1, rho_trans); + ModuleGint::cal_gint_rho({ DM_trans.get_DMR_vector().at(is) }, 1, rho_trans, false); #endif // 3. transition dipole moment @@ -86,7 +86,7 @@ ModuleBase::Vector3 LR::LR_Spectrum::cal_transition_dipole_istat } LR_Util::_deallocate_2order_nested_ptr(rho_trans, 1); } - trans_dipole *= (ucell.omega / static_cast(gint->get_ncxyz())); // dv + trans_dipole *= (ucell.omega / static_cast(rho_basis.nxyz)); // dv trans_dipole *= static_cast(this->nk); // nk is divided inside DM_trans, now recover it if (this->nspin_x == 1) { trans_dipole *= sqrt(2.0); } // *2 for 2 spins, /sqrt(2) for the halfed dimension of X in the normalizaiton Parallel_Reduce::reduce_all(trans_dipole.x); @@ -120,7 +120,7 @@ ModuleBase::Vector3> LR::LR_Spectrum>: this->cal_gint_rho(rho_trans_real, this->rho_basis.nrxx); #else ModuleBase::GlobalFunc::ZEROS(rho_trans_real[0], this->rho_basis.nrxx); - ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_real); + ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_real, false); #endif // LR_Util::print_grid_nonzero(rho_trans_real[0], this->rho_basis.nrxx, 10, "rho_trans"); @@ -131,7 +131,7 @@ ModuleBase::Vector3> LR::LR_Spectrum>: this->cal_gint_rho(rho_trans_imag, this->rho_basis.nrxx); #else ModuleBase::GlobalFunc::ZEROS(rho_trans_imag[0], this->rho_basis.nrxx); - ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_imag); + ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_imag, false); #endif // LR_Util::print_grid_nonzero(rho_trans_imag[0], this->rho_basis.nrxx, 10, "rho_trans"); @@ -150,7 +150,7 @@ ModuleBase::Vector3> LR::LR_Spectrum>: 
LR_Util::_deallocate_2order_nested_ptr(rho_trans_real, 1); LR_Util::_deallocate_2order_nested_ptr(rho_trans_imag, 1); } - trans_dipole *= (ucell.omega / static_cast(gint->get_ncxyz())); // dv + trans_dipole *= (ucell.omega / static_cast(rho_basis.nxyz)); // dv trans_dipole *= static_cast(this->nk); // nk is divided inside DM_trans, now recover it if (this->nspin_x == 1) { trans_dipole *= sqrt(2.0); } // *2 for 2 spins, /sqrt(2) for the halfed dimension of X in the normalizaiton Parallel_Reduce::reduce_all(trans_dipole.x); diff --git a/source/module_lr/operator_casida/operator_lr_hxc.cpp b/source/module_lr/operator_casida/operator_lr_hxc.cpp index d5b75a48d8..28e3bbac7a 100644 --- a/source/module_lr/operator_casida/operator_lr_hxc.cpp +++ b/source/module_lr/operator_casida/operator_lr_hxc.cpp @@ -23,7 +23,6 @@ namespace LR ModuleBase::TITLE("OperatorLRHxc", "act"); const int& sl = ispin_ks[0]; const auto psil_ks = LR_Util::get_psi_spin(psi_ks, sl, nk); - const int& lgd = gint->gridt->lgd; this->DM_trans->cal_DMR(); //DM_trans->get_DMR_vector() is 2d-block parallized // LR_Util::print_DMR(*DM_trans, ucell.nat, "DMR"); @@ -68,9 +67,8 @@ namespace LR Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); this->gint->cal_gint(&inout_rho); #else - ModuleGint::cal_gint_rho(this->DM_trans->get_DMR_vector(), 1, rho_trans); + ModuleGint::cal_gint_rho(this->DM_trans->get_DMR_vector(), 1, rho_trans, false); #endif - // 3. 
v_hxc = f_hxc * rho_trans ModuleBase::matrix vr_hxc(1, nrxx); //grid this->pot.lock()->cal_v_eff(rho_trans, ucell, vr_hxc, ispin_ks); @@ -118,7 +116,7 @@ namespace LR Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); this->gint->cal_gint(&inout_rho); #else - ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans); + ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans, false); #endif // print_grid_nonzero(rho_trans[0], nrxx, 10, "rho_trans"); diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp index b5974fa11c..b2596fb2c8 100644 --- a/source/source_esolver/esolver_ks_lcao.cpp +++ b/source/source_esolver/esolver_ks_lcao.cpp @@ -94,10 +94,6 @@ ESolver_KS_LCAO::ESolver_KS_LCAO() template ESolver_KS_LCAO::~ESolver_KS_LCAO() { -#ifndef __OLD_GINT - // release gint_info - ModuleGint::Gint::set_gint_info(nullptr); -#endif } template diff --git a/source/source_esolver/esolver_ks_lcao.h b/source/source_esolver/esolver_ks_lcao.h index 43a180104f..59e427f013 100644 --- a/source/source_esolver/esolver_ks_lcao.h +++ b/source/source_esolver/esolver_ks_lcao.h @@ -95,6 +95,11 @@ class ESolver_KS_LCAO : public ESolver_KS //! Grid integration: used to store some basic information Grid_Technique GridT; +#ifndef __OLD_GINT + //! GintInfo: used to store some basic infomation about module_gint + std::unique_ptr gint_info_; +#endif + //! NAO orbitals: two-center integrations TwoCenterBundle two_center_bundle_; diff --git a/source/source_esolver/lcao_before_scf.cpp b/source/source_esolver/lcao_before_scf.cpp index 04f0322b98..278ae85444 100644 --- a/source/source_esolver/lcao_before_scf.cpp +++ b/source/source_esolver/lcao_before_scf.cpp @@ -104,7 +104,8 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) //! 
6) prepare grid integral #else - auto gint_info = std::make_shared( + gint_info_.reset( + new ModuleGint::GintInfo( this->pw_big->nbx, this->pw_big->nby, this->pw_big->nbz, @@ -119,8 +120,8 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) this->pw_big->nbzp, orb_.Phi, ucell, - this->gd); - ModuleGint::Gint::set_gint_info(gint_info); + this->gd)); + ModuleGint::Gint::set_gint_info(gint_info_.get()); #endif // 7) For each atom, calculate the adjacent atoms in different cells diff --git a/source/source_esolver/lcao_others.cpp b/source/source_esolver/lcao_others.cpp index 9c8cdb7362..2bcd563473 100644 --- a/source/source_esolver/lcao_others.cpp +++ b/source/source_esolver/lcao_others.cpp @@ -134,7 +134,8 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) // prepare grid in Gint LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); #else - auto gint_info = std::make_shared( + gint_info_.reset( + new ModuleGint::GintInfo( this->pw_big->nbx, this->pw_big->nby, this->pw_big->nbz, @@ -149,8 +150,8 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) this->pw_big->nbzp, orb_.Phi, ucell, - this->gd); - ModuleGint::Gint::set_gint_info(gint_info); + this->gd)); + ModuleGint::Gint::set_gint_info(gint_info_.get()); #endif // (2)For each atom, calculate the adjacent atoms in different cells From 28daad70f039de95b4f47269c2fcfbcc62be617b Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 24 Jun 2025 17:08:08 +0800 Subject: [PATCH 57/63] replace module with source --- .../module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp | 2 +- .../module_gint/temp_gint/gint_common.cpp | 2 +- .../module_gint/temp_gint/gint_dvlocal.cpp | 2 +- .../module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h | 2 +- .../module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h | 2 +- .../module_gint/temp_gint/gint_fvl_meta_gpu.h | 2 +- .../module_gint/temp_gint/kernel/cuda_mem_wrapper.h | 2 +- 
.../module_gint/temp_gint/kernel/gint_gpu_vars.cpp | 2 +- .../module_gint/temp_gint/kernel/gint_gpu_vars.h | 6 +++--- 9 files changed, 11 insertions(+), 11 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp index 4d0227bf71..7121694244 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp @@ -1,7 +1,7 @@ #include "source_base/ylm.h" #include "source_base/array_pool.h" #include "gint_atom.h" -#include "module_cell/unitcell.h" +#include "source_cell/unitcell.h" #include "gint_helper.h" namespace ModuleGint diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp index a34774a142..77c2b1dc76 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp @@ -4,7 +4,7 @@ #include "module_parameter/parameter.h" #ifdef __MPI -#include "module_base/blacs_connector.h" +#include "source_base/blacs_connector.h" #include #endif diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp index 24021a870c..78a8b91069 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp @@ -1,7 +1,7 @@ #include #include "gint_dvlocal.h" #include "phi_operator.h" -#include "module_base/parallel_reduce.h" +#include "source_base/parallel_reduce.h" namespace ModuleGint { diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h index 3160d11c1c..77976aad78 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h @@ -3,7 +3,7 @@ #include #include "module_hamilt_lcao/module_hcontainer/hcontainer.h" #include "module_hamilt_lcao/hamilt_lcaodft/LCAO_HS_arrays.hpp" -#include "module_base/abfs-vector3_order.h" +#include "source_base/abfs-vector3_order.h" #include "gint.h" #include "gint_info.h" diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h index 6509968a85..6d3d341e64 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h @@ -3,7 +3,7 @@ #include #include #include "module_hamilt_lcao/module_hcontainer/hcontainer.h" -#include "module_base/matrix.h" +#include "source_base/matrix.h" #include "gint.h" #include "gint_info.h" #include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h index 648bc6877a..22baba9d6d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h @@ -3,7 +3,7 @@ #include #include #include "module_hamilt_lcao/module_hcontainer/hcontainer.h" -#include "module_base/matrix.h" +#include "source_base/matrix.h" #include "gint.h" #include "gint_info.h" #include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h index b23fa15792..9b7ad27e83 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h @@ -1,6 +1,6 @@ #pragma once #include -#include "module_base/tool_quit.h" +#include 
"source_base/tool_quit.h" #include "gint_helper.cuh" template diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp index 9d75a3fed4..f4443762f0 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp @@ -1,5 +1,5 @@ #include "gint_gpu_vars.h" -#include "module_base/module_device/device.h" +#include "source_base/module_device/device.h" namespace ModuleGint { diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h index 50bb80c02f..7d2515b3b0 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h @@ -2,9 +2,9 @@ #include #include "set_const_mem.cuh" -#include "module_base/ylm.h" -#include "module_cell/unitcell.h" -#include "module_cell/atom_spec.h" +#include "source_base/ylm.h" +#include "source_cell/unitcell.h" +#include "source_cell/atom_spec.h" #include "module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h" #include "gint_helper.cuh" #include "module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cuh" From 57b4e7f465753318a18ac78e3ff0474572bc1302 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 24 Jun 2025 17:30:43 +0800 Subject: [PATCH 58/63] include missing header --- .../module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index 5941c5cf4c..a66b061ab3 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -1,5 +1,6 @@ #include "gint_interface.h" 
#include "source_base/timer.h" +#include "module_parameter/parameter.h" #include "gint_vl.h" #include "gint_vl_metagga.h" #include "gint_vl_nspin4.h" From d6459d2457999a9e28883f35142187b145aa633b Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Tue, 24 Jun 2025 18:38:02 +0800 Subject: [PATCH 59/63] fix a bug --- source/module_io/write_HS_R.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/source/module_io/write_HS_R.cpp b/source/module_io/write_HS_R.cpp index 01907432cb..e2540dca40 100644 --- a/source/module_io/write_HS_R.cpp +++ b/source/module_io/write_HS_R.cpp @@ -146,8 +146,6 @@ void ModuleIO::output_dHR(const int& istep, GlobalV::ofs_running << " | |" << std::endl; GlobalV::ofs_running << " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl; - gint_k.allocate_pvdpR(); - const int nspin = PARAM.inp.nspin; if (nspin == 1 || nspin == 4) From 12a1883ce7d1da897be57db612170905bc688b89 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Wed, 25 Jun 2025 11:15:01 +0800 Subject: [PATCH 60/63] update some result.ref --- tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref | 6 +++--- tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref b/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref index 4fb5aa8716..540c8789a6 100644 --- a/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref +++ b/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref @@ -1,8 +1,8 @@ etotref -74.3929556166736603 etotperatomref -14.8785911233 -totalforceref 778.174241 -totalstressref 1272.711589 +totalforceref 1495.625575 +totalstressref 574.321174 totaldosref 12 deepks_desc 8.045214 -deepks_dm_eig 29.53046025202608 +deepks_dm_eig 29.530460252025964 totaltimeref 1.12 diff --git a/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref b/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref index 8d613e5354..123d104b33 100644 --- a/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref +++ 
b/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref @@ -1,7 +1,7 @@ etotref -469.5735907784966230 etotperatomref -156.5245302595 -totalforceref 330.972666 -totalstressref 24771.556634 +totalforceref 10.194156 +totalstressref 510.485544 totaldosref 28 deepks_desc 2.126589 deepks_dm_eig 10.532812121143177 From 5580c3406bd3fa068f7aa7fec6ac3bda2a769f50 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Wed, 25 Jun 2025 15:58:14 +0800 Subject: [PATCH 61/63] fix compilation error when MPI is disabled --- .../module_gint/temp_gint/gint_common.cpp | 16 +++++++++++----- .../module_gint/temp_gint/gint_common.h | 2 +- .../module_gint/temp_gint/gint_env_gamma.cpp | 3 ++- .../module_gint/temp_gint/gint_env_gamma.h | 2 +- .../module_gint/temp_gint/gint_env_k.cpp | 3 ++- .../module_gint/temp_gint/gint_env_k.h | 1 + source/module_io/get_wf_lcao.cpp | 6 +++--- 7 files changed, 21 insertions(+), 12 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp index 77c2b1dc76..39b63191e7 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp @@ -167,7 +167,7 @@ void transfer_dm_2d_to_gint( HContainer dm_full = gint_info.get_hr(npol); hamilt::transferParallels2Serials(*dm[0], &dm_full); #else - HContainer& dm_full = dm[0]; + HContainer& dm_full = *(dm[0]); #endif std::vector tmp_pointer(4, nullptr); for (int iap = 0; iap < dm_full.size_atom_pairs(); iap++) @@ -220,6 +220,8 @@ int localIndex(int globalindex, int nblk, int nprocs, int& myproc) template void wfc_2d_to_gint(const T* wfc_2d, + int nbands, // needed if MPI is disabled + int nlocal, // needed if MPI is disabled const Parallel_Orbitals& pv, T* wfc_gint, const GintInfo& gint_info) @@ -227,11 +229,11 @@ void wfc_2d_to_gint(const T* wfc_2d, ModuleBase::TITLE("Gint", "wfc_2d_to_gint"); ModuleBase::timer::tick("Gint", "wfc_2d_to_gint"); +#ifdef __MPI // 
dimension related - const int nlocal = pv.desc_wfc[2]; - const int nbands = pv.desc_wfc[3]; + nlocal = pv.desc_wfc[2]; + nbands = pv.desc_wfc[3]; -#ifdef __MPI const std::vector& trace_lo = gint_info.get_trace_lo(); // MPI and memory related @@ -303,7 +305,7 @@ void wfc_2d_to_gint(const T* wfc_2d, { for (int j = 0; j < nlocal; ++j) { - wfc_k_grid[i * nlocal + j] = psi[0](i, j); + wfc_gint[i * nlocal + j] = wfc_2d[i * nlocal + j]; } } #endif @@ -326,11 +328,15 @@ template void transfer_dm_2d_to_gint( std::vector>>& dm_gint); template void wfc_2d_to_gint( const double* wfc_2d, + int nbands, + int nlocal, const Parallel_Orbitals& pv, double* wfc_grid, const GintInfo& gint_info); template void wfc_2d_to_gint( const std::complex* wfc_2d, + int nbands, + int nlocal, const Parallel_Orbitals& pv, std::complex* wfc_grid, const GintInfo& gint_info); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h index 15258e92ad..485978ccf8 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h @@ -20,5 +20,5 @@ namespace ModuleGint std::vector>& dm_gint); template - void wfc_2d_to_gint(const T* wfc_2d, const Parallel_Orbitals& pv, T* wfc_grid, const GintInfo& gint_info); + void wfc_2d_to_gint(const T* wfc_2d, int nbands, int nlocal, const Parallel_Orbitals& pv, T* wfc_grid, const GintInfo& gint_info); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp index 2b856df887..71fabbd703 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp @@ -9,11 +9,12 @@ Gint_env_gamma::Gint_env_gamma( const double* psid, const Parallel_Orbitals* pv, const int nbands, + const int nlocal, double* rho) :rho_(rho) { wfc_gint_.resize(nbands * 
gint_info_->get_lgd()); - wfc_2d_to_gint(psid, *pv, wfc_gint_.data(), *gint_info_); + wfc_2d_to_gint(psid, nbands, nlocal, *pv, wfc_gint_.data(), *gint_info_); } void Gint_env_gamma::cal_env_band(const int iband) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h index fbd7c85754..6ba3dca4fa 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h @@ -16,12 +16,12 @@ class Gint_env_gamma : public Gint const double* psid, const Parallel_Orbitals* pv, const int nbands, + const int nlocal, double* rho); void cal_env_band(const int iband); private: - // output double* rho_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp index 9813710e3a..b92ed8ddfc 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp @@ -11,6 +11,7 @@ Gint_env_k::Gint_env_k( const std::vector& kvec_c, const std::vector& kvec_d, const int nbands, + const int nlocal, const int ik, const int nspin, const int npol, @@ -18,7 +19,7 @@ Gint_env_k::Gint_env_k( :kvec_c_(kvec_c), kvec_d_(kvec_d), ik_(ik), nspin_(nspin), npol_(npol), rho_(rho) { wfc_gint_.resize(nbands * gint_info_->get_lgd()); - wfc_2d_to_gint(psid, *pv, wfc_gint_.data(), *gint_info_); + wfc_2d_to_gint(psid, nbands, nlocal, *pv, wfc_gint_.data(), *gint_info_); } void Gint_env_k::cal_env_band(const int iband) diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h index 31938bc73e..4d1232e591 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h @@ -18,6 +18,7 @@ class Gint_env_k : public Gint const std::vector& 
kvec_c, const std::vector& kvec_d, const int nbands, + const int nlocal, const int ik, const int nspin, const int npol, diff --git a/source/module_io/get_wf_lcao.cpp b/source/module_io/get_wf_lcao.cpp index 1af323e6e5..5de26f61a1 100644 --- a/source/module_io/get_wf_lcao.cpp +++ b/source/module_io/get_wf_lcao.cpp @@ -112,7 +112,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } #endif #else - ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), &para_orb, nbands, pes_->charge->rho[is]); + ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), &para_orb, nbands, nlocal, pes_->charge->rho[is]); #endif for (int ib = 0; ib < nbands; ++ib) { @@ -189,7 +189,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } #endif #else - ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), &para_orb, nbands, pes_->charge->rho[is]); + ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), &para_orb, nbands, nlocal, pes_->charge->rho[is]); #endif for (int ib = 0; ib < nbands; ++ib) { @@ -341,7 +341,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, #endif #else ModuleGint::Gint_env_k gint_env(psi->get_pointer(), &para_orb, kv.kvec_c, kv.kvec_d, - nbands, ik, PARAM.inp.nspin, PARAM.globalv.npol, pes_->charge->rho[ispin]); + nbands, nlocal, ik, PARAM.inp.nspin, PARAM.globalv.npol, pes_->charge->rho[ispin]); #endif for (int ib = 0; ib < nbands; ++ib) From c6547fc9b0935ac172553fc992d9c3c69f507328 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 26 Jun 2025 11:23:21 +0800 Subject: [PATCH 62/63] add makefile support --- source/Makefile.Objects | 35 ++++++++++++++++--- .../module_gint/CMakeLists.txt | 10 +++--- .../{gint_fvl.cpp => gint_fvl_old.cpp} | 0 .../module_gint/{gint.cpp => gint_old.cpp} | 0 .../{gint_rho.cpp => gint_rho_old.cpp} | 0 .../{gint_tau.cpp => gint_tau_old.cpp} | 0 .../{gint_vl.cpp => gint_vl_old.cpp} | 0 7 files changed, 35 insertions(+), 10 deletions(-) rename source/module_hamilt_lcao/module_gint/{gint_fvl.cpp => gint_fvl_old.cpp} (100%) rename
source/module_hamilt_lcao/module_gint/{gint.cpp => gint_old.cpp} (100%) rename source/module_hamilt_lcao/module_gint/{gint_rho.cpp => gint_rho_old.cpp} (100%) rename source/module_hamilt_lcao/module_gint/{gint_tau.cpp => gint_tau_old.cpp} (100%) rename source/module_hamilt_lcao/module_gint/{gint_vl.cpp => gint_vl_old.cpp} (100%) diff --git a/source/Makefile.Objects b/source/Makefile.Objects index 4430902fae..0135a5b05f 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -61,6 +61,7 @@ VPATH=./src_global:\ ./module_hamilt_lcao/module_deltaspin:\ ./module_hamilt_lcao/hamilt_lcaodft/operator_lcao:\ ./module_hamilt_lcao/module_gint:\ +./module_hamilt_lcao/module_gint/temp_gint:\ ./module_relax:\ ./module_hamilt_general/module_vdw:\ ./module_io:\ @@ -273,13 +274,13 @@ OBJS_ESOLVER_LCAO=esolver_ks_lcao.o\ lcao_others.o\ esolver_dm2rho.o\ -OBJS_GINT=gint.o\ +OBJS_GINT=gint_old.o\ gint_gamma_env.o\ gint_gamma_vl.o\ - gint_fvl.o\ - gint_rho.o\ - gint_tau.o\ - gint_vl.o\ + gint_fvl_old.o\ + gint_rho_old.o\ + gint_tau_old.o\ + gint_vl_old.o\ gint_k_env.o\ gint_k_sparse1.o\ gint_k_pvpr.o\ @@ -298,6 +299,30 @@ OBJS_GINT=gint.o\ cal_ddpsir_ylm.o\ mult_psi_dmr.o\ init_orb.o\ + batch_biggrid.o\ + big_grid.o\ + biggrid_info.o\ + divide_info.o\ + gint_atom.o\ + gint_common.o\ + gint_dvlocal.o\ + gint_env_gamma.o\ + gint_env_k.o\ + gint_fvl_meta.o\ + gint_fvl.o\ + gint_info.o\ + gint_interface.o\ + gint_rho.o\ + gint_tau.o\ + gint_vl_metagga_nspin4.o\ + gint_vl_metagga.o\ + gint_vl_nspin4.o\ + gint_vl.o\ + gint.o\ + localcell_info.o\ + phi_operator.o\ + set_ddphi.o\ + unitcell_info.o\ OBJS_HAMILT=hamilt_pw.o\ hamilt_sdft_pw.o\ diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 69db75605c..0505957b9c 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -2,13 +2,13 @@ if(ENABLE_LCAO) list(APPEND objects - gint.cpp + 
gint_old.cpp gint_gamma_env.cpp gint_gamma_vl.cpp - gint_fvl.cpp - gint_rho.cpp - gint_tau.cpp - gint_vl.cpp + gint_fvl_old.cpp + gint_rho_old.cpp + gint_tau_old.cpp + gint_vl_old.cpp gint_k_env.cpp gint_k_sparse1.cpp gint_k_pvpr.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_fvl.cpp b/source/module_hamilt_lcao/module_gint/gint_fvl_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_fvl.cpp rename to source/module_hamilt_lcao/module_gint/gint_fvl_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint.cpp rename to source/module_hamilt_lcao/module_gint/gint_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_rho.cpp b/source/module_hamilt_lcao/module_gint/gint_rho_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_rho.cpp rename to source/module_hamilt_lcao/module_gint/gint_rho_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_tau.cpp b/source/module_hamilt_lcao/module_gint/gint_tau_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_tau.cpp rename to source/module_hamilt_lcao/module_gint/gint_tau_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_vl.cpp b/source/module_hamilt_lcao/module_gint/gint_vl_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_vl.cpp rename to source/module_hamilt_lcao/module_gint/gint_vl_old.cpp From 36564908704290753a8267aec99e0e64e1730e18 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 26 Jun 2025 23:09:16 +0800 Subject: [PATCH 63/63] fix compilation error --- source/module_io/get_pchg_lcao.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/module_io/get_pchg_lcao.cpp b/source/module_io/get_pchg_lcao.cpp index 9d19c19596..316ffdb3f6 100644 --- a/source/module_io/get_pchg_lcao.cpp 
+++ b/source/module_io/get_pchg_lcao.cpp @@ -76,7 +76,7 @@ void Get_pchg_lcao::begin(Gint_Gamma& gg, Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); gg.cal_gint(&inout); #else - ModuleGint::cal_gint_rho(DM.get_DMR_vector(), PARAM.inp.nspin, rho); + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); #endif // A solution to replace the original implementation of the following code: @@ -175,7 +175,7 @@ void Get_pchg_lcao::begin(Gint_k& gk, Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); gk.cal_gint(&inout); #else - ModuleGint::cal_gint_rho(DM.get_DMR_vector(), PARAM.inp.nspin, rho); + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); #endif @@ -222,7 +222,7 @@ void Get_pchg_lcao::begin(Gint_k& gk, Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); gk.cal_gint(&inout); #else - ModuleGint::cal_gint_rho(DM.get_DMR_vector(), PARAM.inp.nspin, rho); + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); #endif // Using std::vector to replace the original double** rho_save std::vector> rho_save(nspin, std::vector(rhopw_nrxx));