From f05a1071730d91100d918db49bdc3e6d6e95a99f Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 6 Nov 2025 19:28:56 +0800 Subject: [PATCH 1/7] correct some OpenMP usages not supported by nvc++ --- source/source_base/kernels/math_kernel_op.cpp | 8 ++--- .../kernels/math_kernel_op_vec.cpp | 12 +++---- .../source_base/module_device/memory_op.cpp | 2 +- .../module_external/blas_connector_vector.cpp | 8 ++--- .../module_mixing/broyden_mixing.cpp | 8 ++--- .../source_base/module_mixing/mixing_data.h | 2 +- .../module_mixing/plain_mixing.cpp | 8 ++--- .../module_mixing/pulay_mixing.cpp | 6 ++-- .../source_basis/module_pw/pw_gatherscatter.h | 4 +-- .../source_basis/module_pw/pw_transform.cpp | 34 +++++++++---------- .../source_basis/module_pw/pw_transform_k.cpp | 24 ++++++------- source/source_estate/kernels/elecstate_op.cpp | 2 +- .../source_hsolver/kernels/bpcg_kernel_op.cpp | 2 +- source/source_psi/psi_initializer.cpp | 6 ++-- source/source_psi/psi_initializer_nao.cpp | 2 +- source/source_pw/module_pwdft/VNL_in_pw.cpp | 2 +- .../module_pwdft/kernels/veff_op.cpp | 2 +- 17 files changed, 66 insertions(+), 66 deletions(-) diff --git a/source/source_base/kernels/math_kernel_op.cpp b/source/source_base/kernels/math_kernel_op.cpp index 343d342e00..aa5d365319 100644 --- a/source/source_base/kernels/math_kernel_op.cpp +++ b/source/source_base/kernels/math_kernel_op.cpp @@ -81,7 +81,7 @@ struct matrixTranspose_op T* temp = nullptr; base_device::memory::resize_memory_op()(temp, row * col, "MTransOp"); #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T)) +#pragma omp parallel for collapse(2) schedule(static) #endif for (int j = 0; j < col; j++) { @@ -91,7 +91,7 @@ struct matrixTranspose_op } } #ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(T)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < row * col; i++) { @@ -107,7 +107,7 @@ struct matrixCopy void operator()(const int& n1, const int& n2, const T* 
A, const int& LDA, T* B, const int& LDB) { #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T)) +#pragma omp parallel for collapse(2) schedule(static) #endif for (int i = 0; i < n1; i++) { @@ -130,7 +130,7 @@ struct matrix_mul_vector_op { T *c, const int &ldc){ #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T)) +#pragma omp parallel for collapse(2) schedule(static) #endif for (int j = 0; j < n; j++){ for (int i = 0; i < m; i++){ diff --git a/source/source_base/kernels/math_kernel_op_vec.cpp b/source/source_base/kernels/math_kernel_op_vec.cpp index 8353b82660..da3fdb8f06 100644 --- a/source/source_base/kernels/math_kernel_op_vec.cpp +++ b/source/source_base/kernels/math_kernel_op_vec.cpp @@ -24,7 +24,7 @@ struct vector_mul_real_op void operator()(const int dim, T* result, const T* vector, const Real constant) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < dim; i++) { @@ -42,7 +42,7 @@ struct vector_mul_vector_op if (add) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < dim; i++) { @@ -52,7 +52,7 @@ struct vector_mul_vector_op else { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < dim; i++) { @@ -69,7 +69,7 @@ struct vector_div_constant_op void operator()(const int& dim, T* result, const T* vector, const Real constant) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < dim; i++) { @@ -85,7 +85,7 @@ struct vector_div_vector_op void operator()(const int& dim, T* result, const T* vector1, const Real* vector2) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#pragma 
omp parallel for schedule(static) #endif for (int i = 0; i < dim; i++) { @@ -121,7 +121,7 @@ struct vector_add_vector_op const Real constant2) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(T)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < dim; i++) { diff --git a/source/source_base/module_device/memory_op.cpp b/source/source_base/module_device/memory_op.cpp index ac2549e182..2ef4be588a 100644 --- a/source/source_base/module_device/memory_op.cpp +++ b/source/source_base/module_device/memory_op.cpp @@ -108,7 +108,7 @@ struct cast_memory_op::type; if (device_type == base_device::AbacusDevice_t::CpuDevice) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < dim; i++) { @@ -391,7 +391,7 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec using Real = typename GetTypeReal::type; if (device_type == base_device::AbacusDevice_t::CpuDevice) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < dim; i++) { @@ -412,7 +412,7 @@ void vector_add_vector(const int& dim, float *result, const float *vector1, cons { if (device_type == base_device::CpuDevice){ #ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(float)) +#pragma omp parallel for schedule(static ) #endif for (int i = 0; i < dim; i++) { @@ -454,7 +454,7 @@ void vector_add_vector(const int& dim, std::complex *result, const std::c { if (device_type == base_device::CpuDevice){ #ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < dim; i++) { diff --git a/source/source_base/module_mixing/broyden_mixing.cpp b/source/source_base/module_mixing/broyden_mixing.cpp index c5f8e5e025..eabc0297a7 100644 --- 
a/source/source_base/module_mixing/broyden_mixing.cpp +++ b/source/source_base/module_mixing/broyden_mixing.cpp @@ -32,7 +32,7 @@ void Broyden_Mixing::tem_push_data(Mixing_Data& mdata, const size_t length = mdata.length; std::vector F_tmp(length); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < length; ++i) { @@ -72,7 +72,7 @@ void Broyden_Mixing::tem_push_data(Mixing_Data& mdata, dF = malloc(sizeof(FPTYPE) * length * mixing_ndim); FP_dF = static_cast(dF); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < length; ++i) { @@ -84,7 +84,7 @@ void Broyden_Mixing::tem_push_data(Mixing_Data& mdata, this->ndim_cal_dF = std::min(this->ndim_cal_dF + 1, this->mixing_ndim); start_dF = (this->start_dF + 1) % this->mixing_ndim; #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < length; ++i) { @@ -192,7 +192,7 @@ void Broyden_Mixing::tem_cal_coef(const Mixing_Data& mdata, std::functionndim_history; FPTYPE* FP_startdata = static_cast(this->data) + this->start * this->length; #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096/sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (std::size_t i = 0; i < length; ++i) { diff --git a/source/source_base/module_mixing/plain_mixing.cpp b/source/source_base/module_mixing/plain_mixing.cpp index 81bdd659ef..591519e79f 100644 --- a/source/source_base/module_mixing/plain_mixing.cpp +++ b/source/source_base/module_mixing/plain_mixing.cpp @@ -30,7 +30,7 @@ void Plain_Mixing::tem_push_data(Mixing_Data& mdata, const size_t length = mdata.length; std::vector F_tmp(length); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < 
length; ++i) { @@ -68,7 +68,7 @@ void Plain_Mixing::simple_mix(FPTYPE* data_new, if (screen == nullptr) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ig = 0; ig < length; ig++) { @@ -79,7 +79,7 @@ void Plain_Mixing::simple_mix(FPTYPE* data_new, { std::vector F_tmp(length); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < length; ++i) { @@ -87,7 +87,7 @@ void Plain_Mixing::simple_mix(FPTYPE* data_new, } screen(F_tmp.data()); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < length; ++i) { diff --git a/source/source_base/module_mixing/pulay_mixing.cpp b/source/source_base/module_mixing/pulay_mixing.cpp index c283a5c2e7..db25d2171d 100644 --- a/source/source_base/module_mixing/pulay_mixing.cpp +++ b/source/source_base/module_mixing/pulay_mixing.cpp @@ -32,7 +32,7 @@ void Pulay_Mixing::tem_push_data(Mixing_Data& mdata, std::vector F_tmp(length); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (std::size_t i = 0; i < length; ++i) { @@ -67,7 +67,7 @@ void Pulay_Mixing::tem_push_data(Mixing_Data& mdata, F = malloc(sizeof(FPTYPE) * length * mixing_ndim); FP_F = static_cast(F); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (std::size_t i = 0; i < length; ++i) { @@ -79,7 +79,7 @@ void Pulay_Mixing::tem_push_data(Mixing_Data& mdata, start_F = (this->start_F + 1) % this->mixing_ndim; FPTYPE* FP_startF = FP_F + start_F * length; #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (std::size_t i = 0; i < length; ++i) { diff --git 
a/source/source_basis/module_pw/pw_gatherscatter.h b/source/source_basis/module_pw/pw_gatherscatter.h index e6b5998446..5a1a5a37c1 100644 --- a/source/source_basis/module_pw/pw_gatherscatter.h +++ b/source/source_basis/module_pw/pw_gatherscatter.h @@ -102,7 +102,7 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const if(this->poolnproc == 1) //In this case nrxx=fftnx*fftny*nz, nst = nstot, { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096/sizeof(T)) +#pragma omp parallel for schedule(static) #endif for(int i = 0; i < this->nrxx; ++i) { @@ -160,7 +160,7 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const } #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096/sizeof(T)) +#pragma omp parallel for schedule(static) #endif for(int i = 0; i < this->nrxx; ++i) { diff --git a/source/source_basis/module_pw/pw_transform.cpp b/source/source_basis/module_pw/pw_transform.cpp index 31dbf8b954..1e4e45bbbc 100644 --- a/source/source_basis/module_pw/pw_transform.cpp +++ b/source/source_basis/module_pw/pw_transform.cpp @@ -31,7 +31,7 @@ void PW_Basis::real2recip(const std::complex* in, assert(this->gamma_only == false); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ir = 0; ir < this->nrxx; ++ir) { @@ -47,7 +47,7 @@ void PW_Basis::real2recip(const std::complex* in, { FPTYPE tmpfac = factor / FPTYPE(this->nxyz); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ig = 0; ig < this->npw; ++ig) { @@ -58,7 +58,7 @@ void PW_Basis::real2recip(const std::complex* in, { FPTYPE tmpfac = 1.0 / FPTYPE(this->nxyz); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ig = 0; ig < this->npw; ++ig) { @@ -83,7 +83,7 @@ void PW_Basis::real2recip(const FPTYPE* in, 
std::complex* out, const boo { const int npy = this->ny * this->nplane; #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for collapse(2) schedule(static) #endif for (int ix = 0; ix < this->nx; ++ix) { @@ -98,7 +98,7 @@ void PW_Basis::real2recip(const FPTYPE* in, std::complex* out, const boo else { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ir = 0; ir < this->nrxx; ++ir) { @@ -114,7 +114,7 @@ void PW_Basis::real2recip(const FPTYPE* in, std::complex* out, const boo { FPTYPE tmpfac = factor / FPTYPE(this->nxyz); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ig = 0; ig < this->npw; ++ig) { @@ -125,7 +125,7 @@ void PW_Basis::real2recip(const FPTYPE* in, std::complex* out, const boo { FPTYPE tmpfac = 1.0 / FPTYPE(this->nxyz); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ig = 0; ig < this->npw; ++ig) { @@ -151,7 +151,7 @@ void PW_Basis::recip2real(const std::complex* in, ModuleBase::timer::tick(this->classname, "recip2real"); assert(this->gamma_only == false); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < this->nst * this->nz; ++i) { @@ -159,7 +159,7 @@ void PW_Basis::recip2real(const std::complex* in, } #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ig = 0; ig < this->npw; ++ig) { @@ -174,7 +174,7 @@ void PW_Basis::recip2real(const std::complex* in, if (add) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ir = 0; ir < this->nrxx; ++ir) { 
@@ -184,7 +184,7 @@ void PW_Basis::recip2real(const std::complex* in, else { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ir = 0; ir < this->nrxx; ++ir) { @@ -206,7 +206,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo { ModuleBase::timer::tick(this->classname, "recip2real"); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < this->nst * this->nz; ++i) { @@ -214,7 +214,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo } #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ig = 0; ig < this->npw; ++ig) { @@ -234,7 +234,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo if (add) { #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for collapse(2) schedule(static) #endif for (int ix = 0; ix < this->nx; ++ix) { @@ -247,7 +247,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo else { #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for collapse(2) schedule(static) #endif for (int ix = 0; ix < this->nx; ++ix) { @@ -264,7 +264,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo if (add) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ir = 0; ir < this->nrxx; ++ir) { @@ -274,7 +274,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo else { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ir = 0; ir < this->nrxx; ++ir) { diff 
--git a/source/source_basis/module_pw/pw_transform_k.cpp b/source/source_basis/module_pw/pw_transform_k.cpp index 36290d091a..a8adc6a67b 100644 --- a/source/source_basis/module_pw/pw_transform_k.cpp +++ b/source/source_basis/module_pw/pw_transform_k.cpp @@ -34,7 +34,7 @@ void PW_Basis_K::real2recip(const std::complex* in, assert(this->gamma_only == false); auto* auxr = this->fft_bundle.get_auxr_data(); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ir = 0; ir < this->nrxx; ++ir) { @@ -53,7 +53,7 @@ void PW_Basis_K::real2recip(const std::complex* in, { FPTYPE tmpfac = factor / FPTYPE(this->nxyz); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int igl = 0; igl < npwk; ++igl) { @@ -64,7 +64,7 @@ void PW_Basis_K::real2recip(const std::complex* in, { FPTYPE tmpfac = 1.0 / FPTYPE(this->nxyz); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int igl = 0; igl < npwk; ++igl) { @@ -103,7 +103,7 @@ void PW_Basis_K::real2recip(const FPTYPE* in, // r2c in place const int npy = this->ny * this->nplane; #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for collapse(2) schedule(static) #endif for (int ix = 0; ix < this->nx; ++ix) { @@ -126,7 +126,7 @@ void PW_Basis_K::real2recip(const FPTYPE* in, { FPTYPE tmpfac = factor / FPTYPE(this->nxyz); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int igl = 0; igl < npwk; ++igl) { @@ -137,7 +137,7 @@ void PW_Basis_K::real2recip(const FPTYPE* in, { FPTYPE tmpfac = 1.0 / FPTYPE(this->nxyz); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for 
(int igl = 0; igl < npwk; ++igl) { @@ -176,7 +176,7 @@ void PW_Basis_K::recip2real(const std::complex* in, const int npwk = this->npwk[ik]; auto* auxg = this->fft_bundle.get_auxg_data(); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int igl = 0; igl < npwk; ++igl) { @@ -191,7 +191,7 @@ void PW_Basis_K::recip2real(const std::complex* in, if (add) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ir = 0; ir < this->nrxx; ++ir) { @@ -201,7 +201,7 @@ void PW_Basis_K::recip2real(const std::complex* in, else { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ir = 0; ir < this->nrxx; ++ir) { @@ -239,7 +239,7 @@ void PW_Basis_K::recip2real(const std::complex* in, const int npwk = this->npwk[ik]; auto* auxg = this->fft_bundle.get_auxg_data(); #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int igl = 0; igl < npwk; ++igl) { @@ -262,7 +262,7 @@ void PW_Basis_K::recip2real(const std::complex* in, if (add) { #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for collapse(2) schedule(static) #endif for (int ix = 0; ix < this->nx; ++ix) { @@ -275,7 +275,7 @@ void PW_Basis_K::recip2real(const std::complex* in, else { #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for collapse(2) schedule(static) #endif for (int ix = 0; ix < this->nx; ++ix) { diff --git a/source/source_estate/kernels/elecstate_op.cpp b/source/source_estate/kernels/elecstate_op.cpp index 0a1bd731da..c3933319fc 100644 --- a/source/source_estate/kernels/elecstate_op.cpp +++ b/source/source_estate/kernels/elecstate_op.cpp @@ -70,7 
+70,7 @@ struct elecstate_pw_op } else { #ifdef _OPENMP -#pragma omp parallel for collapse(2) schedule(static, 4096/sizeof(FPTYPE)) +#pragma omp parallel for collapse(2) schedule(static) #endif for (int is = 1; is < 4; is++) { diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp index 3d74591783..88f94e288c 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp +++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp @@ -196,7 +196,7 @@ struct refresh_hcc_scc_vcc_op const T &one) { #ifdef _OPENMP -#pragma omp parallel for collapse(1) schedule(static, 8192 / sizeof(T)) +#pragma omp parallel for collapse(1) schedule(static) #endif for (int i = 0; i < n; i++) { diff --git a/source/source_psi/psi_initializer.cpp b/source/source_psi/psi_initializer.cpp index 2f1fd46f99..cc260ec90c 100644 --- a/source/source_psi/psi_initializer.cpp +++ b/source/source_psi/psi_initializer.cpp @@ -92,7 +92,7 @@ void psi_initializer::random_t(T* psi, const int iw_start, const int iw_end, } // then for each g-component, initialize the wavefunction value #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(T)) +#pragma omp parallel for schedule(static) #endif for (int ig = 0; ig < ng; ig++) { @@ -104,7 +104,7 @@ void psi_initializer::random_t(T* psi, const int iw_start, const int iw_end, psi_slice[ig] = this->template cast_to_T(std::complex(rr * cos(arg), rr * sin(arg))); } #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(T)) +#pragma omp parallel for schedule(static) #endif for (int ig = ng; ig < npwk_max; ++ig) { @@ -151,7 +151,7 @@ void psi_initializer::random_t(T* psi, const int iw_start, const int iw_end, for (int ipol = 0; ipol < npol; ipol++) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(T)) +#pragma omp parallel for schedule(static) #endif for (int ig = 0; ig < ng; ig++) { diff --git a/source/source_psi/psi_initializer_nao.cpp 
b/source/source_psi/psi_initializer_nao.cpp index ff4f5667d4..7f74b40c44 100644 --- a/source/source_psi/psi_initializer_nao.cpp +++ b/source/source_psi/psi_initializer_nao.cpp @@ -264,7 +264,7 @@ void psi_initializer_nao::init_psig(T* psig, const int& ik) std::vector qnorm(npw); std::vector> q(npw); -#pragma omp parallel for schedule(static, 4096 / sizeof(double)) +#pragma omp parallel for schedule(static) for (int ig = 0; ig < npw; ig++) { q[ig] = this->pw_wfc_->getgpluskcar(ik, ig); diff --git a/source/source_pw/module_pwdft/VNL_in_pw.cpp b/source/source_pw/module_pwdft/VNL_in_pw.cpp index e26e09fa3e..5201722214 100644 --- a/source/source_pw/module_pwdft/VNL_in_pw.cpp +++ b/source/source_pw/module_pwdft/VNL_in_pw.cpp @@ -340,7 +340,7 @@ void pseudopot_cell_vnl::getvnl(Device* ctx, ModuleBase::Vector3* _gk = new ModuleBase::Vector3[npw]; #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ig = 0; ig < npw; ig++) { diff --git a/source/source_pw/module_pwdft/kernels/veff_op.cpp b/source/source_pw/module_pwdft/kernels/veff_op.cpp index 23646e9608..e8f4b18b3a 100644 --- a/source/source_pw/module_pwdft/kernels/veff_op.cpp +++ b/source/source_pw/module_pwdft/kernels/veff_op.cpp @@ -8,7 +8,7 @@ struct veff_pw_op void operator()(const base_device::DEVICE_CPU* dev, const int& size, std::complex* out, const FPTYPE* in) { #ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096/sizeof(FPTYPE)) +#pragma omp parallel for schedule(static) #endif for (int ir = 0; ir < size; ++ir) { From 54c7b96affe9644194ddfb8ac93f5d2d7c029e2b Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 6 Nov 2025 19:36:45 +0800 Subject: [PATCH 2/7] correct some usage not supported by nvc++ in gint module --- source/source_lcao/module_gint/gint_dvlocal.cpp | 5 +++-- source/source_lcao/module_gint/gint_env_gamma.cpp | 5 +++-- source/source_lcao/module_gint/gint_env_k.cpp | 5 +++-- 
source/source_lcao/module_gint/gint_fvl.cpp | 3 ++- source/source_lcao/module_gint/gint_fvl_meta.cpp | 3 ++- source/source_lcao/module_gint/gint_info.h | 1 + source/source_lcao/module_gint/gint_rho.cpp | 3 ++- source/source_lcao/module_gint/gint_tau.cpp | 3 ++- source/source_lcao/module_gint/gint_vl.cpp | 5 +++-- source/source_lcao/module_gint/gint_vl_metagga.cpp | 3 ++- source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp | 5 +++-- source/source_lcao/module_gint/gint_vl_metagga_nspin4.cpp | 3 ++- source/source_lcao/module_gint/gint_vl_nspin4.cpp | 3 ++- 13 files changed, 30 insertions(+), 17 deletions(-) diff --git a/source/source_lcao/module_gint/gint_dvlocal.cpp b/source/source_lcao/module_gint/gint_dvlocal.cpp index 78a8b91069..3d3888a0b9 100644 --- a/source/source_lcao/module_gint/gint_dvlocal.cpp +++ b/source/source_lcao/module_gint/gint_dvlocal.cpp @@ -33,9 +33,10 @@ void Gint_dvlocal::cal_hr_gint_() std::vector dphi_y; std::vector dphi_z; #pragma omp for schedule(dynamic) - for(const auto& biggrid: gint_info_->get_biggrids()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { - if(biggrid->get_atoms().empty()) + const auto& biggrid = gint_info_->get_biggrids()[i]; + if(biggrid->get_atoms().size() == 0) { continue; } diff --git a/source/source_lcao/module_gint/gint_env_gamma.cpp b/source/source_lcao/module_gint/gint_env_gamma.cpp index 71fabbd703..af024edf2f 100644 --- a/source/source_lcao/module_gint/gint_env_gamma.cpp +++ b/source/source_lcao/module_gint/gint_env_gamma.cpp @@ -28,9 +28,10 @@ void Gint_env_gamma::cal_env_band(const int iband) PhiOperator phi_op; std::vector phi; #pragma omp for schedule(dynamic) - for(const auto& biggrid: gint_info_->get_biggrids()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { - if(biggrid->get_atoms().empty()) + const auto& biggrid = gint_info_->get_biggrids()[i]; + if(biggrid->get_atoms().size() == 0) { continue; } diff --git a/source/source_lcao/module_gint/gint_env_k.cpp 
b/source/source_lcao/module_gint/gint_env_k.cpp index b92ed8ddfc..4f6ab1c0cf 100644 --- a/source/source_lcao/module_gint/gint_env_k.cpp +++ b/source/source_lcao/module_gint/gint_env_k.cpp @@ -33,9 +33,10 @@ void Gint_env_k::cal_env_band(const int iband) PhiOperator phi_op; std::vector phi; #pragma omp for schedule(dynamic) - for(const auto& biggrid: gint_info_->get_biggrids()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { - if(biggrid->get_atoms().empty()) + const auto& biggrid = gint_info_->get_biggrids()[i]; + if(biggrid->get_atoms().size() == 0) { continue; } diff --git a/source/source_lcao/module_gint/gint_fvl.cpp b/source/source_lcao/module_gint/gint_fvl.cpp index 3fc9bde005..11a7e1620d 100644 --- a/source/source_lcao/module_gint/gint_fvl.cpp +++ b/source/source_lcao/module_gint/gint_fvl.cpp @@ -49,8 +49,9 @@ void Gint_fvl::cal_fvl_svl_() svl_thread->zero_out(); } #pragma omp for schedule(dynamic) - for(const auto& biggrid: gint_info_->get_biggrids()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { + const auto& biggrid = gint_info_->get_biggrids()[i]; if(biggrid->get_atoms().size() == 0) { continue; diff --git a/source/source_lcao/module_gint/gint_fvl_meta.cpp b/source/source_lcao/module_gint/gint_fvl_meta.cpp index 3299600c99..728cd042a4 100644 --- a/source/source_lcao/module_gint/gint_fvl_meta.cpp +++ b/source/source_lcao/module_gint/gint_fvl_meta.cpp @@ -61,8 +61,9 @@ void Gint_fvl_meta::cal_fvl_svl_() svl_thread->zero_out(); } #pragma omp for schedule(dynamic) - for(const auto& biggrid: gint_info_->get_biggrids()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { + const auto& biggrid = gint_info_->get_biggrids()[i]; if(biggrid->get_atoms().size() == 0) { continue; diff --git a/source/source_lcao/module_gint/gint_info.h b/source/source_lcao/module_gint/gint_info.h index a2e35b6642..fc1dd106a2 100644 --- a/source/source_lcao/module_gint/gint_info.h +++ b/source/source_lcao/module_gint/gint_info.h @@ -35,6 +35,7 @@ class 
GintInfo // getter functions const std::vector>& get_biggrids() { return biggrids_; } + int get_bgrids_num() const { return static_cast(biggrids_.size()); } const std::vector& get_trace_lo() const{ return trace_lo_; } int get_lgd() const { return lgd_; } int get_nat() const { return ucell_->nat; } // return the number of atoms in the unitcell diff --git a/source/source_lcao/module_gint/gint_rho.cpp b/source/source_lcao/module_gint/gint_rho.cpp index c96b10a731..709be0507f 100644 --- a/source/source_lcao/module_gint/gint_rho.cpp +++ b/source/source_lcao/module_gint/gint_rho.cpp @@ -33,8 +33,9 @@ void Gint_rho::cal_rho_() std::vector phi; std::vector phi_dm; #pragma omp for schedule(dynamic) - for(const auto& biggrid: gint_info_->get_biggrids()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { + const auto& biggrid = gint_info_->get_biggrids()[i]; if(biggrid->get_atoms().size() == 0) { continue; diff --git a/source/source_lcao/module_gint/gint_tau.cpp b/source/source_lcao/module_gint/gint_tau.cpp index 1b5e282384..eb42b55b21 100644 --- a/source/source_lcao/module_gint/gint_tau.cpp +++ b/source/source_lcao/module_gint/gint_tau.cpp @@ -37,8 +37,9 @@ void Gint_tau::cal_tau_() std::vector dphi_y_dm; std::vector dphi_z_dm; #pragma omp for schedule(dynamic) - for(const auto& biggrid: gint_info_->get_biggrids()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { + const auto& biggrid = gint_info_->get_biggrids()[i]; if(biggrid->get_atoms().size() == 0) { continue; diff --git a/source/source_lcao/module_gint/gint_vl.cpp b/source/source_lcao/module_gint/gint_vl.cpp index 3cdd3c4549..4194bc77cb 100644 --- a/source/source_lcao/module_gint/gint_vl.cpp +++ b/source/source_lcao/module_gint/gint_vl.cpp @@ -34,9 +34,10 @@ void Gint_vl::cal_hr_gint_() std::vector phi; std::vector phi_vldr3; #pragma omp for schedule(dynamic) - for(const auto& biggrid: gint_info_->get_biggrids()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { - 
if(biggrid->get_atoms().empty()) + const auto& biggrid = gint_info_->get_biggrids()[i]; + if(biggrid->get_atoms().size() == 0) { continue; } diff --git a/source/source_lcao/module_gint/gint_vl_metagga.cpp b/source/source_lcao/module_gint/gint_vl_metagga.cpp index 56dd2ff6b0..a24fe5fe86 100644 --- a/source/source_lcao/module_gint/gint_vl_metagga.cpp +++ b/source/source_lcao/module_gint/gint_vl_metagga.cpp @@ -40,8 +40,9 @@ void Gint_vl_metagga::cal_hr_gint_() std::vector dphi_y_vldr3; std::vector dphi_z_vldr3; #pragma omp for schedule(dynamic) - for(const auto& biggrid: gint_info_->get_biggrids()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { + const auto& biggrid = gint_info_->get_biggrids()[i]; if(biggrid->get_atoms().size() == 0) { continue; diff --git a/source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp b/source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp index 9c2dad8421..8546d34045 100644 --- a/source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp +++ b/source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp @@ -64,9 +64,10 @@ void Gint_vl_metagga_gpu::cal_hr_gint_() CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); #pragma omp for schedule(dynamic) - for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { - if(bgrid_batch->empty()) + const auto& biggrid = gint_info_->get_biggrids()[i]; + if(biggrid->get_atoms().size() == 0) { continue; } diff --git a/source/source_lcao/module_gint/gint_vl_metagga_nspin4.cpp b/source/source_lcao/module_gint/gint_vl_metagga_nspin4.cpp index 5b4c45e5db..229152ea9b 100644 --- a/source/source_lcao/module_gint/gint_vl_metagga_nspin4.cpp +++ b/source/source_lcao/module_gint/gint_vl_metagga_nspin4.cpp @@ -41,8 +41,9 @@ void Gint_vl_metagga_nspin4::cal_hr_gint_() std::vector dphi_y_vldr3; std::vector dphi_z_vldr3; #pragma omp for schedule(dynamic) 
- for(const auto& biggrid: gint_info_->get_biggrids()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { + const auto& biggrid = gint_info_->get_biggrids()[i]; if(biggrid->get_atoms().size() == 0) { continue; diff --git a/source/source_lcao/module_gint/gint_vl_nspin4.cpp b/source/source_lcao/module_gint/gint_vl_nspin4.cpp index 14350eb218..576200397a 100644 --- a/source/source_lcao/module_gint/gint_vl_nspin4.cpp +++ b/source/source_lcao/module_gint/gint_vl_nspin4.cpp @@ -34,8 +34,9 @@ void Gint_vl_nspin4::cal_hr_gint_() std::vector phi; std::vector phi_vldr3; #pragma omp for schedule(dynamic) - for(const auto& biggrid: gint_info_->get_biggrids()) + for (int i = 0; i < gint_info_->get_bgrids_num(); i++) { + const auto& biggrid = gint_info_->get_biggrids()[i]; if(biggrid->get_atoms().size() == 0) { continue; From 3c0da861e096d59ee233f70513bcabb9f9dda9fb Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 4 Dec 2025 00:00:29 +0800 Subject: [PATCH 3/7] correct OpenMP usages not supported by nvc++ --- .../source_base/module_external/blas_connector_vector.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/source_base/module_external/blas_connector_vector.cpp b/source/source_base/module_external/blas_connector_vector.cpp index 2c0679837a..b650f16f8f 100644 --- a/source/source_base/module_external/blas_connector_vector.cpp +++ b/source/source_base/module_external/blas_connector_vector.cpp @@ -412,7 +412,7 @@ void vector_add_vector(const int& dim, float *result, const float *vector1, cons { if (device_type == base_device::CpuDevice){ #ifdef _OPENMP -#pragma omp parallel for schedule(static ) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < dim; i++) { @@ -433,7 +433,7 @@ void vector_add_vector(const int& dim, double *result, const double *vector1, co { if (device_type == base_device::CpuDevice){ #ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(double)) +#pragma omp parallel for 
schedule(static) #endif for (int i = 0; i < dim; i++) { @@ -475,7 +475,7 @@ void vector_add_vector(const int& dim, std::complex *result, const std:: { if (device_type == base_device::CpuDevice){ #ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex)) +#pragma omp parallel for schedule(static) #endif for (int i = 0; i < dim; i++) { From a9221269ea8100e7a412df4040390c564f55b3c0 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 4 Dec 2025 00:11:55 +0800 Subject: [PATCH 4/7] Update multiple Gint GPU module files --- source/source_lcao/module_gint/gint_fvl_gpu.cpp | 3 ++- source/source_lcao/module_gint/gint_fvl_meta_gpu.cpp | 3 ++- source/source_lcao/module_gint/gint_info.h | 1 + source/source_lcao/module_gint/gint_rho_gpu.cpp | 3 ++- source/source_lcao/module_gint/gint_tau_gpu.cpp | 3 ++- source/source_lcao/module_gint/gint_vl_gpu.cpp | 3 ++- source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp | 6 +++--- .../source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.cpp | 3 ++- source/source_lcao/module_gint/gint_vl_nspin4_gpu.cpp | 3 ++- 9 files changed, 18 insertions(+), 10 deletions(-) diff --git a/source/source_lcao/module_gint/gint_fvl_gpu.cpp b/source/source_lcao/module_gint/gint_fvl_gpu.cpp index 1d90304d2c..0bd8d58f21 100644 --- a/source/source_lcao/module_gint/gint_fvl_gpu.cpp +++ b/source/source_lcao/module_gint/gint_fvl_gpu.cpp @@ -93,8 +93,9 @@ void Gint_fvl_gpu::cal_fvl_svl_() CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); #pragma omp for schedule(dynamic) - for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + for (int i = 0; i < gint_info_->get_bgrid_batches_num(); ++i) { + const auto& bgrid_batch = gint_info_->get_bgrid_batches()[i]; if(bgrid_batch->empty()) { continue; diff --git a/source/source_lcao/module_gint/gint_fvl_meta_gpu.cpp b/source/source_lcao/module_gint/gint_fvl_meta_gpu.cpp index fa19925d04..9275f3f4c0 100644 --- a/source/source_lcao/module_gint/gint_fvl_meta_gpu.cpp +++ 
b/source/source_lcao/module_gint/gint_fvl_meta_gpu.cpp @@ -108,8 +108,9 @@ void Gint_fvl_meta_gpu::cal_fvl_svl_() CudaMemWrapper ddphi_yz(BatchBigGrid::get_max_phi_len(), stream, false); CudaMemWrapper ddphi_zz(BatchBigGrid::get_max_phi_len(), stream, false); #pragma omp for schedule(dynamic) - for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + for (int i = 0; i < gint_info_->get_bgrid_batches_num(); ++i) { + const auto& bgrid_batch = gint_info_->get_bgrid_batches()[i]; if(bgrid_batch->empty()) { continue; diff --git a/source/source_lcao/module_gint/gint_info.h b/source/source_lcao/module_gint/gint_info.h index fc1dd106a2..91ea1d913a 100644 --- a/source/source_lcao/module_gint/gint_info.h +++ b/source/source_lcao/module_gint/gint_info.h @@ -104,6 +104,7 @@ class GintInfo #ifdef __CUDA public: std::vector>& get_bgrid_batches() { return bgrid_batches_; }; + int get_bgrid_batches_num() const { return static_cast(bgrid_batches_.size()); }; std::shared_ptr get_gpu_vars() const { return gpu_vars_; }; int get_dev_id() const { return gpu_vars_->dev_id_; }; int get_streams_num() const { return streams_num_; }; diff --git a/source/source_lcao/module_gint/gint_rho_gpu.cpp b/source/source_lcao/module_gint/gint_rho_gpu.cpp index ca24002579..a51f7524a4 100644 --- a/source/source_lcao/module_gint/gint_rho_gpu.cpp +++ b/source/source_lcao/module_gint/gint_rho_gpu.cpp @@ -62,8 +62,9 @@ void Gint_rho_gpu::cal_rho_() CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); CudaMemWrapper phi_dm(BatchBigGrid::get_max_phi_len(), stream, false); #pragma omp for schedule(dynamic) - for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + for (int i = 0; i < gint_info_->get_bgrid_batches_num(); ++i) { + const auto& bgrid_batch = gint_info_->get_bgrid_batches()[i]; if(bgrid_batch->empty()) { continue; diff --git a/source/source_lcao/module_gint/gint_tau_gpu.cpp b/source/source_lcao/module_gint/gint_tau_gpu.cpp index cbeeead322..2063cc6855 100644 --- 
a/source/source_lcao/module_gint/gint_tau_gpu.cpp +++ b/source/source_lcao/module_gint/gint_tau_gpu.cpp @@ -66,8 +66,9 @@ void Gint_tau_gpu::cal_tau_() CudaMemWrapper dphi_y_dm(BatchBigGrid::get_max_phi_len(), stream, false); CudaMemWrapper dphi_z_dm(BatchBigGrid::get_max_phi_len(), stream, false); #pragma omp for schedule(dynamic) - for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + for (int i = 0; i < gint_info_->get_bgrid_batches_num(); ++i) { + const auto& bgrid_batch = gint_info_->get_bgrid_batches()[i]; if(bgrid_batch->empty()) { continue; diff --git a/source/source_lcao/module_gint/gint_vl_gpu.cpp b/source/source_lcao/module_gint/gint_vl_gpu.cpp index fe9162bc4e..359a078689 100644 --- a/source/source_lcao/module_gint/gint_vl_gpu.cpp +++ b/source/source_lcao/module_gint/gint_vl_gpu.cpp @@ -51,8 +51,9 @@ void Gint_vl_gpu::cal_hr_gint_() CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); #pragma omp for schedule(dynamic) - for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + for (int i = 0; i < gint_info_->get_bgrid_batches_num(); ++i) { + const auto& bgrid_batch = gint_info_->get_bgrid_batches()[i]; if(bgrid_batch->empty()) { continue; diff --git a/source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp b/source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp index 8546d34045..0dd101693d 100644 --- a/source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp +++ b/source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp @@ -64,10 +64,10 @@ void Gint_vl_metagga_gpu::cal_hr_gint_() CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); #pragma omp for schedule(dynamic) - for (int i = 0; i < gint_info_->get_bgrids_num(); i++) + for (int i = 0; i < gint_info_->get_bgrid_batches_num(); ++i) { - const auto& biggrid = gint_info_->get_biggrids()[i]; - 
if(biggrid->get_atoms().size() == 0) + const auto& bgrid_batch = gint_info_->get_bgrid_batches()[i]; + if(bgrid_batch->empty()) { continue; } diff --git a/source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.cpp b/source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.cpp index ec8b940710..6cc178bb9c 100644 --- a/source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.cpp +++ b/source/source_lcao/module_gint/gint_vl_metagga_nspin4_gpu.cpp @@ -72,8 +72,9 @@ void Gint_vl_metagga_nspin4_gpu::cal_hr_gint_() CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); #pragma omp for schedule(dynamic) - for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + for (int i = 0; i < gint_info_->get_bgrid_batches_num(); ++i) { + const auto& bgrid_batch = gint_info_->get_bgrid_batches()[i]; if(bgrid_batch->empty()) { continue; diff --git a/source/source_lcao/module_gint/gint_vl_nspin4_gpu.cpp b/source/source_lcao/module_gint/gint_vl_nspin4_gpu.cpp index 4f9b86c0b4..d5c21ef71d 100644 --- a/source/source_lcao/module_gint/gint_vl_nspin4_gpu.cpp +++ b/source/source_lcao/module_gint/gint_vl_nspin4_gpu.cpp @@ -63,8 +63,9 @@ void Gint_vl_nspin4_gpu::cal_hr_gint_() CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); #pragma omp for schedule(dynamic) - for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + for (int i = 0; i < gint_info_->get_bgrid_batches_num(); ++i) { + const auto& bgrid_batch = gint_info_->get_bgrid_batches()[i]; if(bgrid_batch->empty()) { continue; From cf892e9eb242d3162ee830afedeac5e21a83efda Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 4 Dec 2025 00:54:46 +0800 Subject: [PATCH 5/7] fix: workaround NVC++ internal compiler error in dftu_yukawa.cpp Add -O1 compile flag for dftu_yukawa.cpp when using NVHPC/PGI compiler to avoid 'gen_llvm_expr(): unknown 
opcode' internal compiler error. --- source/source_lcao/module_dftu/CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/source/source_lcao/module_dftu/CMakeLists.txt b/source/source_lcao/module_dftu/CMakeLists.txt index d412154970..dd09977b0b 100644 --- a/source/source_lcao/module_dftu/CMakeLists.txt +++ b/source/source_lcao/module_dftu/CMakeLists.txt @@ -16,6 +16,12 @@ add_library( ${objects} ) +# Workaround for NVC++ internal compiler error in dftu_yukawa.cpp +# Use -O1 optimization level for this specific file +if(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC|PGI") + set_source_files_properties(dftu_yukawa.cpp PROPERTIES COMPILE_FLAGS "-O1") +endif() + if(ENABLE_COVERAGE) add_coverage(dftu) -endif() \ No newline at end of file +endif() From 36b228ed56bb1a0f785474fc9db519fe11201890 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 4 Dec 2025 01:02:01 +0800 Subject: [PATCH 6/7] perf: use static const array for C6 and r0ab parameters in vdwd3 Replace std::vector with static const array to avoid runtime initialization overhead and reduce memory allocations. 
--- source/source_hamilt/module_vdw/vdwd3_parameters_tab.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/source_hamilt/module_vdw/vdwd3_parameters_tab.cpp b/source/source_hamilt/module_vdw/vdwd3_parameters_tab.cpp index bd06018e6e..534f845ea4 100644 --- a/source/source_hamilt/module_vdw/vdwd3_parameters_tab.cpp +++ b/source/source_hamilt/module_vdw/vdwd3_parameters_tab.cpp @@ -11,7 +11,7 @@ namespace vdw void Vdwd3Parameters::init_C6() { - std::vector C6_tmp = { + static const double C6_tmp[] = { 0.30267000E+1,0.100E+1,0.100E+1,0.91180000E+0,0.91180000E+0 ,0.20835000E+1,0.200E+1,0.100E+1,0.00000000E+0,0.91180000E+0 ,0.15583000E+1,0.200E+1,0.200E+1,0.00000000E+0,0.00000000E+0 @@ -32475,7 +32475,7 @@ void Vdwd3Parameters::init_rcov() void Vdwd3Parameters::init_r0ab() { - std::vector r = { + static const double r[] = { 2.1823, 1.8547, 1.7347, 2.9086, 2.5732, 3.4956, 2.3550, 2.5095, 2.9802, 3.0982, 2.5141, 2.3917, 2.9977, 2.9484, 3.2160, 2.4492, 2.2527, 3.1933, 3.0214, 2.9531, 2.9103, From 90f991c20231d5aa5db672684c2b36d46983fd97 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 4 Dec 2025 11:36:37 +0800 Subject: [PATCH 7/7] refactor: simplify loop and conditional logic in dftu_yukawa.cpp Refactor the loop over 'n' (which was restricted to n=0) and the switch statement in 'cal_slater_UJ' to use direct array access with index 0 and if-else blocks. This simplification resolves the NVC++ internal compiler error without needing the -O1 optimization workaround, so the workaround in CMakeLists.txt has been removed. 
--- source/source_lcao/module_dftu/CMakeLists.txt | 6 -- .../source_lcao/module_dftu/dftu_yukawa.cpp | 56 ++++++++----------- 2 files changed, 22 insertions(+), 40 deletions(-) diff --git a/source/source_lcao/module_dftu/CMakeLists.txt b/source/source_lcao/module_dftu/CMakeLists.txt index dd09977b0b..42a58af7ba 100644 --- a/source/source_lcao/module_dftu/CMakeLists.txt +++ b/source/source_lcao/module_dftu/CMakeLists.txt @@ -16,12 +16,6 @@ add_library( ${objects} ) -# Workaround for NVC++ internal compiler error in dftu_yukawa.cpp -# Use -O1 optimization level for this specific file -if(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC|PGI") - set_source_files_properties(dftu_yukawa.cpp PROPERTIES COMPILE_FLAGS "-O1") -endif() - if(ENABLE_COVERAGE) add_coverage(dftu) endif() diff --git a/source/source_lcao/module_dftu/dftu_yukawa.cpp b/source/source_lcao/module_dftu/dftu_yukawa.cpp index 4f124e9509..5f77388a75 100644 --- a/source/source_lcao/module_dftu/dftu_yukawa.cpp +++ b/source/source_lcao/module_dftu/dftu_yukawa.cpp @@ -149,42 +149,30 @@ void Plus_U::cal_slater_UJ(const UnitCell& ucell, double** rho, const int& nrxx) } this->cal_slater_Fk(ucell,L, T); - for (int n = 0; n < N; n++) - { - if (n != 0) - { - continue; - } - switch (L) - { - case 1: // p electrons - this->U_Yukawa[T][L][n] = this->Fk[T][L][n][0]; - this->J_Yukawa[T][L][n] = this->Fk[T][L][n][1] / 5.0; - break; - - case 2: // d electrons - this->U_Yukawa[T][L][n] = this->Fk[T][L][n][0]; - this->J_Yukawa[T][L][n] = (this->Fk[T][L][n][1] + this->Fk[T][L][n][2]) / 14.0; - break; - - case 3: // f electrons - if (Yukawa) - { - this->U_Yukawa[T][L][n] = this->Fk[T][L][n][0]; - } - this->J_Yukawa[T][L][n] = (286.0 * this->Fk[T][L][n][1] + 195.0 * this->Fk[T][L][n][2] - + 250.0 * this->Fk[T][L][n][3]) - / 6435.0; - break; - } + if( L == 1) + { + this->U_Yukawa[T][L][0] = this->Fk[T][L][0][0]; + this->J_Yukawa[T][L][0] = this->Fk[T][L][0][1] / 5.0; + } + else if( L == 2) + { + this->U_Yukawa[T][L][0] = 
this->Fk[T][L][0][0]; + this->J_Yukawa[T][L][0] = (this->Fk[T][L][0][1] + this->Fk[T][L][0][2]) / 14.0; + } + else if( L == 3) + { + this->U_Yukawa[T][L][0] = this->Fk[T][L][0][0]; + this->J_Yukawa[T][L][0] = (286.0 * this->Fk[T][L][0][1] + 195.0 * this->Fk[T][L][0][2] + + 250.0 * this->Fk[T][L][0][3]) + / 6435.0; + } - // Hartree to Rydeberg - this->U_Yukawa[T][L][n] *= 2.0; - this->J_Yukawa[T][L][n] *= 2.0; - // update current U with calculated U-J from Slater integrals - this->U[T] = this->U_Yukawa[T][L][n] - this->J_Yukawa[T][L][n]; - } // end n + // Hartree to Rydberg + this->U_Yukawa[T][L][0] *= 2.0; + this->J_Yukawa[T][L][0] *= 2.0; + // update current U with calculated U-J from Slater integrals + this->U[T] = this->U_Yukawa[T][L][0] - this->J_Yukawa[T][L][0]; } // end if } // end L } // end T