From f9b4d362ec35c25449c65ad264b4c1570984d6a1 Mon Sep 17 00:00:00 2001 From: Bonan Zhu Date: Sun, 1 Feb 2026 21:25:00 +0800 Subject: [PATCH 1/4] Fix double precision GPU bug by using GEMV instead of GEMM It appears that GEMM with dimension 1 can be buggy for GPU (cuBLAS) --- .../kernels/cuda/math_kernel_op.cu | 21 +++- source/source_hsolver/diago_dav_subspace.cpp | 87 ++++++++++++++ source/source_hsolver/diago_david.cpp | 112 +++++++++++++----- 3 files changed, 189 insertions(+), 31 deletions(-) diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu index 562760ca7a..041d7868ef 100644 --- a/source/source_base/kernels/cuda/math_kernel_op.cu +++ b/source/source_base/kernels/cuda/math_kernel_op.cu @@ -178,6 +178,25 @@ void gemv_op::operator()(const char& trans, cublasErrcheck(cublasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx)); } +template <> +void gemv_op::operator()(const char& trans, + const int& m, + const int& n, + const float* alpha, + const float* A, + const int& lda, + const float* X, + const int& incx, + const float* beta, + float* Y, + const int& incy) +{ + cublasOperation_t cutrans = judge_trans_op(false, trans, "gemv_op"); + cublasErrcheck(cublasSgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx)); +} + + + template <> void gemv_op, base_device::DEVICE_GPU>::operator()(const char& trans, const int& m, @@ -215,7 +234,7 @@ void gemv_op, base_device::DEVICE_GPU>::operator()(const ch cuDoubleComplex beta = make_cuDoubleComplex(beta_in->real(), beta_in->imag()); // icpc and nvcc have some compatible problems // We must use cuDoubleComplex instead of converting std::complex* to cuDoubleComplex* - cublasErrcheck(cublasZgemv(cublas_handle, cutrans, m, n, &alpha, (cuDoubleComplex*)A, lda, (cuDoubleComplex*)X, incx, &beta, (cuDoubleComplex*)Y, incx)); + cublasErrcheck(cublasZgemv(cublas_handle, cutrans, m, n, &alpha, (cuDoubleComplex*)A, lda, (cuDoubleComplex*)X, incx, &beta, (cuDoubleComplex*)Y, incy)); } template <> diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp index 8fa997774b..3bc53523d8 100644 --- a/source/source_hsolver/diago_dav_subspace.cpp +++ b/source/source_hsolver/diago_dav_subspace.cpp @@ -295,6 +295,8 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, } } + if (notconv > 1){ + #ifdef __DSP ModuleBase::gemm_op_mt() #else @@ -313,6 +315,28 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, this->zero, psi_iter + (nbase) * this->dim, this->dim); + } else + { + +#ifdef __DSP + ModuleBase::gemv_op_mt() +#else + ModuleBase::gemv_op() +#endif + ('N', + this->dim, // m: row of A + nbase, // n: col of A + this->one, // alpha + hpsi, // A dim * nbase + this->dim, // LDA: if(N) max(1,m) + vcc, // X nbase + 1, // incx + this->zero, // beta + psi_iter + (nbase) * this->dim, // Y dim + 1 // incy + ); + } + // Eigenvalues operation section Real* e_temp_hd = eigenvalue_iter->data(); @@ -325,6 +349,8 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, // vcc = - vcc * eigenvalue ModuleBase::matrix_mul_vector_op()(nbase, notconv, vcc, this->nbase_x, e_temp_hd, -1.0, vcc, this->nbase_x); + if (notconv > 1){ + #ifdef __DSP ModuleBase::gemm_op_mt() #else @@ -343,6 +369,26 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, this->one, psi_iter + nbase * this->dim, this->dim); + } else + { +#ifdef __DSP + ModuleBase::gemv_op_mt() +#else + ModuleBase::gemv_op() +#endif + ('N', + this->dim, // m: row of A + nbase, // n: col of A + this->one, // alpha + spsi, // A dim * nbase + this->dim, // LDA: if(N) max(1,m) + vcc, // X nbase + 1, // incx + this->one, // beta + psi_iter + nbase * this->dim, // Y dim + 1 // incy + ); + } // Precondition section #if defined(__CUDA) || defined(__ROCM) @@ -413,6 +459,7 @@ void Diago_DavSubspace::cal_elem(const int& dim, { ModuleBase::timer::tick("Diago_DavSubspace", "cal_elem"); + if (notconv > 1){ #ifdef __DSP ModuleBase::gemm_op_mt() #else @@ -451,6 +498,46 @@ void Diago_DavSubspace::cal_elem(const int& dim, &scc[nbase * this->nbase_x], this->nbase_x); + } else { + +#ifdef __DSP + ModuleBase::gemv_op_mt() +#else + ModuleBase::gemv_op() +#endif + ('C', + this->dim, // m: row of A + nbase + notconv, // n: col of A + this->one, // alpha + psi_iter, // A dim * nbase + this->dim, // LDA: if(N) max(1,m) + &hpsi[nbase * this->dim], // X nbase + 1, // incx + this->zero, // beta + &hcc[nbase * this->nbase_x], // Y dim + 1 // incy + ); +#ifdef __DSP + ModuleBase::gemv_op_mt() +#else + ModuleBase::gemv_op() +#endif + ('C', + this->dim, // m: row of A + nbase + notconv, // n: col of A + this->one, // alpha + psi_iter, // A dim * nbase + this->dim, // LDA: if(N) max(1,m) + spsi + nbase * this->dim, // X nbase + 1, // incx + this->zero, // beta + &scc[nbase * this->nbase_x], // Y dim + 1 // incy + ); + + } + + #ifdef __MPI if (this->diag_comm.nproc > 1) { diff --git a/source/source_hsolver/diago_david.cpp b/source/source_hsolver/diago_david.cpp index bf390104af..ef4ba67cf3 100644 --- a/source/source_hsolver/diago_david.cpp +++ b/source/source_hsolver/diago_david.cpp @@ -351,7 +351,24 @@ void DiagoDavid::cal_grad(const HPsiFunc& hpsi_func, // basis[nbase] = hpsi * vc_ev_vector = hpsi*vcc // basis' = vc_ev_vector' * hpsi' // (dim, notconv) (dim, nbase) (nbase, notconv) - ModuleBase::gemm_op()('N', + if (notconv == 1){ + //Reuse gemv for vector case to avoid potential bug using gemm call with n=1 + ModuleBase::gemv_op()('N', + dim, // m: row of A + nbase, // n: col of A + this->one, // alpha + hpsi, // A dim * nbase + dim, // LDA: if(N) max(1,m) + vc_ev_vector, // X nbase + 1, // incx + this->zero, // beta + basis + dim * nbase, // Y dim + 1 // incy + ); + + }else + { + ModuleBase::gemm_op()('N', 'N', dim, // m: row of A,C notconv, // n: col of B,C @@ -364,7 +381,8 @@ void DiagoDavid::cal_grad(const HPsiFunc& hpsi_func, this->zero, // belta basis + dim * nbase, // C dim * notconv dim // LDC: if(N) max(1, m) - ); + ); + } //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< // for (int m = 0; m < notconv; m++) @@ -411,20 +429,37 @@ void DiagoDavid::cal_grad(const HPsiFunc& hpsi_func, // = (H - lambda * S) * psi * vcc // = (H - lambda * S) * psi_new //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - ModuleBase::gemm_op()('N', - 'N', - dim, // m: row of A,C - notconv, // n: col of B,C - nbase, // k: col of A, row of B - this->one, // alpha - spsi, // A - dim, // LDA: if(N) max(1,m) if(T) max(1,k) - vc_ev_vector, // B - nbase, // LDB: if(N) max(1,k) if(T) max(1,n) - this->one, // belta - basis + dim * nbase, // C dim * notconv - dim // LDC: if(N) max(1, m) - ); + if (notconv == 1){ + //Use gemv for vector case to avoid potential bug using gemm call with n=1 + ModuleBase::gemv_op()('N', + dim, // m: row of A + nbase, // n: col of A + this->one, // alpha + spsi, // A dim * nbase + dim, // LDA: if(N) max(1,m) + vc_ev_vector, // X nbase + 1, // incx + this->one, // beta + basis + dim * nbase, // Y dim + 1 //incy + ); + } else + { + ModuleBase::gemm_op()('N', + 'N', + dim, // m: row of A,C + notconv, // n: col of B,C + nbase, // k: col of A, row of B + this->one, // alpha + spsi, // A + dim, // LDA: if(N) max(1,m) if(T) max(1,k) + vc_ev_vector, // B + nbase, // LDB: if(N) max(1,k) if(T) max(1,n) + this->one, // beta + basis + dim * nbase, // C dim * notconv + dim // LDC: if(N) max(1, m) + ); + } //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< // Preconditioning @@ -478,20 +513,37 @@ void DiagoDavid::cal_grad(const HPsiFunc& hpsi_func, // first nbase bands psi* dot notconv bands spsi to prepare lagrange_matrix // calculate the square matrix for future lagranges - ModuleBase::gemm_op()('C', - 'N', - nbase, // m: row of A,C - notconv, // n: col of B,C - dim, // k: col of A, row of B - this->one, // alpha - basis, // A - dim, // LDA: if(N) max(1,m) if(T) max(1,k) - &spsi[nbase * dim], // B - dim, // LDB: if(N) max(1,k) if(T) max(1,n) - this->zero, // belta - lagrange, // C - nbase + notconv // LDC: if(N) max(1, m) - ); + if (notconv == 1){ + //Use gemv for vector case to avoid potential bug using gemm call with n=1 + ModuleBase::gemv_op()('C', + dim, // m: row of A + nbase, // n: col of A + this->one, // alpha + basis, // A dim * nbase + dim, // LDA: if(N) max(1,m) + &spsi[nbase * dim], // X dim + 1, // incx + this->zero, // beta + lagrange, // Y nbase + 1 + ); + } else + { + ModuleBase::gemm_op()('C', + 'N', + nbase, // m: row of A,C + notconv, // n: col of B,C + dim, // k: col of A, row of B + this->one, // alpha + basis, // A + dim, // LDA: if(N) max(1,m) if(T) max(1,k) + &spsi[nbase * dim], // B + dim, // LDB: if(N) max(1,k) if(T) max(1,n) + this->zero, // belta + lagrange, // C + nbase + notconv // LDC: if(N) max(1, m) + ); + } for (int m = 0; m < notconv; m++) { From b93565ca1bf2c716dc33b4ae795bf7a2e2c50e24 Mon Sep 17 00:00:00 2001 From: Bonan Zhu Date: Mon, 2 Feb 2026 10:09:03 +0800 Subject: [PATCH 2/4] Fix GPU single precision energy error in dav_subspace solver - Restore d_precondition host-to-device sync that was commented out in #5199 (this caused uninitialized GPU memory to be used as the preconditioner) - Fix cuBLAS gemv calls using incx instead of incy for Y parameter - Fix gemv_batched using incy instead of incx for x parameter Fixes GPU single precision energy being ~0.027 eV off from correct value. --- source/source_base/kernels/cuda/math_kernel_op.cu | 6 +++--- .../source_base/module_container/base/third_party/cublas.h | 2 +- source/source_hsolver/diago_dav_subspace.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu index 041d7868ef..65dfd8bed9 100644 --- a/source/source_base/kernels/cuda/math_kernel_op.cu +++ b/source/source_base/kernels/cuda/math_kernel_op.cu @@ -175,7 +175,7 @@ void gemv_op::operator()(const char& trans, const int& incy) { cublasOperation_t cutrans = judge_trans_op(false, trans, "gemv_op"); - cublasErrcheck(cublasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx)); + cublasErrcheck(cublasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incy)); } template <> @@ -192,7 +192,7 @@ void gemv_op::operator()(const char& trans, const int& incy) { cublasOperation_t cutrans = judge_trans_op(false, trans, "gemv_op"); - cublasErrcheck(cublasSgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx)); + cublasErrcheck(cublasSgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incy)); } @@ -213,7 +213,7 @@ void gemv_op, base_device::DEVICE_GPU>::operator()(const cha cublasOperation_t cutrans = judge_trans_op(true, trans, "gemv_op"); cuFloatComplex alpha = make_cuFloatComplex(alpha_in->real(), alpha_in->imag()); cuFloatComplex beta = make_cuFloatComplex(beta_in->real(), beta_in->imag()); - cublasErrcheck(cublasCgemv(cublas_handle, cutrans, m, n, &alpha, (cuFloatComplex*)A, lda, (cuFloatComplex*)X, incx, &beta, (cuFloatComplex*)Y, incx)); + cublasErrcheck(cublasCgemv(cublas_handle, cutrans, m, n, &alpha, (cuFloatComplex*)A, lda, (cuFloatComplex*)X, incx, &beta, (cuFloatComplex*)Y, incy)); } template <> diff --git a/source/source_base/module_container/base/third_party/cublas.h b/source/source_base/module_container/base/third_party/cublas.h index cea046f30e..664b9d9b36 100644 --- a/source/source_base/module_container/base/third_party/cublas.h +++ b/source/source_base/module_container/base/third_party/cublas.h @@ -152,7 +152,7 @@ void gemv_batched(cublasHandle_t& handle, const char& trans, const int& m, const { for (int ii = 0; ii < batch_size; ++ii) { // Call the single GEMV for each pair of matrix A[ii] and vector x[ii] - cuBlasConnector::gemv(handle, trans, m, n, alpha, A[ii], lda, x[ii], incy, beta, y[ii], incy); + cuBlasConnector::gemv(handle, trans, m, n, alpha, A[ii], lda, x[ii], incx, beta, y[ii], incy); } } diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp index 3bc53523d8..27c6a5b348 100644 --- a/source/source_hsolver/diago_dav_subspace.cpp +++ b/source/source_hsolver/diago_dav_subspace.cpp @@ -77,7 +77,7 @@ Diago_DavSubspace::Diago_DavSubspace(const std::vector& precond if (this->device == base_device::GpuDevice) { resmem_real_op()(this->d_precondition, nbasis_in); - // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in); + syncmem_var_h2d_op()(this->d_precondition, this->precondition.data(), nbasis_in); resmem_complex_op()(this->d_scc, this->nbase_x * this->nbase_x); resmem_real_op()(this->d_eigenvalue, this->nbase_x); } @@ -369,7 +369,7 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, this->one, psi_iter + nbase * this->dim, this->dim); - } else + } else { #ifdef __DSP ModuleBase::gemv_op_mt() From ad93d4ca1e5897018b84d662874048af02dcc4d2 Mon Sep 17 00:00:00 2001 From: Bonan Zhu Date: Mon, 2 Feb 2026 10:34:59 +0800 Subject: [PATCH 3/4] Fix ROCm gemv incy parameter bug (same as CUDA fix) Fixed 3 hipBLAS gemv calls that incorrectly used incx instead of incy for the Y vector stride parameter: - hipblasDgemv (double) - hipblasCgemv (complex) - hipblasZgemv (complex) This is the same bug that was fixed in the CUDA version (math_kernel_op.cu). Co-Authored-By: Claude Opus 4.5 --- source/source_base/kernels/rocm/math_kernel_op.hip.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/source_base/kernels/rocm/math_kernel_op.hip.cu b/source/source_base/kernels/rocm/math_kernel_op.hip.cu index f60cdfc3b1..1b4f30d6b2 100644 --- a/source/source_base/kernels/rocm/math_kernel_op.hip.cu +++ b/source/source_base/kernels/rocm/math_kernel_op.hip.cu @@ -188,7 +188,7 @@ void gemv_op::operator()(const char& trans, const int& incy) { hipblasOperation_t cutrans = judge_trans_op(false, trans, "gemv_op"); - hipblasErrcheck(hipblasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx)); + hipblasErrcheck(hipblasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incy)); } template <> @@ -205,7 +205,7 @@ void gemv_op, base_device::DEVICE_GPU>::operator()(const cha const int& incy) { hipblasOperation_t cutrans = judge_trans_op(true, trans, "gemv_op"); - hipblasErrcheck(hipblasCgemv(cublas_handle, cutrans, m, n, (hipblasComplex*)alpha, (hipblasComplex*)A, lda, (hipblasComplex*)X, incx, (hipblasComplex*)beta, (hipblasComplex*)Y, incx)); + hipblasErrcheck(hipblasCgemv(cublas_handle, cutrans, m, n, (hipblasComplex*)alpha, (hipblasComplex*)A, lda, (hipblasComplex*)X, incx, (hipblasComplex*)beta, (hipblasComplex*)Y, incy)); } template <> @@ -222,7 +222,7 @@ void gemv_op, base_device::DEVICE_GPU>::operator()(const ch const int& incy) { hipblasOperation_t cutrans = judge_trans_op(true, trans, "gemv_op"); - hipblasErrcheck(hipblasZgemv(cublas_handle, cutrans, m, n, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)A, lda, (hipblasDoubleComplex*)X, incx, (hipblasDoubleComplex*)beta, (hipblasDoubleComplex*)Y, incx)); + hipblasErrcheck(hipblasZgemv(cublas_handle, cutrans, m, n, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)A, lda, (hipblasDoubleComplex*)X, incx, (hipblasDoubleComplex*)beta, (hipblasDoubleComplex*)Y, incy)); } template <> From 3ce6158c49eb6079587b00145e41d96147f6537c Mon Sep 17 00:00:00 2001 From: Bonan Zhu Date: Tue, 3 Feb 2026 17:38:47 +0800 Subject: [PATCH 4/4] Replace cudaErrCheck with CHECK_CUBLAS --- source/source_base/kernels/cuda/math_kernel_op.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu index 3222c7dc04..c5b0648c49 100644 --- a/source/source_base/kernels/cuda/math_kernel_op.cu +++ b/source/source_base/kernels/cuda/math_kernel_op.cu @@ -2,6 +2,7 @@ #include "source_base/kernels/math_kernel_op.h" #include "source_psi/psi.h" #include "source_base/tool_quit.h" +#include "source_base/module_container/base/third_party/cublas.h" #include #include @@ -192,7 +193,7 @@ void gemv_op::operator()(const char& trans, const int& incy) { cublasOperation_t cutrans = judge_trans_op(false, trans, "gemv_op"); - cublasErrcheck(cublasSgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incy)); + CHECK_CUBLAS(cublasSgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incy)); } @@ -213,7 +214,7 @@ void gemv_op, base_device::DEVICE_GPU>::operator()(const cha cublasOperation_t cutrans = judge_trans_op(true, trans, "gemv_op"); cuFloatComplex alpha = make_cuFloatComplex(alpha_in->real(), alpha_in->imag()); cuFloatComplex beta = make_cuFloatComplex(beta_in->real(), beta_in->imag()); - CHECK_CUDBLAS(cublasCgemv(cublas_handle, cutrans, m, n, &alpha, (cuFloatComplex*)A, lda, (cuFloatComplex*)X, incx, &beta, (cuFloatComplex*)Y, incy)); + CHECK_CUBLAS(cublasCgemv(cublas_handle, cutrans, m, n, &alpha, (cuFloatComplex*)A, lda, (cuFloatComplex*)X, incx, &beta, (cuFloatComplex*)Y, incy)); } template <>