From f9b4d362ec35c25449c65ad264b4c1570984d6a1 Mon Sep 17 00:00:00 2001
From: Bonan Zhu <zhubonan@outlook.com>
Date: Sun, 1 Feb 2026 21:25:00 +0800
Subject: [PATCH 1/4] Fix double precision GPU bug by using GEMV instead of
 GEMM

It appears that GEMM with dimension 1 can be buggy for GPU (cuBLAS)
---
 .../kernels/cuda/math_kernel_op.cu            |  21 +++-
 source/source_hsolver/diago_dav_subspace.cpp  |  87 ++++++++++++++
 source/source_hsolver/diago_david.cpp         | 112 +++++++++++++-----
 3 files changed, 189 insertions(+), 31 deletions(-)
diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu
index 562760ca7a..041d7868ef 100644
--- a/source/source_base/kernels/cuda/math_kernel_op.cu
+++ b/source/source_base/kernels/cuda/math_kernel_op.cu
@@ -178,6 +178,25 @@ void gemv_op<double, base_device::DEVICE_GPU>::operator()(const char& trans,
     cublasErrcheck(cublasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx));
 }
 
+template <>
+void gemv_op<float, base_device::DEVICE_GPU>::operator()(const char& trans,
+                                                          const int& m,
+                                                          const int& n,
+                                                          const float* alpha,
+                                                          const float* A,
+                                                          const int& lda,
+                                                          const float* X,
+                                                          const int& incx,
+                                                          const float* beta,
+                                                          float* Y,
+                                                          const int& incy)
+{
+    cublasOperation_t cutrans = judge_trans_op(false, trans, "gemv_op");
+    cublasErrcheck(cublasSgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx));
+}
+
+
+
 template <>
 void gemv_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const char& trans,
                                                                        const int& m,
@@ -215,7 +234,7 @@ void gemv_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const ch
     cuDoubleComplex beta = make_cuDoubleComplex(beta_in->real(), beta_in->imag());
     // icpc and nvcc have some compatible problems
     // We must use cuDoubleComplex instead of converting std::complex<double>* to cuDoubleComplex*
-    cublasErrcheck(cublasZgemv(cublas_handle, cutrans, m, n, &alpha, (cuDoubleComplex*)A, lda, (cuDoubleComplex*)X, incx, &beta, (cuDoubleComplex*)Y, incx));
+    cublasErrcheck(cublasZgemv(cublas_handle, cutrans, m, n, &alpha, (cuDoubleComplex*)A, lda, (cuDoubleComplex*)X, incx, &beta, (cuDoubleComplex*)Y, incy));
 }
 
 template <>
diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp
index 8fa997774b..3bc53523d8 100644
--- a/source/source_hsolver/diago_dav_subspace.cpp
+++ b/source/source_hsolver/diago_dav_subspace.cpp
@@ -295,6 +295,8 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
         }
     }
 
+    if (notconv > 1){
+
 #ifdef __DSP
     ModuleBase::gemm_op_mt<T, Device>()
 #else
@@ -313,6 +315,28 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
                          this->zero,
                          psi_iter + (nbase) * this->dim,
                          this->dim);
+    } else
+    {
+
+#ifdef __DSP
+    ModuleBase::gemv_op_mt<T, Device>()
+#else
+    ModuleBase::gemv_op<T, Device>()
+#endif
+                        ('N',
+                         this->dim,     // m: row of A
+                         nbase,         // n: col of A
+                         this->one,     // alpha
+                         hpsi,          // A dim * nbase
+                         this->dim,     // LDA: if(N) max(1,m)
+                         vcc,           // X nbase
+                         1,             // incx
+                         this->zero,    // beta
+                         psi_iter + (nbase) * this->dim,  // Y dim
+                         1              // incy
+                        );
+    }
+
 
     // Eigenvalues operation section
     Real* e_temp_hd = eigenvalue_iter->data();
@@ -325,6 +349,8 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     // vcc = - vcc * eigenvalue
     ModuleBase::matrix_mul_vector_op<T, Device>()(nbase, notconv, vcc, this->nbase_x, e_temp_hd, -1.0, vcc, this->nbase_x);
 
+    if (notconv > 1){
+
 #ifdef __DSP
     ModuleBase::gemm_op_mt<T, Device>()
 #else
@@ -343,6 +369,26 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
          this->one,
          psi_iter + nbase * this->dim,
          this->dim);
+    } else 
+    {
+#ifdef __DSP
+    ModuleBase::gemv_op_mt<T, Device>()
+#else
+    ModuleBase::gemv_op<T, Device>()
+#endif
+        ('N',
+         this->dim,     // m: row of A
+         nbase,         // n: col of A
+         this->one,     // alpha
+         spsi,          // A dim * nbase
+         this->dim,     // LDA: if(N) max(1,m)
+         vcc,           // X nbase
+         1,             // incx
+         this->one,    // beta
+         psi_iter + nbase * this->dim,  // Y dim
+         1              // incy
+        );
+    }
 
     // Precondition section
 #if defined(__CUDA) || defined(__ROCM)
@@ -413,6 +459,7 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
 {
     ModuleBase::timer::tick("Diago_DavSubspace", "cal_elem");
 
+    if (notconv > 1){
 #ifdef __DSP
     ModuleBase::gemm_op_mt<T, Device>()
 #else
@@ -451,6 +498,46 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
          &scc[nbase * this->nbase_x],
          this->nbase_x);
 
+        } else {
+
+#ifdef __DSP
+    ModuleBase::gemv_op_mt<T, Device>()
+#else
+    ModuleBase::gemv_op<T, Device>()
+#endif
+        ('C',
+         this->dim,                 // m: row of A
+         nbase + notconv,          // n: col of A
+         this->one,                // alpha
+         psi_iter,                 // A dim * nbase
+         this->dim,                // LDA: if(N) max(1,m)
+         &hpsi[nbase * this->dim], // X nbase
+         1,                        // incx
+         this->zero,               // beta
+         &hcc[nbase * this->nbase_x], // Y dim
+         1                         // incy
+        );
+#ifdef __DSP
+    ModuleBase::gemv_op_mt<T, Device>()
+#else
+    ModuleBase::gemv_op<T, Device>()
+#endif
+        ('C',
+         this->dim,                 // m: row of A
+         nbase + notconv,          // n: col of A
+         this->one,                // alpha
+         psi_iter,                 // A dim * nbase
+         this->dim,                // LDA: if(N) max(1,m)
+         spsi + nbase * this->dim, // X nbase
+         1,                        // incx
+         this->zero,               // beta
+         &scc[nbase * this->nbase_x], // Y dim
+         1                         // incy
+        );
+
+        }
+
+
 #ifdef __MPI
     if (this->diag_comm.nproc > 1)
     {
diff --git a/source/source_hsolver/diago_david.cpp b/source/source_hsolver/diago_david.cpp
index bf390104af..ef4ba67cf3 100644
--- a/source/source_hsolver/diago_david.cpp
+++ b/source/source_hsolver/diago_david.cpp
@@ -351,7 +351,24 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     // basis[nbase] = hpsi * vc_ev_vector = hpsi*vcc
     // basis'        =   vc_ev_vector' * hpsi'
     // (dim, notconv)  (dim, nbase) (nbase, notconv)
-    ModuleBase::gemm_op<T, Device>()('N',
+    if (notconv == 1){
+        //Reuse gemv for vector case to avoid potential bug using gemm call with n=1
+        ModuleBase::gemv_op<T, Device>()('N',
+                                     dim,                 // m: row of A
+                                     nbase,               // n: col of A
+                                     this->one,           // alpha
+                                     hpsi,                // A dim * nbase
+                                     dim,                 // LDA: if(N) max(1,m)
+                                     vc_ev_vector,        // X nbase
+                                     1,                   // incx
+                                     this->zero,          // beta
+                                     basis + dim * nbase, // Y dim
+                                     1                    // incy
+        );
+
+    }else
+    {
+        ModuleBase::gemm_op<T, Device>()('N',
                                      'N',
                                      dim,                 // m: row of A,C
                                      notconv,             // n: col of B,C
@@ -364,7 +381,8 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
                                      this->zero,          // belta
                                      basis + dim * nbase, // C dim * notconv
                                      dim                  // LDC: if(N) max(1, m)
-    );
+        );
+    }
 
     //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
     // for (int m = 0; m < notconv; m++)
@@ -411,20 +429,37 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     //              = (H - lambda * S) * psi * vcc
     //              = (H - lambda * S) * psi_new
     //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-    ModuleBase::gemm_op<T, Device>()('N',
-                                     'N',
-                                     dim,                 // m: row of A,C
-                                     notconv,             // n: col of B,C
-                                     nbase,               // k: col of A, row of B
-                                     this->one,           // alpha
-                                     spsi,                // A
-                                     dim,                 // LDA: if(N) max(1,m) if(T) max(1,k)
-                                     vc_ev_vector,        // B
-                                     nbase,               // LDB: if(N) max(1,k) if(T) max(1,n)
-                                     this->one,           // belta
-                                     basis + dim * nbase, // C dim * notconv
-                                     dim                  // LDC: if(N) max(1, m)
-    );
+    if (notconv == 1){
+        //Use gemv for vector case to avoid potential bug using gemm call with n=1
+        ModuleBase::gemv_op<T, Device>()('N',
+                                        dim,                  // m: row of A
+                                        nbase,                // n: col of A
+                                        this->one,            // alpha
+                                        spsi,                 // A dim * nbase
+                                        dim,                  // LDA: if(N) max(1,m)
+                                        vc_ev_vector,         // X nbase
+                                        1,                    // incx
+                                        this->one,            // beta
+                                        basis + dim * nbase,  // Y dim
+                                        1                     //incy
+        );
+    } else 
+    {
+        ModuleBase::gemm_op<T, Device>()('N',
+                                        'N',
+                                        dim,                 // m: row of A,C
+                                        notconv,             // n: col of B,C
+                                        nbase,               // k: col of A, row of B
+                                        this->one,           // alpha
+                                        spsi,                // A
+                                        dim,                 // LDA: if(N) max(1,m) if(T) max(1,k)
+                                        vc_ev_vector,        // B
+                                        nbase,               // LDB: if(N) max(1,k) if(T) max(1,n)
+                                        this->one,           // beta
+                                        basis + dim * nbase, // C dim * notconv
+                                        dim                  // LDC: if(N) max(1, m)
+        );
+    }
     //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
     // Preconditioning
@@ -478,20 +513,37 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     // first nbase bands psi* dot notconv bands spsi to prepare lagrange_matrix
 
     // calculate the square matrix for future lagranges
-    ModuleBase::gemm_op<T, Device>()('C',
-                                     'N',
-                                     nbase,              // m: row of A,C
-                                     notconv,            // n: col of B,C
-                                     dim,                // k: col of A, row of B
-                                     this->one,          // alpha
-                                     basis,              // A
-                                     dim,                // LDA: if(N) max(1,m) if(T) max(1,k)
-                                     &spsi[nbase * dim], // B
-                                     dim,                // LDB: if(N) max(1,k) if(T) max(1,n)
-                                     this->zero,         // belta
-                                     lagrange,           // C
-                                     nbase + notconv     // LDC: if(N) max(1, m)
-    );
+    if (notconv == 1){
+        //Use gemv for vector case to avoid potential bug using gemm call with n=1
+        ModuleBase::gemv_op<T, Device>()('C',
+                                     dim,                 // m: row of A
+                                     nbase,               // n: col of A
+                                     this->one,           // alpha
+                                     basis,               // A dim * nbase
+                                     dim,                 // LDA: if(N) max(1,m)
+                                     &spsi[nbase * dim], // X dim
+                                     1,           // incx
+                                     this->zero,          // beta
+                                     lagrange,           // Y nbase
+                                     1
+        );
+    } else
+    {
+        ModuleBase::gemm_op<T, Device>()('C',
+                                        'N',
+                                        nbase,              // m: row of A,C
+                                        notconv,            // n: col of B,C
+                                        dim,                // k: col of A, row of B
+                                        this->one,          // alpha
+                                        basis,              // A
+                                        dim,                // LDA: if(N) max(1,m) if(T) max(1,k)
+                                        &spsi[nbase * dim], // B
+                                        dim,                // LDB: if(N) max(1,k) if(T) max(1,n)
+                                        this->zero,         // belta
+                                        lagrange,           // C
+                                        nbase + notconv     // LDC: if(N) max(1, m)
+        );
+    }
 
     for (int m = 0; m < notconv; m++)
     {

From b93565ca1bf2c716dc33b4ae795bf7a2e2c50e24 Mon Sep 17 00:00:00 2001
From: Bonan Zhu <zhubonan@outlook.com>
Date: Mon, 2 Feb 2026 10:09:03 +0800
Subject: [PATCH 2/4] Fix GPU single precision energy error in dav_subspace
 solver

  - Restore d_precondition host-to-device sync that was commented out in #5199
    (this caused uninitialized GPU memory to be used as the preconditioner)
  - Fix cuBLAS gemv calls using incx instead of incy for Y parameter
  - Fix gemv_batched using incy instead of incx for x parameter

  Fixes GPU single precision energy being ~0.027 eV off from correct value.
---
 source/source_base/kernels/cuda/math_kernel_op.cu           | 6 +++---
 .../source_base/module_container/base/third_party/cublas.h  | 2 +-
 source/source_hsolver/diago_dav_subspace.cpp                | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu
index 041d7868ef..65dfd8bed9 100644
--- a/source/source_base/kernels/cuda/math_kernel_op.cu
+++ b/source/source_base/kernels/cuda/math_kernel_op.cu
@@ -175,7 +175,7 @@ void gemv_op<double, base_device::DEVICE_GPU>::operator()(const char& trans,
                                                           const int& incy)
 {
     cublasOperation_t cutrans = judge_trans_op(false, trans, "gemv_op");
-    cublasErrcheck(cublasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx));
+    cublasErrcheck(cublasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incy));
 }
 
 template <>
@@ -192,7 +192,7 @@ void gemv_op<float, base_device::DEVICE_GPU>::operator()(const char& trans,
                                                           const int& incy)
 {
     cublasOperation_t cutrans = judge_trans_op(false, trans, "gemv_op");
-    cublasErrcheck(cublasSgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx));
+    cublasErrcheck(cublasSgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incy));
 }
 
 
@@ -213,7 +213,7 @@ void gemv_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const cha
     cublasOperation_t cutrans = judge_trans_op(true, trans, "gemv_op");
     cuFloatComplex alpha = make_cuFloatComplex(alpha_in->real(), alpha_in->imag());
     cuFloatComplex beta = make_cuFloatComplex(beta_in->real(), beta_in->imag());
-    cublasErrcheck(cublasCgemv(cublas_handle, cutrans, m, n, &alpha, (cuFloatComplex*)A, lda, (cuFloatComplex*)X, incx, &beta, (cuFloatComplex*)Y, incx));
+    cublasErrcheck(cublasCgemv(cublas_handle, cutrans, m, n, &alpha, (cuFloatComplex*)A, lda, (cuFloatComplex*)X, incx, &beta, (cuFloatComplex*)Y, incy));
 }
 
 template <>
diff --git a/source/source_base/module_container/base/third_party/cublas.h b/source/source_base/module_container/base/third_party/cublas.h
index cea046f30e..664b9d9b36 100644
--- a/source/source_base/module_container/base/third_party/cublas.h
+++ b/source/source_base/module_container/base/third_party/cublas.h
@@ -152,7 +152,7 @@ void gemv_batched(cublasHandle_t& handle, const char& trans, const int& m, const
 {
     for (int ii = 0; ii < batch_size; ++ii) {
         // Call the single GEMV for each pair of matrix A[ii] and vector x[ii]
-        cuBlasConnector::gemv(handle, trans, m, n, alpha, A[ii], lda, x[ii], incy, beta, y[ii], incy);
+        cuBlasConnector::gemv(handle, trans, m, n, alpha, A[ii], lda, x[ii], incx, beta, y[ii], incy);
     }
 }
 
diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp
index 3bc53523d8..27c6a5b348 100644
--- a/source/source_hsolver/diago_dav_subspace.cpp
+++ b/source/source_hsolver/diago_dav_subspace.cpp
@@ -77,7 +77,7 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
     if (this->device == base_device::GpuDevice)
     {
         resmem_real_op()(this->d_precondition, nbasis_in);
-        // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in);
+        syncmem_var_h2d_op()(this->d_precondition, this->precondition.data(), nbasis_in);
         resmem_complex_op()(this->d_scc, this->nbase_x * this->nbase_x);
         resmem_real_op()(this->d_eigenvalue, this->nbase_x);
     }
@@ -369,7 +369,7 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
          this->one,
          psi_iter + nbase * this->dim,
          this->dim);
-    } else 
+    } else
     {
 #ifdef __DSP
     ModuleBase::gemv_op_mt<T, Device>()

From ad93d4ca1e5897018b84d662874048af02dcc4d2 Mon Sep 17 00:00:00 2001
From: Bonan Zhu <zhubonan@outlook.com>
Date: Mon, 2 Feb 2026 10:34:59 +0800
Subject: [PATCH 3/4] Fix ROCm gemv incy parameter bug (same as CUDA fix)

Fixed 3 hipBLAS gemv calls that incorrectly used incx instead of incy
for the Y vector stride parameter:
- hipblasDgemv (double)
- hipblasCgemv (complex<float>)
- hipblasZgemv (complex<double>)

This is the same bug that was fixed in the CUDA version (math_kernel_op.cu).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 source/source_base/kernels/rocm/math_kernel_op.hip.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/source/source_base/kernels/rocm/math_kernel_op.hip.cu b/source/source_base/kernels/rocm/math_kernel_op.hip.cu
index f60cdfc3b1..1b4f30d6b2 100644
--- a/source/source_base/kernels/rocm/math_kernel_op.hip.cu
+++ b/source/source_base/kernels/rocm/math_kernel_op.hip.cu
@@ -188,7 +188,7 @@ void gemv_op<double, base_device::DEVICE_GPU>::operator()(const char& trans,
                                                           const int& incy)
 {
     hipblasOperation_t cutrans = judge_trans_op(false, trans, "gemv_op");
-    hipblasErrcheck(hipblasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incx));
+    hipblasErrcheck(hipblasDgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incy));
 }
 
 template <>
@@ -205,7 +205,7 @@ void gemv_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const cha
                                                                        const int& incy)
 {
     hipblasOperation_t cutrans = judge_trans_op(true, trans, "gemv_op");
-    hipblasErrcheck(hipblasCgemv(cublas_handle, cutrans, m, n, (hipblasComplex*)alpha, (hipblasComplex*)A, lda, (hipblasComplex*)X, incx, (hipblasComplex*)beta, (hipblasComplex*)Y, incx));
+    hipblasErrcheck(hipblasCgemv(cublas_handle, cutrans, m, n, (hipblasComplex*)alpha, (hipblasComplex*)A, lda, (hipblasComplex*)X, incx, (hipblasComplex*)beta, (hipblasComplex*)Y, incy));
 }
 
 template <>
@@ -222,7 +222,7 @@ void gemv_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const ch
                                                                         const int& incy)
 {
     hipblasOperation_t cutrans = judge_trans_op(true, trans, "gemv_op");
-    hipblasErrcheck(hipblasZgemv(cublas_handle, cutrans, m, n, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)A, lda, (hipblasDoubleComplex*)X, incx, (hipblasDoubleComplex*)beta, (hipblasDoubleComplex*)Y, incx));
+    hipblasErrcheck(hipblasZgemv(cublas_handle, cutrans, m, n, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)A, lda, (hipblasDoubleComplex*)X, incx, (hipblasDoubleComplex*)beta, (hipblasDoubleComplex*)Y, incy));
 }
 
 template <>

From 3ce6158c49eb6079587b00145e41d96147f6537c Mon Sep 17 00:00:00 2001
From: Bonan Zhu <zhubonan@outlook.com>
Date: Tue, 3 Feb 2026 17:38:47 +0800
Subject: [PATCH 4/4] Replace cudaErrCheck with CHECK_CUBLAS

---
 source/source_base/kernels/cuda/math_kernel_op.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu
index 3222c7dc04..c5b0648c49 100644
--- a/source/source_base/kernels/cuda/math_kernel_op.cu
+++ b/source/source_base/kernels/cuda/math_kernel_op.cu
@@ -2,6 +2,7 @@
 #include "source_base/kernels/math_kernel_op.h"
 #include "source_psi/psi.h"
 #include "source_base/tool_quit.h"
+#include "source_base/module_container/base/third_party/cublas.h"
 
 #include <base/macros/macros.h>
 #include <cuda_runtime.h>
@@ -192,7 +193,7 @@ void gemv_op<float, base_device::DEVICE_GPU>::operator()(const char& trans,
                                                           const int& incy)
 {
     cublasOperation_t cutrans = judge_trans_op(false, trans, "gemv_op");
-    cublasErrcheck(cublasSgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incy));
+    CHECK_CUBLAS(cublasSgemv(cublas_handle, cutrans, m, n, alpha, A, lda, X, incx, beta, Y, incy));
 }
 
 
@@ -213,7 +214,7 @@ void gemv_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const cha
     cublasOperation_t cutrans = judge_trans_op(true, trans, "gemv_op");
     cuFloatComplex alpha = make_cuFloatComplex(alpha_in->real(), alpha_in->imag());
     cuFloatComplex beta = make_cuFloatComplex(beta_in->real(), beta_in->imag());
-    CHECK_CUDBLAS(cublasCgemv(cublas_handle, cutrans, m, n, &alpha, (cuFloatComplex*)A, lda, (cuFloatComplex*)X, incx, &beta, (cuFloatComplex*)Y, incy));
+    CHECK_CUBLAS(cublasCgemv(cublas_handle, cutrans, m, n, &alpha, (cuFloatComplex*)A, lda, (cuFloatComplex*)X, incx, &beta, (cuFloatComplex*)Y, incy));
 }
 
 template <>