From 2a86d0692800397436aba3597b767eccecde29aa Mon Sep 17 00:00:00 2001 From: "tianxiang.wang@metax-tech.com" Date: Fri, 5 Sep 2025 06:04:09 +0000 Subject: [PATCH 1/5] =?UTF-8?q?Perf:=20Optimize=20Diago=5FDavSubspace=20wi?= =?UTF-8?q?th=20GPU=20operators=20by=20adding=20and=20fusing=20custom=20ke?= =?UTF-8?q?rnels.=20Signed-off-by=EF=BC=9ATianxiang=20Wang,=20Contributed=20under=20MetaX=20Integrated=20C?= =?UTF-8?q?ircuits=20(Shanghai)=20Co.,=20Ltd.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kernels/cuda/math_kernel_op.cu | 44 ++++++- source/source_base/kernels/math_kernel_op.cpp | 25 ++++ source/source_base/kernels/math_kernel_op.h | 40 +++++- .../kernels/rocm/math_kernel_op.hip.cu | 42 ++++++- source/source_hsolver/diago_dav_subspace.cpp | 74 ++--------- source/source_hsolver/diago_dav_subspace.h | 2 + .../source_hsolver/kernels/bpcg_kernel_op.cpp | 27 ++++ .../source_hsolver/kernels/bpcg_kernel_op.h | 55 +++++++-- .../kernels/cuda/bpcg_kernel_op.cu | 116 ++++++++++++++---- .../kernels/rocm/bpcg_kernel_op.hip.cu | 112 ++++++++++++++--- 10 files changed, 421 insertions(+), 116 deletions(-) diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu index 415ec9f12a..562760ca7a 100644 --- a/source/source_base/kernels/cuda/math_kernel_op.cu +++ b/source/source_base/kernels/cuda/math_kernel_op.cu @@ -133,6 +133,14 @@ __global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const } } +template +__global__ void matrix_multiply_vector_kernel(const int m, const int n, T *a, const int lda, const Real *b, const Real alpha, T *c, const int ldc){ + int row = blockIdx.x * blockDim.x + threadIdx.x; + int col = blockIdx.y * blockDim.y + threadIdx.y; + if (col >= n || row >= m) return; + c[col * ldc + row] = a[col * lda + row] * b[col] * alpha; +} + cublasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name) { if (trans == 'N') @@ -147,7 +155,7 @@ cublasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* { return CUBLAS_OP_C; } - else + else { ModuleBase::WARNING_QUIT(name, std::string("Unknown trans type ") + trans + std::string(" !")); } @@ -438,10 +446,44 @@ void matrixCopy, base_device::DEVICE_GPU>::operator()(const cudaCheckOnDebug(); } +template <> +void matrix_mul_vector_op::operator()(const int &m, const int &n, + double *a, const int &lda, const double *b, const double alpha, double *c, const int &ldc){ + dim3 thread(16, 16, 1); + dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1); + matrix_multiply_vector_kernel <<>>(m, n, a, lda, + b, alpha, c, ldc); + cudaCheckOnDebug(); +} + +template <> +void matrix_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int &m, const int &n, + std::complex *a, const int &lda, const float *b, const float alpha, std::complex *c, const int &ldc){ + dim3 thread(16, 16, 1); + dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1); + matrix_multiply_vector_kernel, float> <<>>(m, n, reinterpret_cast*>(a), lda, + b, alpha, reinterpret_cast*>(c), ldc); + cudaCheckOnDebug(); +} + +template <> +void matrix_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int &m, const int &n, + std::complex *a, const int &lda, const double *b, const double alpha, std::complex *c, const int &ldc) +{ + dim3 thread(16, 16, 1); + dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1); + matrix_multiply_vector_kernel, double> <<>>(m, n, reinterpret_cast*>(a), lda, + b, alpha, reinterpret_cast*>(c), ldc); + cudaCheckOnDebug(); +} // Explicitly instantiate functors for the types of functor registered. template struct matrixCopy, base_device::DEVICE_GPU>; template struct matrixCopy; template struct matrixCopy, base_device::DEVICE_GPU>; + +template struct matrix_mul_vector_op, base_device::DEVICE_GPU>; +template struct matrix_mul_vector_op; +template struct matrix_mul_vector_op, base_device::DEVICE_GPU>; } // namespace ModuleBase diff --git a/source/source_base/kernels/math_kernel_op.cpp b/source/source_base/kernels/math_kernel_op.cpp index dad790e463..343d342e00 100644 --- a/source/source_base/kernels/math_kernel_op.cpp +++ b/source/source_base/kernels/math_kernel_op.cpp @@ -119,12 +119,35 @@ struct matrixCopy } }; +template +struct matrix_mul_vector_op { + using Real = typename GetTypeReal::type; + void operator()(const int& m, const int &n, + T *a, + const int &lda, + const Real *b, + const Real alpha, + T *c, + const int &ldc){ +#ifdef _OPENMP +#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T)) +#endif + for (int j = 0; j < n; j++){ + for (int i = 0; i < m; i++){ + c[j * ldc + i] = a[j * lda + i] * b[j] * alpha; + } + } + + } +}; + template struct gemv_op, base_device::DEVICE_CPU>; template struct gemv_op; template struct gemm_op, base_device::DEVICE_CPU>; template struct gemm_op; template struct matrixTranspose_op, base_device::DEVICE_CPU>; template struct matrixCopy, base_device::DEVICE_CPU>; +template struct matrix_mul_vector_op, base_device::DEVICE_CPU>; template struct gemv_op, base_device::DEVICE_CPU>; template struct gemv_op; @@ -133,6 +156,8 @@ template struct gemm_op; template struct matrixTranspose_op, base_device::DEVICE_CPU>; template struct matrixCopy; template struct matrixCopy, base_device::DEVICE_CPU>; +template struct matrix_mul_vector_op; +template struct matrix_mul_vector_op, base_device::DEVICE_CPU>; #ifdef __LCAO template struct matrixTranspose_op; diff --git a/source/source_base/kernels/math_kernel_op.h b/source/source_base/kernels/math_kernel_op.h index 9476d32fdc..f5c8e218df 100644 --- a/source/source_base/kernels/math_kernel_op.h +++ b/source/source_base/kernels/math_kernel_op.h @@ -104,7 +104,7 @@ template struct vector_div_constant_op { /// /// Input Parameters /// \param dim : array size - /// \param vector : input array + /// \param vector : input array /// \param constant : input constant /// /// Output Parameters @@ -298,6 +298,31 @@ template struct matrixCopy { void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB); }; +template +struct matrix_mul_vector_op { + using Real = typename GetTypeReal::type; + /// @brief a * b * beta by each column + /// + /// Input Parameters + /// \param m : row number + /// \param n : column number + /// \param a : input matrix + /// \param lda : leading dimension of matrix a + /// \param b : input vector + /// \param alpha : factor + /// \param ldc : leading dimension of matrix c + /// + /// Output Parameters + /// \param c : output matrix + void operator()(const int &m, const int &n, + T *a, + const int &lda, + const Real *b, + const Real alpha, + T *c, + const int &ldc); +}; + template struct apply_eigenvalues_op { using Real = typename GetTypeReal::type; @@ -314,7 +339,7 @@ struct precondition_op { T* psi_iter, const int& nbase, const int& notconv, - const Real* precondition, + const Real* precondition, const Real* eigenvalues); }; @@ -393,6 +418,17 @@ template struct matrixCopy { const int& LDB); }; +template struct matrix_mul_vector_op { + using Real = typename GetTypeReal::type; + void operator()(const int &m, const int &n, + T *a, + const int &lda, + const Real *b, + const Real alpha, + T *c, + const int &ldc); +}; + void createGpuBlasHandle(); void destoryBLAShandle(); diff --git a/source/source_base/kernels/rocm/math_kernel_op.hip.cu b/source/source_base/kernels/rocm/math_kernel_op.hip.cu index 8eb87988c8..f60cdfc3b1 100644 --- a/source/source_base/kernels/rocm/math_kernel_op.hip.cu +++ b/source/source_base/kernels/rocm/math_kernel_op.hip.cu @@ -145,6 +145,15 @@ __launch_bounds__(1024) __global__ } } +template +__launch_bounds__(1024) __global__ +void matrix_multiply_vector_kernel(const int m, const int n, T *a, const int lda, const Real *b, const Real alpha, T *c, const int ldc){ + int row = blockIdx.x * blockDim.x + threadIdx.x; + int col = blockIdx.y * blockDim.y + threadIdx.y; + if (col >= n || row >= m) return; + c[col * ldc + row] = a[col * lda + row] * b[col] * alpha; +} + hipblasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name) { if (trans == 'N') @@ -159,7 +168,7 @@ hipblasOperation_t judge_trans_op(bool is_complex, const char& trans, const char { return HIPBLAS_OP_C; } - else + else { ModuleBase::WARNING_QUIT(name, std::string("Unknown trans type ") + trans + std::string(" !")); } @@ -437,7 +446,38 @@ void matrixCopy, base_device::DEVICE_GPU>::operator()(const hipCheckOnDebug(); } +template <> +void matrix_mul_vector_op::operator()(const int &m, const int &n, + double *a, const int &lda, const double *b, const double alpha, double *c, const int &ldc){ + dim3 thread(16, 16, 1); + dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1); + hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel), dim3(block, thread), + m, n, a, lda, b, alpha, c, ldc); + hipCheckOnDebug(); +} +template <> +void matrix_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int &m, const int &n, + std::complex *a, const int &lda, const float *b, const float alpha, std::complex *c, const int &ldc){ + dim3 thread(16, 16, 1); + dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1); + hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel, float>), dim3(block, thread), + m, n, reinterpret_cast*>(a), lda, + b, alpha, reinterpret_cast*>(c), ldc); + hipCheckOnDebug(); +} + +template <> +void matrix_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int &m, const int &n, + std::complex *a, const int &lda, const double *b, const double alpha, std::complex *c, const int &ldc) +{ + dim3 thread(16, 16, 1); + dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1); + hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel, double>), dim3(block, thread), + m, n, reinterpret_cast*>(a), lda, + b, alpha, reinterpret_cast*>(c), ldc); + hipCheckOnDebug(); +} // Explicitly instantiate functors for the types of functor registered. template struct matrixCopy; diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp index 4402fad9f8..fe89b23377 100644 --- a/source/source_hsolver/diago_dav_subspace.cpp +++ b/source/source_hsolver/diago_dav_subspace.cpp @@ -76,6 +76,7 @@ Diago_DavSubspace::Diago_DavSubspace(const std::vector& precond { resmem_real_op()(this->d_precondition, nbasis_in); // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in); + resmem_real_op()(this->d_eigenvalue, this->nbase_x); } #endif } @@ -94,6 +95,7 @@ Diago_DavSubspace::~Diago_DavSubspace() if (this->device == base_device::GpuDevice) { delmem_real_op()(this->d_precondition); + delmem_real_op()(this->d_eigenvalue); } #endif } @@ -316,30 +318,15 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, this->dim); // Eigenvalues operation section - std::vector e_temp_cpu(this->notconv, 0); - Real* e_temp_hd = e_temp_cpu.data(); + Real* e_temp_hd = eigenvalue_iter->data(); if (this->device == base_device::GpuDevice) { - e_temp_hd = nullptr; - resmem_real_op()(e_temp_hd, nbase); + syncmem_var_h2d_op()(this->d_eigenvalue, eigenvalue_iter->data(), this->nbase_x); + e_temp_hd = this->d_eigenvalue; } - for (int m = 0; m < this->notconv; m++) - { - e_temp_cpu[m] = -(*eigenvalue_iter)[m]; - } - - if (this->device == base_device::GpuDevice) - { - syncmem_var_h2d_op()(e_temp_hd, e_temp_cpu.data(), this->notconv); - } - - apply_eigenvalues_op()(nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd); - - if (this->device == base_device::GpuDevice) - { - delmem_real_op()(e_temp_hd); - } + // vcc = - vcc * eigenvalue + ModuleBase::matrix_mul_vector_op()(nbase, notconv, vcc, this->nbase_x, eigenvalue_iter->data(), -1.0, vcc, this->nbase_x); #ifdef __DSP ModuleBase::gemm_op_mt() @@ -364,17 +351,12 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - Real* eigenvalues_gpu = nullptr; - resmem_real_op()(eigenvalues_gpu, notconv); - syncmem_var_h2d_op()(eigenvalues_gpu, (*eigenvalue_iter).data(), notconv); - precondition_op()(this->dim, psi_iter, nbase, notconv, d_precondition, - eigenvalues_gpu); - delmem_real_op()(eigenvalues_gpu); + this->d_eigenvalue); } else #endif @@ -395,7 +377,7 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, resmem_real_op()(psi_norm, notconv); using setmem_real_op = base_device::memory::set_memory_op; setmem_real_op()(psi_norm, 0.0, notconv); - + normalize_op()(this->dim, psi_iter, nbase, @@ -641,7 +623,7 @@ void Diago_DavSubspace::diag_zhegvx(const int& nbase, } else { -#ifdef __MPI +#ifdef __MPI std::vector h_diag; std::vector s_diag; std::vector vcc_tmp; @@ -680,7 +662,7 @@ void Diago_DavSubspace::diag_zhegvx(const int& nbase, } #else std::cout << "Error: parallel diagonalization is not supported in serial mode." << std::endl; - exit(1); + exit(1); #endif } } @@ -772,39 +754,7 @@ void Diago_DavSubspace::refresh(const int& dim, if (this->device == base_device::GpuDevice) { -#if defined(__CUDA) || defined(__ROCM) - T* hcc_cpu = nullptr; - T* scc_cpu = nullptr; - T* vcc_cpu = nullptr; - base_device::memory::resize_memory_op()(hcc_cpu, - this->nbase_x * this->nbase_x, - "DAV::hcc"); - base_device::memory::resize_memory_op()(scc_cpu, - this->nbase_x * this->nbase_x, - "DAV::scc"); - base_device::memory::resize_memory_op()(vcc_cpu, - this->nbase_x * this->nbase_x, - "DAV::vcc"); - - syncmem_d2h_op()(hcc_cpu, hcc, this->nbase_x * this->nbase_x); - syncmem_d2h_op()(scc_cpu, scc, this->nbase_x * this->nbase_x); - syncmem_d2h_op()(vcc_cpu, vcc, this->nbase_x * this->nbase_x); - - for (int i = 0; i < nbase; i++) - { - hcc_cpu[i * this->nbase_x + i] = eigenvalue_in_hsolver[i]; - scc_cpu[i * this->nbase_x + i] = this->one[0]; - vcc_cpu[i * this->nbase_x + i] = this->one[0]; - } - - syncmem_h2d_op()(hcc, hcc_cpu, this->nbase_x * this->nbase_x); - syncmem_h2d_op()(scc, scc_cpu, this->nbase_x * this->nbase_x); - syncmem_h2d_op()(vcc, vcc_cpu, this->nbase_x * this->nbase_x); - - base_device::memory::delete_memory_op()(hcc_cpu); - base_device::memory::delete_memory_op()(scc_cpu); - base_device::memory::delete_memory_op()(vcc_cpu); -#endif + refresh_hcc_scc_vcc_op()(nbase, hcc, scc, vcc, this->nbase_x, this->d_eigenvalue, this->one_); } else { diff --git a/source/source_hsolver/diago_dav_subspace.h b/source/source_hsolver/diago_dav_subspace.h index 3b4c224ec8..3d76bbf225 100644 --- a/source/source_hsolver/diago_dav_subspace.h +++ b/source/source_hsolver/diago_dav_subspace.h @@ -94,6 +94,8 @@ class Diago_DavSubspace /// Eigenvectors on the reduced basis T* vcc = nullptr; + Real* d_eigenvalue = nullptr; + /// device type of psi Device* ctx = {}; base_device::DEVICE_CPU* cpu_ctx = {}; diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp index 19c51fc398..3d74591783 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp +++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp @@ -183,6 +183,30 @@ struct normalize_op { } }; +template +struct refresh_hcc_scc_vcc_op +{ + using Real = typename GetTypeReal::type; + void operator()(const int &n, + T *hcc, + T *scc, + T *vcc, + const int &ldh, + const Real *eigenvalue, + const T &one) + { +#ifdef _OPENMP +#pragma omp parallel for collapse(1) schedule(static, 8192 / sizeof(T)) +#endif + for (int i = 0; i < n; i++) + { + hcc[i * ldh + i] = eigenvalue[i]; + scc[i * ldh + i] = one; + vcc[i * ldh + i] = one; + } + } +}; + template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; template struct line_minimize_with_block_op, base_device::DEVICE_CPU>; template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; @@ -196,4 +220,7 @@ template struct precondition_op; template struct normalize_op, base_device::DEVICE_CPU>; template struct normalize_op, base_device::DEVICE_CPU>; template struct normalize_op; +template struct refresh_hcc_scc_vcc_op, base_device::DEVICE_CPU>; +template struct refresh_hcc_scc_vcc_op, base_device::DEVICE_CPU>; +template struct refresh_hcc_scc_vcc_op; } // namespace hsolver \ No newline at end of file diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.h b/source/source_hsolver/kernels/bpcg_kernel_op.h index 802ee3c1e4..9ac7c5e2ce 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.h +++ b/source/source_hsolver/kernels/bpcg_kernel_op.h @@ -63,11 +63,11 @@ template struct apply_eigenvalues_op { using Real = typename GetTypeReal::type; - void operator()(const int& nbase, - const int& nbase_x, - const int& notconv, - T* result, - const T* vectors, + void operator()(const int& nbase, + const int& nbase_x, + const int& notconv, + T* result, + const T* vectors, const Real* eigenvalues); }; @@ -92,6 +92,30 @@ struct normalize_op { Real* psi_norm = nullptr); }; +template struct refresh_hcc_scc_vcc_op { + using Real = typename GetTypeReal::type; + /// @brief refresh hcc scc vcc + /// + /// Input Parameters + /// \param n : first dimension of matrix + /// \param ldh : leading dimension of hcc, scc, vcc + /// \param nbase : matrix size + /// \param eigenvalue : input eigenvalue + /// \param one : constant one + /// + /// Output Parameters + /// \param hcc : output matrix hcc + /// \param scc : output matrix scc + /// \param vcc : output matrix vcc + void operator()(const int &n, + T *hcc, + T *scc, + T *vcc, + const int &ldh, + const Real *eigenvalue, + const T& one); +}; + #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM template @@ -114,11 +138,11 @@ struct calc_grad_with_block_op { template struct apply_eigenvalues_op { using Real = typename GetTypeReal::type; - void operator()(const int& nbase, - const int& nbase_x, - const int& notconv, - T* result, - const T* vectors, + void operator()(const int& nbase, + const int& nbase_x, + const int& notconv, + T* result, + const T* vectors, const Real* eigenvalues); }; @@ -143,6 +167,17 @@ struct normalize_op { Real* psi_norm = nullptr); }; +template +struct refresh_hcc_scc_vcc_op { + using Real = typename GetTypeReal::type; + void operator()(const int &n, + T *hcc, + T *scc, + T *vcc, + const int &ldh, + const Real *eigenvalue, + const T& one); +}; #endif } // namespace hsolver diff --git a/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu b/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu index e8af516274..a8f45ad886 100644 --- a/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu +++ b/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu @@ -257,7 +257,7 @@ __global__ void apply_eigenvalues_kernel( { int m = blockIdx.x; int idx = threadIdx.x + blockIdx.y * blockDim.x; - + if (m < notconv && idx < nbase) { result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx]; } @@ -274,7 +274,7 @@ __global__ void precondition_kernel( { int m = blockIdx.x; int i = threadIdx.x + blockIdx.y * blockDim.x; - + if (m < notconv && i < dim) { Real x = abs(precondition[i] - eigenvalues[m]); Real pre = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); @@ -293,17 +293,17 @@ __global__ void normalize_kernel( int m = blockIdx.x; int tid = threadIdx.x; __shared__ Real sum[thread_per_block]; - + sum[tid] = 0.0; - + // Calculate the sum for normalization for (int i = tid; i < dim; i += thread_per_block) { auto val = psi_iter[(nbase + m) * dim + i]; sum[tid] += (val * thrust::conj(val)).real(); } - + __syncthreads(); - + // Parallel reduction in shared memory for (int s = thread_per_block/2; s > warp_size; s >>= 1) { if (tid < s) { @@ -311,7 +311,7 @@ __global__ void normalize_kernel( } __syncthreads(); } - + if (tid < warp_size) { sum[tid] += sum[tid + 32]; __syncwarp(); sum[tid] += sum[tid + 16]; __syncwarp(); @@ -320,22 +320,41 @@ __global__ void normalize_kernel( sum[tid] += sum[tid + 2]; __syncwarp(); sum[tid] += sum[tid + 1]; __syncwarp(); } - + __syncthreads(); - + Real norm = sqrt(sum[0]); - + // Normalize the vector for (int i = tid; i < dim; i += thread_per_block) { psi_iter[(nbase + m) * dim + i] /= norm; } - + // Store the norm if needed if (tid == 0 && psi_norm != nullptr) { psi_norm[m] = norm; } } +template +__global__ void refresh_hcc_scc_vcc_kernel( + const int n, + T *hcc, + T *scc, + T *vcc, + const int ldh, + const Real *eigenvalue, + const T one) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + { + hcc[i * ldh + i] = eigenvalue[i]; + scc[i * ldh + i] = one; + vcc[i * ldh + i] = one; + } +} + template void line_minimize_with_block_op::operator()(T* grad_out, T* hgrad_out, @@ -392,15 +411,15 @@ void apply_eigenvalues_op::operator()(const int& nba { const int threads_per_block = 256; const int blocks_per_grid_y = (nbase + threads_per_block - 1) / threads_per_block; - + dim3 grid(notconv, blocks_per_grid_y); - + auto vec_complex = reinterpret_cast*>(vectors); auto res_complex = reinterpret_cast*>(result); - + apply_eigenvalues_kernel<<>>( vec_complex, res_complex, eigenvalues, nbase, nbase_x, notconv); - + cudaCheckOnDebug(); } @@ -414,14 +433,14 @@ void precondition_op::operator()(const int& dim, { const int threads_per_block = 256; const int blocks_per_grid_y = (dim + threads_per_block - 1) / threads_per_block; - + dim3 grid(notconv, blocks_per_grid_y); - + auto psi_complex = reinterpret_cast*>(psi_iter); - + precondition_kernel<<>>( psi_complex, precondition, eigenvalues, dim, nbase, notconv); - + cudaCheckOnDebug(); } @@ -433,10 +452,62 @@ void normalize_op::operator()(const int& dim, Real* psi_norm) { auto psi_complex = reinterpret_cast*>(psi_iter); - + normalize_kernel<<>>( psi_complex, psi_norm, dim, nbase, notconv); - + + cudaCheckOnDebug(); +} + +template <> +void refresh_hcc_scc_vcc_op::operator()(const int &n, + double *hcc, + double *scc, + double *vcc, + const int &ldh, + const double *eigenvalue, + const double& one) +{ + int thread = 512; + int block = (n + thread - 1) / thread; + refresh_hcc_scc_vcc_kernel <<>> (n, hcc, scc, vcc, ldh, eigenvalue, one); + + cudaCheckOnDebug(); +} + +template <> +void refresh_hcc_scc_vcc_op, base_device::DEVICE_GPU>::operator()(const int &n, + std::complex *hcc, + std::complex *scc, + std::complex *vcc, + const int &ldh, + const float *eigenvalue, + const std::complex& one) +{ + int thread = 512; + int block = (n + thread - 1) / thread; + refresh_hcc_scc_vcc_kernel, float> <<>> (n, reinterpret_cast*>(hcc), + reinterpret_cast*>(scc), reinterpret_cast*>(vcc), ldh, eigenvalue, + thrust::complex(one)); + + cudaCheckOnDebug(); +} + +template <> +void refresh_hcc_scc_vcc_op, base_device::DEVICE_GPU>::operator()(const int &n, + std::complex *hcc, + std::complex *scc, + std::complex *vcc, + const int &ldh, + const double *eigenvalue, + const std::complex& one) +{ + int thread = 512; + int block = (n + thread - 1) / thread; + refresh_hcc_scc_vcc_kernel, double> <<>> (n, reinterpret_cast*>(hcc), + reinterpret_cast*>(scc), reinterpret_cast*>(vcc), ldh, eigenvalue, + thrust::complex(one)); + cudaCheckOnDebug(); } @@ -453,4 +524,7 @@ template struct precondition_op; template struct normalize_op, base_device::DEVICE_GPU>; template struct normalize_op, base_device::DEVICE_GPU>; template struct normalize_op; +template struct refresh_hcc_scc_vcc_op, base_device::DEVICE_GPU>; +template struct refresh_hcc_scc_vcc_op, base_device::DEVICE_GPU>; +template struct refresh_hcc_scc_vcc_op; } \ No newline at end of file diff --git a/source/source_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu b/source/source_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu index 095cd4deff..b7bbeb7494 100644 --- a/source/source_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu +++ b/source/source_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu @@ -182,7 +182,7 @@ __global__ void apply_eigenvalues_kernel( { int m = blockIdx.x; int idx = threadIdx.x + blockIdx.y * blockDim.x; - + if (m < notconv && idx < nbase) { result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx]; } @@ -199,7 +199,7 @@ __global__ void precondition_kernel( { int m = blockIdx.x; int i = threadIdx.x + blockIdx.y * blockDim.x; - + if (m < notconv && i < dim) { Real x = abs(precondition[i] - eigenvalues[m]); Real pre = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); @@ -218,17 +218,17 @@ __global__ void normalize_kernel( int m = blockIdx.x; int tid = threadIdx.x; __shared__ Real sum[THREAD_PER_BLOCK]; - + sum[tid] = 0.0; - + // Calculate the sum for normalization for (int i = tid; i < dim; i += THREAD_PER_BLOCK) { auto val = psi_iter[(nbase + m) * dim + i]; sum[tid] += (val * thrust::conj(val)).real(); } - + __syncthreads(); - + // Parallel reduction in shared memory for (int s = THREAD_PER_BLOCK/2; s > 0; s >>= 1) { if (tid < s) { @@ -236,20 +236,39 @@ __global__ void normalize_kernel( } __syncthreads(); } - + Real norm = sqrt(sum[0]); - + // Normalize the vector for (int i = tid; i < dim; i += THREAD_PER_BLOCK) { psi_iter[(nbase + m) * dim + i] /= norm; } - + // Store the norm if needed if (tid == 0 && psi_norm != nullptr) { psi_norm[m] = norm; } } +template +__global__ void refresh_hcc_scc_vcc_kernel( + const int n, + T *hcc, + T *scc, + T *vcc, + const int ldh, + const Real *eigenvalue, + const T one) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + { + hcc[i * ldh + i] = eigenvalue[i]; + scc[i * ldh + i] = one; + vcc[i * ldh + i] = one; + } +} + template void line_minimize_with_block_op::operator()(T* grad_out, T* hgrad_out, @@ -306,15 +325,15 @@ void apply_eigenvalues_op::operator()(const int& nba { const int threads_per_block = 256; const int blocks_per_grid_y = (nbase + threads_per_block - 1) / threads_per_block; - + dim3 grid(notconv, blocks_per_grid_y); - + auto vec_complex = reinterpret_cast*>(vectors); auto res_complex = reinterpret_cast*>(result); - + apply_eigenvalues_kernel<<>>( vec_complex, res_complex, eigenvalues, nbase, nbase_x, notconv); - + hipCheckOnDebug(); } @@ -328,14 +347,14 @@ void precondition_op::operator()(const int& dim, { const int threads_per_block = 256; const int blocks_per_grid_y = (dim + threads_per_block - 1) / threads_per_block; - + dim3 grid(notconv, blocks_per_grid_y); - + auto psi_complex = reinterpret_cast*>(psi_iter); - + precondition_kernel<<>>( psi_complex, precondition, eigenvalues, dim, nbase, notconv); - + hipCheckOnDebug(); } @@ -347,10 +366,62 @@ void normalize_op::operator()(const int& dim, Real* psi_norm) { auto psi_complex = reinterpret_cast*>(psi_iter); - + normalize_kernel<<>>( psi_complex, psi_norm, dim, nbase, notconv); - + + hipCheckOnDebug(); +} + +template <> +void refresh_hcc_scc_vcc_op::operator()(const int &n, + double *hcc, + double *scc, + double *vcc, + const int &ldh, + const double *eigenvalue, + const double& one) +{ + int thread = 512; + int block = (n + thread - 1) / thread; + refresh_hcc_scc_vcc_kernel <<>> (n, hcc, scc, vcc, ldh, eigenvalue, one); + + hipCheckOnDebug(); +} + +template <> +void refresh_hcc_scc_vcc_op, base_device::DEVICE_GPU>::operator()(const int &n, + std::complex *hcc, + std::complex *scc, + std::complex *vcc, + const int &ldh, + const float *eigenvalue, + const std::complex& one) +{ + int thread = 512; + int block = (n + thread - 1) / thread; + refresh_hcc_scc_vcc_kernel, float> <<>> (n, reinterpret_cast*>(hcc), + reinterpret_cast*>(scc), reinterpret_cast*>(vcc), ldh, eigenvalue, + thrust::complex(one)); + + hipCheckOnDebug(); +} + +template <> +void refresh_hcc_scc_vcc_op, base_device::DEVICE_GPU>::operator()(const int &n, + std::complex *hcc, + std::complex *scc, + std::complex *vcc, + const int &ldh, + const double *eigenvalue, + const std::complex& one) +{ + int thread = 512; + int block = (n + thread - 1) / thread; + refresh_hcc_scc_vcc_kernel, double> <<>> (n, reinterpret_cast*>(hcc), + reinterpret_cast*>(scc), reinterpret_cast*>(vcc), ldh, eigenvalue, + thrust::complex(one)); + hipCheckOnDebug(); } @@ -367,4 +438,7 @@ template struct precondition_op; template struct normalize_op, base_device::DEVICE_GPU>; template struct normalize_op, base_device::DEVICE_GPU>; template struct normalize_op; +template struct refresh_hcc_scc_vcc_op, base_device::DEVICE_GPU>; +template struct refresh_hcc_scc_vcc_op, base_device::DEVICE_GPU>; +template struct refresh_hcc_scc_vcc_op; } \ No newline at end of file From 7d0a07ecafeedc3e89a8dc26f66b6cb94e25daae Mon Sep 17 00:00:00 2001 From: "tianxiang.wang@metax-tech.com" Date: Fri, 5 Sep 2025 06:40:09 +0000 Subject: [PATCH 2/5] =?UTF-8?q?Perf:=20reduce=20memory=20allocation=20and?= =?UTF-8?q?=20copy=20in=20Diago=5FDavSubspace::diag=5Fzhegvx=20Signed-off-?= =?UTF-8?q?by=EF=BC=9ATianxiang=20Wang,=20C?= =?UTF-8?q?ontributed=20under=20MetaX=20Integrated=20Circuits=20(Shanghai)?= =?UTF-8?q?=20Co.,=20Ltd.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/source_hsolver/diago_dav_subspace.cpp | 33 +++---------------- source/source_hsolver/diago_dav_subspace.h | 1 + .../source_hsolver/kernels/cuda/dngvd_op.cu | 2 +- .../kernels/rocm/dngvd_op.hip.cu | 12 +++---- 4 files changed, 13 insertions(+), 35 deletions(-) diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp index fe89b23377..7a78cef163 100644 --- a/source/source_hsolver/diago_dav_subspace.cpp +++ b/source/source_hsolver/diago_dav_subspace.cpp @@ -76,6 +76,7 @@ Diago_DavSubspace::Diago_DavSubspace(const std::vector& precond { resmem_real_op()(this->d_precondition, nbasis_in); // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in); + base_device::memory::resize_memory_op()(this->d_scc, this->nbase_x * this->nbase_x); resmem_real_op()(this->d_eigenvalue, this->nbase_x); } #endif @@ -95,6 +96,7 @@ Diago_DavSubspace::~Diago_DavSubspace() if (this->device == base_device::GpuDevice) { delmem_real_op()(this->d_precondition); + delmem_complex_op()(this->d_scc); delmem_real_op()(this->d_eigenvalue); } #endif @@ -546,34 +548,9 @@ void Diago_DavSubspace::diag_zhegvx(const int& nbase, #if defined(__CUDA) || defined(__ROCM) if (this->diag_comm.rank == 0) { - Real* eigenvalue_gpu = nullptr; - resmem_real_op()(eigenvalue_gpu, this->nbase_x); - - syncmem_var_h2d_op()(eigenvalue_gpu, (*eigenvalue_iter).data(), this->nbase_x); - - T* hcc_gpu = nullptr; - T* scc_gpu = nullptr; - T* vcc_gpu = nullptr; - base_device::memory::resize_memory_op()(hcc_gpu, nbase * nbase); - base_device::memory::resize_memory_op()(scc_gpu, nbase * nbase); - base_device::memory::resize_memory_op()(vcc_gpu, nbase * nbase); - for(int i=0;i()(hcc_gpu + i * nbase, hcc + i * nbase_x, nbase); - base_device::memory::synchronize_memory_op()(scc_gpu + i * nbase, scc + i * nbase_x, nbase); - } - dngvd_op()(this->ctx, nbase, nbase, hcc_gpu, scc_gpu, eigenvalue_gpu, vcc_gpu); - for(int i=0;i()(vcc + i * nbase_x, vcc_gpu + i * nbase, nbase); - } - delmem_complex_op()(hcc_gpu); - delmem_complex_op()(scc_gpu); - delmem_complex_op()(vcc_gpu); - - syncmem_var_d2h_op()((*eigenvalue_iter).data(), eigenvalue_gpu, this->nbase_x); - - delmem_real_op()(eigenvalue_gpu); + base_device::memory::synchronize_memory_op()(this->d_scc, scc, nbase * this->nbase_x); + dngvd_op()(this->ctx, nbase, this->nbase_x, this->hcc, this->d_scc, this->d_eigenvalue, this->vcc); + syncmem_var_d2h_op()((*eigenvalue_iter).data(), this->d_eigenvalue, this->nbase_x); } #endif } diff --git a/source/source_hsolver/diago_dav_subspace.h b/source/source_hsolver/diago_dav_subspace.h index 3d76bbf225..ba5cf34e50 100644 --- a/source/source_hsolver/diago_dav_subspace.h +++ b/source/source_hsolver/diago_dav_subspace.h @@ -94,6 +94,7 @@ class Diago_DavSubspace /// Eigenvectors on the reduced basis T* vcc = nullptr; + T* d_scc = nullptr; Real* d_eigenvalue = nullptr; /// device type of psi diff --git a/source/source_hsolver/kernels/cuda/dngvd_op.cu b/source/source_hsolver/kernels/cuda/dngvd_op.cu index 4ce3d9a1d0..5c1ef8ccab 100644 --- a/source/source_hsolver/kernels/cuda/dngvd_op.cu +++ b/source/source_hsolver/kernels/cuda/dngvd_op.cu @@ -216,7 +216,7 @@ struct dngvd_op Real* W, // eigenvalue T* V) { - assert(nstart == ldh); + // assert(nstart == ldh); // A to V cudaErrcheck(cudaMemcpy(V, A, sizeof(T) * ldh * nstart, cudaMemcpyDeviceToDevice)); xhegvd_wrapper(CUBLAS_FILL_MODE_UPPER, nstart, V, ldh, diff --git a/source/source_hsolver/kernels/rocm/dngvd_op.hip.cu b/source/source_hsolver/kernels/rocm/dngvd_op.hip.cu index a359ccda87..6730e0f936 100644 --- a/source/source_hsolver/kernels/rocm/dngvd_op.hip.cu +++ b/source/source_hsolver/kernels/rocm/dngvd_op.hip.cu @@ -8,7 +8,7 @@ namespace hsolver { // NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op static hipsolverHandle_t hipsolver_H = nullptr; -// Test on DCU platform. When nstart is greater than 234, code on DCU performs better. +// Test on DCU platform. When nstart is greater than 234, code on DCU performs better. const int N_DCU = 234; void createGpuSolverHandle() { @@ -97,7 +97,7 @@ void dngvd_op::operator()(const base_device::DE hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); } - + } #endif // __LCAO @@ -112,7 +112,7 @@ void dngvd_op, base_device::DEVICE_GPU>::operator()(const ba { // copied from ../cuda/dngvd_op.cu, "dngvd_op" assert(nstart == ldh); - + if (nstart > N_DCU){ hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex) * ldh * nstart, hipMemcpyDeviceToDevice)); // now vcc contains hcc @@ -170,7 +170,7 @@ void dngvd_op, base_device::DEVICE_GPU>::operator()(const ba hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice)); } - + } template <> @@ -184,7 +184,7 @@ void dngvd_op, base_device::DEVICE_GPU>::operator()(const b ) { // copied from ../cuda/dngvd_op.cu, "dngvd_op" - assert(nstart == ldh); + // assert(nstart == ldh); // save a copy of scc in case the diagonalization fails if (nstart > N_DCU){ @@ -253,7 +253,7 @@ void dngvd_op, base_device::DEVICE_GPU>::operator()(const b - + } #ifdef __LCAO From 675dd233c6911fd5dd297d2d072e322459313acb Mon Sep 17 00:00:00 2001 From: "tianxiang.wang@metax-tech.com" Date: Mon, 8 Sep 2025 01:45:03 +0000 Subject: [PATCH 3/5] =?UTF-8?q?Perf:=20Replace=20loop-based=202D=20copy=20?= =?UTF-8?q?and=20memset=20with=20memcpy=5F2d=5Fop,=20memset=5F2d=5Fop=20Si?= =?UTF-8?q?gned-off-by=EF=BC=9ATianxiang=20Wang,=20Contributed=20under=20MetaX=20Integrated=20Circuits=20(?= =?UTF-8?q?Shanghai)=20Co.,=20Ltd.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../module_device/cuda/memory_op.cu | 68 +++++++++++ .../source_base/module_device/memory_op.cpp | 113 ++++++++++++++++++ source/source_base/module_device/memory_op.h | 113 +++++++++++++++++- source/source_hsolver/diago_dav_subspace.cpp | 21 +--- source/source_hsolver/diago_dav_subspace.h | 2 + 5 files changed, 300 insertions(+), 17 deletions(-) diff --git a/source/source_base/module_device/cuda/memory_op.cu b/source/source_base/module_device/cuda/memory_op.cu index 43eae7975c..ecc972c3ba 100644 --- a/source/source_base/module_device/cuda/memory_op.cu +++ b/source/source_base/module_device/cuda/memory_op.cu @@ -85,6 +85,16 @@ void set_memory_op::operator()(FPTYPE* arr, cudaErrcheck(cudaMemset(arr, var, sizeof(FPTYPE) * size)); } +template +void set_memory_2d_op::operator()(FPTYPE* arr, + const size_t pitch, + const int var, + const size_t width, + const size_t height) +{ + cudaErrcheck(cudaMemset2D(arr, sizeof(FPTYPE) * pitch , var, sizeof(FPTYPE) * width, height)); +} + template void synchronize_memory_op::operator()( FPTYPE* arr_out, @@ -112,6 +122,42 @@ void synchronize_memory_op +void synchronize_memory_2d_op::operator()( + FPTYPE* arr_out, + const size_t dpitch, + const FPTYPE* arr_in, + const size_t spitch, + const size_t width, + const size_t height) +{ + cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyDeviceToHost)); +} + +template +void synchronize_memory_2d_op::operator()( + FPTYPE* arr_out, + const size_t dpitch, + const FPTYPE* arr_in, + const size_t spitch, + const size_t width, + const size_t height) +{ + cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyHostToDevice)); +} + +template +void synchronize_memory_2d_op::operator()( + FPTYPE* arr_out, + const size_t dpitch, + const FPTYPE* arr_in, + const size_t spitch, + const size_t width, + const size_t height) +{ + cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyDeviceToDevice)); +} + template struct cast_memory_op { @@ -196,6 +242,12 @@ template struct set_memory_op; template struct set_memory_op, base_device::DEVICE_GPU>; template struct set_memory_op, base_device::DEVICE_GPU>; +template struct set_memory_2d_op; +template struct set_memory_2d_op; +template struct set_memory_2d_op; +template struct set_memory_2d_op, base_device::DEVICE_GPU>; +template struct set_memory_2d_op, base_device::DEVICE_GPU>; + template struct synchronize_memory_op; template struct synchronize_memory_op; template struct synchronize_memory_op; @@ -212,6 +264,22 @@ template struct synchronize_memory_op, base_device::DEVICE_ template struct synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>; template struct synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>; +template struct synchronize_memory_2d_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>; +template struct synchronize_memory_2d_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>; +template struct synchronize_memory_2d_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>; +template struct synchronize_memory_2d_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>; +template struct synchronize_memory_2d_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>; + template struct cast_memory_op; template struct cast_memory_op; template struct cast_memory_op; diff --git a/source/source_base/module_device/memory_op.cpp b/source/source_base/module_device/memory_op.cpp index 189014c981..155bf46983 100644 --- a/source/source_base/module_device/memory_op.cpp +++ b/source/source_base/module_device/memory_op.cpp @@ -55,6 +55,18 @@ struct set_memory_op } }; +template +struct set_memory_2d_op +{ + void operator()(FPTYPE* arr, const size_t pitch, const int var, const size_t width, const size_t height) + { + for (size_t i = 0; i < height; i++){ + set_memory_op()(arr + i * pitch, var, width); + } + } +}; + + template struct synchronize_memory_op { @@ -70,6 +82,23 @@ struct synchronize_memory_op +struct synchronize_memory_2d_op +{ + void operator()(FPTYPE* arr_out, + const size_t dpitch, + const FPTYPE* arr_in, + const size_t spitch, + const size_t width, + const size_t height) + { + for (int i = 0; i < height; i++){ + synchronize_memory_op()( + arr_out + i * dpitch, arr_in + i * spitch, width); + } + } +}; + template struct cast_memory_op { @@ -108,12 +137,24 @@ template struct set_memory_op; template struct set_memory_op, base_device::DEVICE_CPU>; template struct set_memory_op, base_device::DEVICE_CPU>; +template struct set_memory_2d_op; +template struct set_memory_2d_op; +template struct set_memory_2d_op; +template struct set_memory_2d_op, base_device::DEVICE_CPU>; +template struct set_memory_2d_op, base_device::DEVICE_CPU>; + template struct synchronize_memory_op; template struct synchronize_memory_op; template struct synchronize_memory_op; template struct synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_CPU>; template struct synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_CPU>; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op, base_device::DEVICE_CPU, base_device::DEVICE_CPU>; +template struct synchronize_memory_2d_op, base_device::DEVICE_CPU, base_device::DEVICE_CPU>; + template struct cast_memory_op; template struct cast_memory_op; template struct cast_memory_op; @@ -167,6 +208,14 @@ struct set_memory_op } }; +template +struct set_memory_2d_op +{ + void operator()(FPTYPE* arr, const size_t pitch, const int var, const size_t width, const size_t height) + { + } +}; + template struct synchronize_memory_op { @@ -197,6 +246,48 @@ struct synchronize_memory_op +struct synchronize_memory_2d_op +{ + void operator()(const Device_in* dev_in, + FPTYPE* arr_out, + const size_t dpitch, + const FPTYPE* arr_in, + const size_t spitch, + const size_t width, + const size_t height) + { + } +}; + +template +struct synchronize_memory_2d_op +{ + void operator()(const Device_in* dev_in, + FPTYPE* arr_out, + const size_t dpitch, + const FPTYPE* arr_in, + const size_t spitch, + const size_t width, + const size_t height) + { + } +}; + +template +struct synchronize_memory_2d_op +{ + void operator()(const Device_in* dev_in, + FPTYPE* arr_out, + const size_t dpitch, + const FPTYPE* arr_in, + const size_t spitch, + const size_t width, + const size_t height) + { + } +}; + template struct cast_memory_op { @@ -247,6 +338,12 @@ template struct set_memory_op; template struct set_memory_op, base_device::DEVICE_GPU>; template struct set_memory_op, base_device::DEVICE_GPU>; +template struct set_memory_2d_op; +template struct set_memory_2d_op; +template struct set_memory_2d_op; +template struct set_memory_2d_op, base_device::DEVICE_GPU>; +template struct set_memory_2d_op, base_device::DEVICE_GPU>; + template struct synchronize_memory_op; template struct synchronize_memory_op; template struct synchronize_memory_op; @@ -263,6 +360,22 @@ template struct synchronize_memory_op, base_device::DEVICE_ template struct synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>; template struct synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op; +template struct synchronize_memory_2d_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>; +template struct synchronize_memory_2d_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>; +template struct synchronize_memory_2d_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>; +template struct synchronize_memory_2d_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>; +template struct synchronize_memory_2d_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>; +template struct synchronize_memory_2d_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>; + template struct cast_memory_op; template struct cast_memory_op; template struct cast_memory_op; diff --git a/source/source_base/module_device/memory_op.h b/source/source_base/module_device/memory_op.h index 30182649d3..c24acbb024 100644 --- a/source/source_base/module_device/memory_op.h +++ b/source/source_base/module_device/memory_op.h @@ -40,6 +40,22 @@ struct set_memory_op void operator()(FPTYPE* arr, const int var, const size_t size); }; +template +struct set_memory_2d_op +{ + /// @brief memset2D for multi-device + /// + /// Input Parameters + /// \param var : the specified constant value + /// \param pitch : Pitch in elements of 2D device memory + /// \param width : Width of matrix set (columns in elements) + /// \param height : Height of matrix set (rows) + /// + /// Output Parameters + /// \param arr : output array initialized by the input value + void operator()(FPTYPE* arr, const size_t pitch, const int var, const size_t width, const size_t height); +}; + template struct synchronize_memory_op { @@ -56,6 +72,28 @@ struct synchronize_memory_op const size_t size); }; +template +struct synchronize_memory_2d_op +{ + /// @brief memcpy2D for multi-device + /// + /// Input Parameters + /// \param arr_in : input array + /// \param dpitch : Pitch in elements of destination memory + /// \param spitch : Pitch in elements of source memory + /// \param width : Width of matrix transfer (columns in elements) + /// \param height : Height of matrix transfer (rows) + /// + /// Output Parameters + /// \param arr_out : output array initialized by the input array + void operator()(FPTYPE* arr_out, + const size_t dpitch, + const FPTYPE* arr_in, + const size_t spitch, + const size_t width, + const size_t height); +}; + template struct cast_memory_op { @@ -113,6 +151,12 @@ struct set_memory_op void operator()(FPTYPE* arr, const int var, const size_t size); }; +template +struct set_memory_2d_op +{ + void operator()(FPTYPE* arr, const size_t pitch, const int var, const size_t width, const size_t height); +}; + template struct synchronize_memory_op { @@ -133,7 +177,38 @@ struct synchronize_memory_op +struct synchronize_memory_2d_op +{ + void operator()(FPTYPE* arr_out, + const size_t dpitch, + const FPTYPE* arr_in, + const size_t spitch, + const size_t width, + const size_t height); +}; +template +struct synchronize_memory_2d_op +{ + void operator()(FPTYPE* arr_out, + const size_t dpitch, + const FPTYPE* arr_in, + const size_t spitch, + const size_t width, + const size_t height); +}; +template +struct synchronize_memory_2d_op +{ + void operator()(FPTYPE* arr_out, + const size_t dpitch, + const FPTYPE* arr_in, + const size_t spitch, + const size_t width, + const size_t height); }; template @@ -194,6 +269,16 @@ using setmem_dd_op = base_device::memory::set_memory_op, base_device::DEVICE_GPU>; using setmem_zd_op = base_device::memory::set_memory_op, base_device::DEVICE_GPU>; +using setmem_sh_2d_op = base_device::memory::set_memory_2d_op; +using setmem_dh_2d_op = base_device::memory::set_memory_2d_op; +using setmem_ch_2d_op = base_device::memory::set_memory_2d_op, base_device::DEVICE_CPU>; +using setmem_zh_2d_op = base_device::memory::set_memory_2d_op, base_device::DEVICE_CPU>; + +using setmem_sd_2d_op = base_device::memory::set_memory_2d_op; +using setmem_dd_2d_op = base_device::memory::set_memory_2d_op; +using setmem_cd_2d_op = base_device::memory::set_memory_2d_op, base_device::DEVICE_GPU>; +using setmem_zd_2d_op = base_device::memory::set_memory_2d_op, base_device::DEVICE_GPU>; + using delmem_sh_op = base_device::memory::delete_memory_op; using delmem_dh_op = base_device::memory::delete_memory_op; using delmem_ch_op = base_device::memory::delete_memory_op, base_device::DEVICE_CPU>; @@ -230,6 +315,32 @@ using syncmem_z2z_h2d_op using syncmem_z2z_d2h_op = base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>; +using syncmem_c2c_h2h_op + = base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_CPU>; +using syncmem_c2c_h2d_op + = base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>; +using syncmem_c2c_d2h_op + = base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>; +using syncmem_z2z_h2h_op + = base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_CPU>; +using syncmem_z2z_h2d_op + = base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>; +using syncmem_z2z_d2h_op + = base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>; + +using syncmem_c2c_h2h_2d_op + = base_device::memory::synchronize_memory_2d_op, base_device::DEVICE_CPU, base_device::DEVICE_CPU>; +using syncmem_c2c_h2d_2d_op + = base_device::memory::synchronize_memory_2d_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>; +using syncmem_c2c_d2h_2d_op + = base_device::memory::synchronize_memory_2d_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>; +using syncmem_z2z_h2h_2d_op + = base_device::memory::synchronize_memory_2d_op, base_device::DEVICE_CPU, base_device::DEVICE_CPU>; +using syncmem_z2z_h2d_2d_op + = base_device::memory::synchronize_memory_2d_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>; +using syncmem_z2z_d2h_2d_op + = base_device::memory::synchronize_memory_2d_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>; + using castmem_s2d_h2h_op = base_device::memory::cast_memory_op; using castmem_s2d_h2d_op diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp index 7a78cef163..d427f246aa 100644 --- a/source/source_hsolver/diago_dav_subspace.cpp +++ b/source/source_hsolver/diago_dav_subspace.cpp @@ -129,13 +129,10 @@ int Diago_DavSubspace::diag_once(const HPsiFunc& hpsi_func, ModuleBase::timer::tick("Diago_DavSubspace", "first"); + syncmem_complex_2d_op()(this->psi_in_iter, this->dim, psi_in, psi_in_dmax, this->dim, this->n_band); for (int m = 0; m < this->n_band; m++) { unconv[m] = m; - - syncmem_complex_op()(this->psi_in_iter + m * this->dim, - psi_in + m * psi_in_dmax, - this->dim); } // compute h*psi_in_iter @@ -247,12 +244,7 @@ int Diago_DavSubspace::diag_once(const HPsiFunc& hpsi_func, // estimate of the eigenvectors and set the basis dimension to N; // update this->psi_in_iter according to psi_in - for (size_t i = 0; i < this->n_band; i++) - { - syncmem_complex_op()(this->psi_in_iter + i * this->dim, - psi_in + i * psi_in_dmax, - this->dim); - } + syncmem_complex_2d_op()(this->psi_in_iter, this->dim, psi_in, psi_in_dmax, this->dim, this->n_band); this->refresh(this->dim, this->n_band, @@ -722,12 +714,9 @@ void Diago_DavSubspace::refresh(const int& dim, nbase = nband; // set hcc/scc/vcc to 0 - for (size_t i = 0; i < nbase; i++) - { - setmem_complex_op()(&hcc[this->nbase_x * i], 0, nbase); - setmem_complex_op()(&scc[this->nbase_x * i], 0, nbase); - setmem_complex_op()(&vcc[this->nbase_x * i], 0, nbase); - } + setmem_complex_2d_op()(hcc, this->nbase_x, 0, nbase, nbase); + setmem_complex_2d_op()(scc, this->nbase_x, 0, nbase, nbase); + setmem_complex_2d_op()(vcc, this->nbase_x, 0, nbase, nbase); if (this->device == base_device::GpuDevice) { diff --git a/source/source_hsolver/diago_dav_subspace.h b/source/source_hsolver/diago_dav_subspace.h index ba5cf34e50..a2227949d4 100644 --- a/source/source_hsolver/diago_dav_subspace.h +++ b/source/source_hsolver/diago_dav_subspace.h @@ -179,6 +179,7 @@ class Diago_DavSubspace using delmem_real_op = base_device::memory::delete_memory_op; #endif using setmem_real_op = base_device::memory::set_memory_op; + using setmem_complex_2d_op = base_device::memory::set_memory_2d_op; using resmem_real_h_op = base_device::memory::resize_memory_op; using delmem_real_h_op = base_device::memory::delete_memory_op; @@ -187,6 +188,7 @@ class Diago_DavSubspace using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op; using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op; using syncmem_complex_op = base_device::memory::synchronize_memory_op; + using syncmem_complex_2d_op = base_device::memory::synchronize_memory_2d_op; using castmem_complex_op = base_device::memory::cast_memory_op, T, Device, Device>; using syncmem_h2d_op = base_device::memory::synchronize_memory_op; using syncmem_d2h_op = base_device::memory::synchronize_memory_op; From 4d270a6575539db3303dd3a6232a22c45c7e0589 Mon Sep 17 00:00:00 2001 From: "tianxiang.wang@metax-tech.com" Date: Mon, 8 Sep 2025 03:29:39 +0000 Subject: [PATCH 4/5] =?UTF-8?q?Perf:=20use=20warp=20reduce=20instead=20of?= =?UTF-8?q?=20shared=20memory=20for=20better=20efficiency=20Signed-off-by?= =?UTF-8?q?=EF=BC=9ATianxiang=20Wang,=20Con?= =?UTF-8?q?tributed=20under=20MetaX=20Integrated=20Circuits=20(Shanghai)?= =?UTF-8?q?=20Co.,=20Ltd.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kernels/cuda/bpcg_kernel_op.cu | 67 ++++++++++++------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu b/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu index a8f45ad886..b0a08cc513 100644 --- a/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu +++ b/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu @@ -7,6 +7,8 @@ namespace hsolver { const int warp_size = 32; const int thread_per_block = 256; +#define FULL_MASK 0xffffffff +#define WARP_SIZE 32 template __global__ void line_minimize_with_block( @@ -282,6 +284,37 @@ __global__ void precondition_kernel( } } +template +__device__ Real warpReduceSum(Real val) { + for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) + val += __shfl_down_sync(FULL_MASK, val, offset); + return val; +} + +template +__device__ Real blockReduceSum(Real val, volatile Real* shared) { + int lane = threadIdx.x % WARP_SIZE; + int wid = threadIdx.x / WARP_SIZE; + + val = warpReduceSum(val); + + if (lane == 0) + shared[wid] = val; + + __syncthreads(); + + Real sum = 0.0; + if (wid == 0) { + sum = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0; + sum = warpReduceSum(sum); + if (lane == 0) shared[0] = sum; + } + + __syncthreads(); + return shared[0]; +} + + template __global__ void normalize_kernel( thrust::complex* psi_iter, @@ -292,38 +325,19 @@ __global__ void normalize_kernel( { int m = blockIdx.x; int tid = threadIdx.x; - __shared__ Real sum[thread_per_block]; + extern __shared__ char s_char[]; + Real* shared = reinterpret_cast(s_char); - sum[tid] = 0.0; + Real local_sum = 0.0; // Calculate the sum for normalization for (int i = tid; i < dim; i += thread_per_block) { auto val = psi_iter[(nbase + m) * dim + i]; - sum[tid] += (val * thrust::conj(val)).real(); + local_sum += (val * thrust::conj(val)).real(); } - __syncthreads(); - - // Parallel reduction in shared memory - for (int s = thread_per_block/2; s > warp_size; s >>= 1) { - if (tid < s) { - sum[tid] += sum[tid + s]; - } - __syncthreads(); - } - - if (tid < warp_size) { - sum[tid] += sum[tid + 32]; __syncwarp(); - sum[tid] += sum[tid + 16]; __syncwarp(); - sum[tid] += sum[tid + 8]; __syncwarp(); - sum[tid] += sum[tid + 4]; __syncwarp(); - sum[tid] += sum[tid + 2]; __syncwarp(); - sum[tid] += sum[tid + 1]; __syncwarp(); - } - - __syncthreads(); - - Real norm = sqrt(sum[0]); + Real l2_sq = blockReduceSum(local_sum, shared); + Real norm = sqrt(l2_sq); // Normalize the vector for (int i = tid; i < dim; i += thread_per_block) { @@ -452,8 +466,9 @@ void normalize_op::operator()(const int& dim, Real* psi_norm) { auto psi_complex = reinterpret_cast*>(psi_iter); + int sharedMemSize = (thread_per_block / WARP_SIZE) * sizeof(Real); - normalize_kernel<<>>( + normalize_kernel<<>>( psi_complex, psi_norm, dim, nbase, notconv); cudaCheckOnDebug(); From 53323929cef6e150c45e3f8a459f57871564fe27 Mon Sep 17 00:00:00 2001 From: "tianxiang.wang@metax-tech.com" Date: Mon, 8 Sep 2025 06:52:33 +0000 Subject: [PATCH 5/5] =?UTF-8?q?Fix=20compilation=20error=20Signed-off-by?= =?UTF-8?q?=EF=BC=9ATianxiang=20Wang,=20Con?= =?UTF-8?q?tributed=20under=20MetaX=20Integrated=20Circuits=20(Shanghai)?= =?UTF-8?q?=20Co.,=20Ltd.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/source_base/module_device/memory_op.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/source/source_base/module_device/memory_op.cpp b/source/source_base/module_device/memory_op.cpp index 155bf46983..da22bc2ce6 100644 --- a/source/source_base/module_device/memory_op.cpp +++ b/source/source_base/module_device/memory_op.cpp @@ -249,8 +249,7 @@ struct synchronize_memory_op struct synchronize_memory_2d_op { - void operator()(const Device_in* dev_in, - FPTYPE* arr_out, + void operator()(FPTYPE* arr_out, const size_t dpitch, const FPTYPE* arr_in, const size_t spitch, @@ -263,8 +262,7 @@ struct synchronize_memory_2d_op struct synchronize_memory_2d_op { - void operator()(const Device_in* dev_in, - FPTYPE* arr_out, + void operator()(FPTYPE* arr_out, const size_t dpitch, const FPTYPE* arr_in, const size_t spitch, @@ -277,8 +275,7 @@ struct synchronize_memory_2d_op struct synchronize_memory_2d_op { - void operator()(const Device_in* dev_in, - FPTYPE* arr_out, + void operator()(FPTYPE* arr_out, const size_t dpitch, const FPTYPE* arr_in, const size_t spitch,