From 76233fe781495a2dc83ffe651c1a79b602c538fa Mon Sep 17 00:00:00 2001 From: shakedregev Date: Wed, 21 May 2025 14:45:24 +0000 Subject: [PATCH 1/8] added diagonal scaling, compiles, need to write tests --- resolve/cuda/cudaKernels.cu | 187 +++++++++++++++++++++------ resolve/cuda/cudaKernels.h | 33 +++-- resolve/hip/hipKernels.h | 43 +++--- resolve/hip/hipKernels.hip | 127 +++++++++--------- resolve/matrix/MatrixHandler.cpp | 66 ++++++++++ resolve/matrix/MatrixHandler.hpp | 6 + resolve/matrix/MatrixHandlerCpu.cpp | 59 ++++++++- resolve/matrix/MatrixHandlerCpu.hpp | 4 + resolve/matrix/MatrixHandlerCuda.cpp | 48 +++++++ resolve/matrix/MatrixHandlerCuda.hpp | 2 + resolve/matrix/MatrixHandlerHip.hpp | 2 + resolve/matrix/MatrixHandlerImpl.hpp | 4 + resolve/vector/Vector.cpp | 132 +++++++++---------- 13 files changed, 518 insertions(+), 195 deletions(-) diff --git a/resolve/cuda/cudaKernels.cu b/resolve/cuda/cudaKernels.cu index cbe03f09b..1bfa485a5 100644 --- a/resolve/cuda/cudaKernels.cu +++ b/resolve/cuda/cudaKernels.cu @@ -1,10 +1,10 @@ /** * @file cudaKernels.cu * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov) - * @brief + * @brief * @date 2023-12-08 - * - * + * + * */ #include "cudaKernels.h" @@ -17,9 +17,9 @@ namespace ReSolve { /** * @brief Computes v^T * [u1 u2] where v is n x k multivector * and u1 and u2 are n x 1 vectors. - * + * * @tparam Tv5 - Size of shared memory - * + * * @param[in] u1 - (n x 1) vector * @param[in] u2 - (n x 1) vector * @param[in] v - (n x k) multivector @@ -27,12 +27,12 @@ namespace ReSolve { * @param[in] k - dimension of the subspace * @param[in] N - size of vectors u1, u2 */ - template - __global__ void MassIPTwoVec(const real_type* __restrict__ u1, - const real_type* __restrict__ u2, - const real_type* __restrict__ v, + template + __global__ void MassIPTwoVec(const real_type* __restrict__ u1, + const real_type* __restrict__ u2, + const real_type* __restrict__ v, real_type* result, - const index_type k, + const index_type k, const index_type N) { index_type t = threadIdx.x; @@ -119,16 +119,16 @@ namespace ReSolve { /** * @brief AXPY y = y - x*alpha where alpha is [k x 1], and x is [N x k] needed in 1 and 2 synch GMRES - * - * @tparam Tmaxk - * + * + * @tparam Tmaxk + * * @param[in] N - number of rows in x * @param[in] k - number of columns in x * @param[in] x_data - double array, size [N x k] * @param[out] y_data - double array, size [N x 1] * @param[in] alpha - doble array, size [k x 1] */ - template + template __global__ void massAxpy3(index_type N, index_type k, const real_type* x_data, @@ -155,17 +155,17 @@ namespace ReSolve { /** * @brief Pass through matrix rows and sum each as \sum_{j=1}^m abs(a_{ij}) - * + * * @param[in] n - number of rows in the matrix. * @param[in] nnz - number of non-zero values in the matrix * @param[in] a_ia - row pointers (CSR storage) * @param[in] a_val - values (CSR storage) * @param[out] result - array size [n x 1] containing sums of values in each row. */ - __global__ void matrixInfNormPart1(const index_type n, - const index_type nnz, + __global__ void matrixInfNormPart1(const index_type n, + const index_type nnz, const index_type* a_ia, - const real_type* a_val, + const real_type* a_val, real_type* result) { index_type idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -178,6 +178,71 @@ namespace ReSolve { idx += (blockDim.x * gridDim.x); } } + /** + * @brief Scales a csr matrix on the left by a diagonal matrix + * + * @param[in] n - number of rows in the matrix + * @param[in] a_row_ptr - row pointers (CSR storage) + * @param[in, out] a_val - values (CSR storage). Changes in place. + * @param[in] d_val - diagonal values + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + __global__ void leftDiagScale(index_type n, + const index_type* a_row_ptr, + real_type* a_val, + const real_type* d_val) + { + // Get row index from thread and block indices + index_type row = blockIdx.x * blockDim.x + threadIdx.x; + + // Check if the thread's row is within matrix bounds + if (row < n) { + // Get the start and end positions for this row in the CSR format + index_type row_start = a_row_ptr[row]; + index_type row_end = a_row_ptr[row + 1]; + + // Get the scaling factor for this row from the diagonal matrix + real_type scale = d_val[row]; + + // Scale all non-zero elements in this row + for (index_type i = row_start; i < row_end; i++) { + a_val[i] *= scale; + } + } + } + /** + * @brief Scales a csr matrix on the right by a diagonal matrix + * + * @param[in] n - number of rows in the matrix + * @param[in] a_row_ptr - row pointers (CSR storage) + * @param[in] a_col_ind - column indices (CSR storage) + * @param[in, out] a_val - values (CSR storage). Changes in place. + * @param[in] d_val - diagonal values + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + __global__ void rightDiagScale(index_type n, + const index_type* a_row_ptr, + const index_type* a_col_ind, + real_type* a_val, + const real_type* d_val) + { + // Get row index from thread and block indices + index_type row = blockIdx.x * blockDim.x + threadIdx.x; + + // Check if the thread's row is within matrix bounds + if (row < n) { + // Get the start and end positions for this row in the CSR format + index_type row_start = a_row_ptr[row]; + index_type row_end = a_row_ptr[row + 1]; + + // Scale all non-zero elements in this row + for (index_type i = row_start; i < row_end; i++) { + a_val[i] *= d_val[a_col_ind[i]]; + } + } + } } // namespace kernels @@ -188,24 +253,24 @@ namespace ReSolve { /** * @brief Computes result = mvec^T * [vec1 vec2] - * + * * @param n - size of vectors vec1, vec2 - * @param i - - * @param vec1 - (n x 1) vector + * @param i - + * @param vec1 - (n x 1) vector * @param vec2 - (n x 1) vector * @param mvec - (n x (i+1)) multivector * @param result - ((i+1) x 2) multivector - * + * * @todo Input data should be `const`. * @todo Is it coincidence that the block size is equal to the default * value of Tv5? * @todo Should we use dynamic shared memory here instead? */ - void mass_inner_product_two_vectors(index_type n, - index_type i, - const real_type* vec1, - const real_type* vec2, - const real_type* mvec, + void mass_inner_product_two_vectors(index_type n, + index_type i, + const real_type* vec1, + const real_type* vec2, + const real_type* mvec, real_type* result) { kernels::MassIPTwoVec<<>>(vec1, vec2, mvec, result, i, n); @@ -213,10 +278,10 @@ namespace ReSolve { /** * @brief Computes y := y - x*alpha - * + * * @param[in] n - vector size * @param[in] i - number of vectors in the multivector - * @param[in] x - (n x (i+1)) multivector + * @param[in] x - (n x (i+1)) multivector * @param[out] y - (n x (i+1)) multivector * @param[in] alpha - ((i+1) x 1) vector */ @@ -226,20 +291,70 @@ namespace ReSolve { } /** - * @brief - * + * @brief Wrapper that scales a csr matrix on the left by a diagonal matrix + * + * @param[in] n - number of rows in the matrix + * @param[in] a_row_ptr - row pointers (CSR storage) + * @param[in, out] a_val - values (CSR storage). Changes in place. + * @param[in] d_val - diagonal values + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void cudaLeftDiagScale(index_type n, + const index_type* a_row_ptr, + real_type* a_val, + const real_type* d_val) + { + + // Define block size and number of blocks + const int blockSize = 256; + int numBlocks = (n + blockSize - 1) / blockSize; + + // Launch the kernel + kernels::leftDiagScale<<>>(n, a_row_ptr, a_val, d_val); + } + + /** + * @brief Wrapper that scales a csr matrix on the right by a diagonal matrix + * + * @param[in] n - number of rows in the matrix + * @param[in] a_row_ptr - row pointers (CSR storage) + * @param[in] a_col_ind - column indices (CSR storage) + * @param[in, out] a_val - values (CSR storage). Changes in place. + * @param[in] d_val - diagonal values + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void cudaRightDiagScale(index_type n, + const index_type* a_row_ptr, + const index_type* a_col_ind, + real_type* a_val, + const real_type* d_val) + { + // Define block size and number of blocks + const int blockSize = 256; + int numBlocks = (n + blockSize - 1) / blockSize; + // Launch the kernel + kernels::rightDiagScale<<>>(n, a_row_ptr, a_col_ind, a_val, d_val); + } + + + + /** + * @brief + * * @param[in] n - * @param[in] nnz - - * @param[in] a_ia - + * @param[in] a_ia - * @param[in] a_val - * @param[out] result - - * + * * @todo Decide how to allow user to configure grid and block sizes. */ - void matrix_row_sums(index_type n, - index_type nnz, + void matrix_row_sums(index_type n, + index_type nnz, const index_type* a_ia, - const real_type* a_val, + const real_type* a_val, real_type* result) { kernels::matrixInfNormPart1<<<1000, 1024>>>(n, nnz, a_ia, a_val, result); diff --git a/resolve/cuda/cudaKernels.h b/resolve/cuda/cudaKernels.h index 5deb7a6d1..f33153607 100644 --- a/resolve/cuda/cudaKernels.h +++ b/resolve/cuda/cudaKernels.h @@ -1,11 +1,11 @@ /** * @file cudaKernels.h * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov) - * + * * @brief Contains prototypes of CUDA kernels. - * + * * @note These kernels will be used in CUDA specific code, only. - * + * */ #pragma once @@ -13,20 +13,31 @@ namespace ReSolve { - void mass_inner_product_two_vectors(index_type n, - index_type i, - const real_type* vec1, - const real_type* vec2, - const real_type* mvec, + void mass_inner_product_two_vectors(index_type n, + index_type i, + const real_type* vec1, + const real_type* vec2, + const real_type* mvec, real_type* result); void mass_axpy(index_type n, index_type i, const real_type* x, real_type* y, const real_type* alpha); + void cudaLeftDiagScale(index_type n, + const index_type* a_row_ptr, + real_type* a_val, + const real_type* diag); + + void cudaRightDiagScale(index_type n, + const index_type* a_row_ptr, + const index_type* a_col_idx, + real_type* a_val, + const real_type* diag); + //needed for matrix inf nrm - void matrix_row_sums(index_type n, - index_type nnz, + void matrix_row_sums(index_type n, + index_type nnz, const index_type* a_ia, - const real_type* a_val, + const real_type* a_val, real_type* result); } // namespace ReSolve diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h index 9a51449b8..9ad576933 100644 --- a/resolve/hip/hipKernels.h +++ b/resolve/hip/hipKernels.h @@ -3,9 +3,9 @@ * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov) * @brief Contains prototypes of HIP kernels. * @date 2023-12-08 - * + * * @note These kernels will be used in HIP specific code, only. - * + * */ #pragma once @@ -14,36 +14,47 @@ namespace ReSolve { - void mass_inner_product_two_vectors(index_type n, - index_type i, - real_type* vec1, - real_type* vec2, - real_type* mvec, + void mass_inner_product_two_vectors(index_type n, + index_type i, + real_type* vec1, + real_type* vec2, + real_type* mvec, real_type* result); void mass_axpy(index_type n, index_type i, real_type* x, real_type* y, real_type* alpha); + void hipLeftDiagScale(index_type n, + index_type* a_row_ptr, + real_type* a_val, + real_type* diag); + + void hipRightDiagScale(index_type n, + index_type* a_row_ptr, + index_type* a_col_idx, + real_type* a_val, + real_type* diag); + //needed for matrix inf nrm - void matrix_row_sums(index_type n, - index_type nnz, + void matrix_row_sums(index_type n, + index_type nnz, index_type* a_ia, - real_type* a_val, + real_type* a_val, real_type* result); // needed for triangular solve - void permuteVectorP(index_type n, + void permuteVectorP(index_type n, index_type* perm_vector, - real_type* vec_in, + real_type* vec_in, real_type* vec_out); - void permuteVectorQ(index_type n, + void permuteVectorQ(index_type n, index_type* perm_vector, - real_type* vec_in, + real_type* vec_in, real_type* vec_out); - void vector_inf_norm(index_type n, + void vector_inf_norm(index_type n, real_type* input, - real_type * buffer, + real_type * buffer, real_type* result); } // namespace ReSolve diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip index f864b3829..bc9d4b331 100644 --- a/resolve/hip/hipKernels.hip +++ b/resolve/hip/hipKernels.hip @@ -1,10 +1,10 @@ /** * @file hipKernels.hip * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov) - * @brief + * @brief * @date 2023-12-08 - * - * + * + * */ #include "hipKernels.h" @@ -18,9 +18,9 @@ namespace ReSolve { /** * @brief Computes v^T * [u1 u2] where v is n x k multivector * and u1 and u2 are n x 1 vectors. - * + * * @tparam Tv5 - Size of shared memory - * + * * @param[in] u1 - (n x 1) vector * @param[in] u2 - (n x 1) vector * @param[in] v - (n x k) multivector @@ -29,11 +29,11 @@ namespace ReSolve { * @param[in] N - size of vectors u1, u2 */ template - __global__ void MassIPTwoVec_kernel(const real_type* __restrict__ u1, - const real_type* __restrict__ u2, - const real_type* __restrict__ v, + __global__ void MassIPTwoVec_kernel(const real_type* __restrict__ u1, + const real_type* __restrict__ u2, + const real_type* __restrict__ v, real_type* result, - const index_type k, + const index_type k, const index_type N) { index_type t = threadIdx.x; @@ -120,9 +120,9 @@ namespace ReSolve { /** * @brief AXPY y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES - * - * @tparam Tmaxk - * + * + * @tparam Tmaxk + * * @param[in] N - * @param[in] k - * @param[in] x_data - @@ -156,17 +156,17 @@ namespace ReSolve { /** * @brief Pass through matrix rows and sum each as \sum_{j=1}^m abs(a_{ij}) - * + * * @param[in] n - * @param[in] nnz - * @param[in] a_ia - * @param[in] a_val - * @param[out] result - */ - __global__ void matrixInfNormPart1(const index_type n, - const index_type nnz, + __global__ void matrixInfNormPart1(const index_type n, + const index_type nnz, const index_type* a_ia, - const real_type* a_val, + const real_type* a_val, real_type* result) { index_type idx = blockIdx.x*blockDim.x + threadIdx.x; @@ -181,14 +181,14 @@ namespace ReSolve { } /** - * @brief - * + * @brief + * * @param[in] n - vector size * @param[in] input - - * @param[out] result - + * @param[out] result - */ - __global__ void vectorInfNorm(const index_type n, - const real_type* input, + __global__ void vectorInfNorm(const index_type n, + const real_type* input, real_type* result) { @@ -203,7 +203,7 @@ namespace ReSolve { } idx += (blockDim.x*gridDim.x); - + while (idx < n) { local_max = fmax(fabs(input[idx]), local_max); idx += (blockDim.x * gridDim.x); @@ -259,16 +259,16 @@ namespace ReSolve { /** - * @brief - * + * @brief + * * @param n - vector size * @param perm_vector - permutation map * @param vec_in - input vector * @param vec_out - permuted vector */ - __global__ void permuteVectorP_kernel(const index_type n, + __global__ void permuteVectorP_kernel(const index_type n, const index_type* perm_vector, - const real_type* vec_in, + const real_type* vec_in, real_type* vec_out) { //one thread per vector entry, pass through rows @@ -280,16 +280,16 @@ namespace ReSolve { } /** - * @brief - * + * @brief + * * @param n - vector size * @param perm_vector - permutation map * @param vec_in - input vector * @param vec_out - permuted vector */ - __global__ void permuteVectorQ_kernel(const index_type n, + __global__ void permuteVectorQ_kernel(const index_type n, const index_type* perm_vector, - const real_type* vec_in, + const real_type* vec_in, real_type* vec_out) { //one thread per vector entry, pass through rows @@ -301,6 +301,7 @@ namespace ReSolve { } + } // namespace kernels // @@ -309,24 +310,24 @@ namespace ReSolve { /** * @brief Computes result = mvec^T * [vec1 vec2] - * + * * @param n - size of vectors vec1, vec2 - * @param i - - * @param vec1 - (n x 1) vector + * @param i - + * @param vec1 - (n x 1) vector * @param vec2 - (n x 1) vector * @param mvec - (n x (i+1)) multivector * @param result - ((i+1) x 2) multivector - * + * * @todo Input data should be `const`. * @todo Is it coincidence that the block size is equal to the default * value of Tv5? * @todo Should we use dynamic shared memory here instead? */ - void mass_inner_product_two_vectors(index_type n, - index_type i, - real_type* vec1, - real_type* vec2, - real_type* mvec, + void mass_inner_product_two_vectors(index_type n, + index_type i, + real_type* vec1, + real_type* vec2, + real_type* mvec, real_type* result) { hipLaunchKernelGGL(kernels::MassIPTwoVec_kernel, @@ -344,10 +345,10 @@ namespace ReSolve { /** * @brief Computes y := y - x*alpha - * + * * @param[in] n - vector size * @param[in] i - number of vectors in the multivector - * @param[in] x - (n x (i+1)) multivector + * @param[in] x - (n x (i+1)) multivector * @param[out] y - (n x (i+1)) multivector * @param[in] alpha - ((i+1) x 1) vector */ @@ -366,38 +367,38 @@ namespace ReSolve { } /** - * @brief - * + * @brief + * * @param[in] n - * @param[in] nnz - - * @param[in] a_ia - + * @param[in] a_ia - * @param[in] a_val - * @param[out] result - - * + * * @todo Decide how to allow user to configure grid and block sizes. */ - void matrix_row_sums(index_type n, - index_type nnz, + void matrix_row_sums(index_type n, + index_type nnz, index_type* a_ia, - real_type* a_val, + real_type* a_val, real_type* result) { hipLaunchKernelGGL(kernels::matrixInfNormPart1, dim3(1000), dim3(1024), 0, 0, n, nnz, a_ia, a_val, result); } /** - * @brief - * + * @brief + * * @param n - * @param input - - * @param buffer - - * @param result - - * + * @param buffer - + * @param result - + * * @todo Decide how to allow user to configure grid and block sizes. */ - void vector_inf_norm(index_type n, + void vector_inf_norm(index_type n, real_type* input, - real_type* buffer, + real_type* buffer, real_type* result) { hipLaunchKernelGGL(kernels::vectorInfNorm, dim3(1024), dim3(1024), 0, 0, n, input, buffer); @@ -407,16 +408,16 @@ namespace ReSolve { } /** - * @brief - * + * @brief + * * @param n - vector size * @param perm_vector - permutation map * @param vec_in - input vector * @param vec_out - permuted vector */ - void permuteVectorP(index_type n, + void permuteVectorP(index_type n, index_type* perm_vector, - real_type* vec_in, + real_type* vec_in, real_type* vec_out) { hipLaunchKernelGGL(kernels::permuteVectorP_kernel, @@ -431,16 +432,16 @@ namespace ReSolve { } /** - * @brief - * + * @brief + * * @param n - vector size * @param perm_vector - permutation map * @param vec_in - input vector * @param vec_out - permuted vector */ - void permuteVectorQ(index_type n, + void permuteVectorQ(index_type n, index_type* perm_vector, - real_type* vec_in, + real_type* vec_in, real_type* vec_out) { hipLaunchKernelGGL(kernels::permuteVectorQ_kernel, diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp index aa92b0715..323b0d9c9 100644 --- a/resolve/matrix/MatrixHandler.cpp +++ b/resolve/matrix/MatrixHandler.cpp @@ -252,6 +252,72 @@ namespace ReSolve { return 1; } + /** + * @brief Left diagonal scaling of a sparse CSR matrix + * + * @param[in] A - Sparse CSR matrix + * @param[in] diag - vector representing the diagonal matrix + * @param[in] memspace - Device where the operation is computed + * + * @pre The diagonal vector must be of the same size as the number of rows in the matrix. + * @pre A is unscaled and allocated + * @post A is scaled + * @invariant diag + * + * @return 0 if successful, 1 otherwise + */ + int MatrixHandler::leftDiagonalScale(vector_type* diag, matrix::Csr* A, memory::MemorySpace memspace) + { + assert(A->getSparseFormat() == matrix::Sparse::COMPRESSED_SPARSE_ROW && + "Matrix has to be in CSR format for left diagonal scaling.\n"); + assert(diag->getSize() == A->getNumRows() && "Diagonal vector must be of the same size as the number of rows in the matrix."); + assert(A->getValues(memspace) != nullptr && "Matrix values are null!\n"); + assert(diag->getData(memspace) != nullptr && "Diagonal vector data is null!\n"); + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + return cpuImpl_->leftDiagonalScale(diag, A); + break; + case DEVICE: + return devImpl_->leftDiagonalScale(diag, A); + break; + } + return 1; + } + + /** + * @brief Right diagonal scaling of a sparse CSR matrix + * + * @param[in] A - Sparse CSR matrix + * @param[in] diag - vector representing the diagonal matrix + * @param[in] memspace - Device where the operation is computed + * + * @pre The diagonal vector must be of the same size as the number of columns in the matrix. + * @pre A is unscaled and allocated + * @post A is scaled + * @invariant diag + * + * @return 0 if successful, 1 otherwise + */ + int MatrixHandler::rightDiagonalScale(matrix::Csr* A, vector_type* diag, memory::MemorySpace memspace) + { + assert(A->getSparseFormat() == matrix::Sparse::COMPRESSED_SPARSE_ROW && + "Matrix has to be in CSR format for right diagonal scaling.\n"); + assert(diag->getSize() == A->getNumColumns() && "Diagonal vector must be of the same size as the number of columns in the matrix."); + assert(A->getValues(memspace) != nullptr && "Matrix values are null!\n"); + assert(diag->getData(memspace) != nullptr && "Diagonal vector data is null!\n"); + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + return cpuImpl_->rightDiagonalScale(A, diag); + break; + case DEVICE: + return devImpl_->rightDiagonalScale(A, diag); + break; + } + return 1; + } + /** * @brief Add a constant to the nonzero values of a csr matrix. * @param[in,out] A - Sparse matrix diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp index fac69a371..6b99ee52e 100644 --- a/resolve/matrix/MatrixHandler.hpp +++ b/resolve/matrix/MatrixHandler.hpp @@ -56,8 +56,14 @@ namespace ReSolve { int transpose(matrix::Csr* A, matrix::Csr* At, memory::MemorySpace memspace); + int leftDiagonalScale(vector_type* diag, matrix::Csr* A, memory::MemorySpace memspace); + + int rightDiagonalScale(matrix::Csr* A, vector_type* diag, memory::MemorySpace memspace); + void addConst(matrix::Sparse* A, real_type alpha, memory::MemorySpace memspace); + + /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped int matvec(matrix::Sparse* A, vector_type* vec_x, diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp index 9af71085d..529081e9c 100644 --- a/resolve/matrix/MatrixHandlerCpu.cpp +++ b/resolve/matrix/MatrixHandlerCpu.cpp @@ -140,10 +140,10 @@ namespace ReSolve * * @authors Slaven Peles , Daniel Reynolds (SMU), and * David Gardner and Carol Woodward (LLNL) - * + * * @param[in] A_csc - pointer to the CSC matrix - * @param[out] A_csr - pointer to an allocated CSR matrix - * + * @param[out] A_csr - pointer to an allocated CSR matrix + * * @return 0 if successful, 1 otherwise */ int MatrixHandlerCpu::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) @@ -291,6 +291,59 @@ namespace ReSolve return 0; } + /** + * @brief Left diagonal scaling of a sparse CSR matrix + * + * @param[in] diag - vector representing the diagonal matrix + * @param[in, out] A - Sparse CSR matrix + * + * @pre The diagonal vector must be of the same size as the number of rows in the matrix. + * @pre A is unscaled and allocated + * @post A is scaled + * @invariant diag + * + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerCpu::leftDiagonalScale(vector_type* diag, matrix::Csr* A) + { + real_type* diag_data = diag->getData(memory::HOST); + index_type* rowPtrA = A->getRowData(memory::HOST); + real_type* valuesA = A->getValues( memory::HOST); + + for (index_type i = 0; i < A->getNumRows(); ++i) { + for (index_type j = rowPtrA[i]; j < rowPtrA[i+1]; ++j) { + valuesA[j] *= diag_data[i]; + } + } + return 0; + } + + /** + * @brief Right diagonal scaling of a sparse CSR matrix + * + * @param[in] A - Sparse CSR matrix + * @param[in] diag - vector representing the diagonal matrix + * + * @pre The diagonal vector must be of the same size as the number of columns in the matrix. + * @pre A is unscaled + * @post A is scaled + * @invariant diag + * + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerCpu::rightDiagonalScale(matrix::Csr* A, vector_type* diag) + { + real_type* diag_data = diag->getData(memory::HOST); + index_type* rowPtrA = A->getRowData(memory::HOST); + index_type* colIdxA = A->getColData(memory::HOST); + real_type* valuesA = A->getValues( memory::HOST); + + for (index_type i = 0; i < A->getNumRows(); ++i) { + for (index_type j = rowPtrA[i]; j < rowPtrA[i+1]; ++j) { + valuesA[j] *= diag_data[colIdxA[j]]; + } + } + } /** * @brief Add a constant to all nonzero values in the matrix * diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp index 1509a3528..f89f363e4 100644 --- a/resolve/matrix/MatrixHandlerCpu.hpp +++ b/resolve/matrix/MatrixHandlerCpu.hpp @@ -39,6 +39,10 @@ namespace ReSolve { int transpose(matrix::Csr* A, matrix::Csr* At) override; + int leftDiagonalScale(vector_type* diag, matrix::Csr* A) override; + + int rightDiagonalScale(matrix::Csr* A, vector_type* diag) override; + int addConst(matrix::Sparse* A, real_type alpha) override; virtual int matvec(matrix::Sparse* A, diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp index d698fb080..775c557e2 100644 --- a/resolve/matrix/MatrixHandlerCuda.cpp +++ b/resolve/matrix/MatrixHandlerCuda.cpp @@ -325,6 +325,54 @@ namespace ReSolve { return error_sum; } + /** + * @brief Left diagonal scaling of a sparse CSR matrix in CUDA + * + * @param[in] diag - vector representing the diagonal matrix + * @param[in, out] A - Sparse CSR matrix + * + * @pre The diagonal vector must be of the same size as the number of rows in the matrix. + * @pre A is unscaled and allocated + * @post A is scaled + * @invariant diag + * + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerCuda::leftDiagonalScale(vector_type* diag, matrix::Csr* A) + { + real_type* diag_data = diag->getData(memory::DEVICE); + index_type* a_row_ptr = A->getRowData(memory::DEVICE); + real_type* a_vals = A->getValues( memory::DEVICE); + index_type n = A->getNumRows(); + cudaLeftDiagScale(n, a_row_ptr, a_vals, diag_data); + return 0; + } + + /** + * @brief Right diagonal scaling of a sparse CSR matrix in CUDA + * + * @param[in] A - Sparse CSR matrix + * @param[in] diag - vector representing the diagonal matrix + * + * @pre The diagonal vector must be of the same size as the number of columns in the matrix. + * @pre A is unscaled + * @post A is scaled + * @invariant diag + * + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerCuda::rightDiagonalScale(matrix::Csr* A, vector_type* diag) + { + real_type* diag_data = diag->getData(memory::DEVICE); + index_type* a_row_ptr = A->getRowData(memory::DEVICE); + index_type* a_col_idx = A->getColData(memory::DEVICE); + real_type* a_vals = A->getValues( memory::DEVICE); + index_type n = A->getNumRows(); + cudaRightDiagScale(n, a_row_ptr, a_col_idx, a_vals, diag_data); + return 0; + } + + /** * @brief Add a constant to all nonzero values in the matrix * diff --git a/resolve/matrix/MatrixHandlerCuda.hpp b/resolve/matrix/MatrixHandlerCuda.hpp index 06bafea3a..f338de709 100644 --- a/resolve/matrix/MatrixHandlerCuda.hpp +++ b/resolve/matrix/MatrixHandlerCuda.hpp @@ -37,6 +37,8 @@ namespace ReSolve { int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) override; int transpose(matrix::Csr* A, matrix::Csr* At) override; int addConst(matrix::Sparse* A, real_type alpha) override; + int leftDiagonalScale(vector_type* diag, matrix::Csr* A) override; + int rightDiagonalScale(matrix::Csr* A, vector_type* diag) override; virtual int matvec(matrix::Sparse* A, vector_type* vec_x, vector_type* vec_result, diff --git a/resolve/matrix/MatrixHandlerHip.hpp b/resolve/matrix/MatrixHandlerHip.hpp index 7e43d466f..991d5dfa1 100644 --- a/resolve/matrix/MatrixHandlerHip.hpp +++ b/resolve/matrix/MatrixHandlerHip.hpp @@ -38,6 +38,8 @@ namespace ReSolve { int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) override; int transpose(matrix::Csr* A, matrix::Csr* At) override; + int leftDiagonalScale(vector_type* diag, matrix::Csr* A) override; + int rightDiagonalScale(matrix::Csr* A, vector_type* diag) override; int addConst(matrix::Sparse* A, real_type alpha) override; virtual int matvec(matrix::Sparse* A, diff --git a/resolve/matrix/MatrixHandlerImpl.hpp b/resolve/matrix/MatrixHandlerImpl.hpp index 2879a620a..2937e4bc8 100644 --- a/resolve/matrix/MatrixHandlerImpl.hpp +++ b/resolve/matrix/MatrixHandlerImpl.hpp @@ -36,6 +36,10 @@ namespace ReSolve { virtual int transpose(matrix::Csr* A, matrix::Csr* At) = 0; + virtual int leftDiagonalScale(vector_type* diag, matrix::Csr* A) = 0; + + virtual int rightDiagonalScale(matrix::Csr* A, vector_type* diag) = 0; + virtual int addConst(matrix::Sparse* A, real_type alpha) = 0; virtual int matvec(matrix::Sparse* A, diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp index bae6d197b..b88adbc20 100644 --- a/resolve/vector/Vector.cpp +++ b/resolve/vector/Vector.cpp @@ -32,9 +32,9 @@ namespace ReSolve { namespace vector { { } - /** + /** * @brief destructor. - * + * */ Vector::~Vector() { @@ -80,13 +80,13 @@ namespace ReSolve { namespace vector { return k_; } - /** + /** * @brief set the vector data variable (HOST or DEVICE) to the provided pointer. - * + * * @param[in] data - Pointer to data * @param[in] memspace - Memory space (HOST or DEVICE) - * - * @warning This function DOES NOT ALLOCATE any data, it only assigns the pointer. + * + * @warning This function DOES NOT ALLOCATE any data, it only assigns the pointer. */ int Vector::setData(real_type* data, memory::MemorySpace memspace) { @@ -118,15 +118,15 @@ namespace ReSolve { namespace vector { return 0; } - /** + /** * @brief set the flag to indicate that the data (HOST or DEVICE) has been updated. - * - * Important because of data mirroring approach. - * - * @param[in] memspace - Memory space (HOST or DEVICE) + * + * Important because of data mirroring approach. + * + * @param[in] memspace - Memory space (HOST or DEVICE) */ void Vector::setDataUpdated(memory::MemorySpace memspace) - { + { using namespace ReSolve::memory; switch (memspace) { case HOST: @@ -140,12 +140,12 @@ namespace ReSolve { namespace vector { } } - /** - * @brief Copy data from another vector. - * + /** + * @brief Copy data from another vector. + * * @param[in] v - Vector, which data will be copied - * @param[in] memspaceIn - Memory space of the incoming data (HOST or DEVICE) - * @param[in] memspaceOut - Memory space the data will be copied to (HOST or DEVICE) + * @param[in] memspaceIn - Memory space of the incoming data (HOST or DEVICE) + * @param[in] memspaceOut - Memory space the data will be copied to (HOST or DEVICE) * * @pre size of _v_ is equal or larger than the current vector size. */ @@ -155,14 +155,14 @@ namespace ReSolve { namespace vector { return copyDataFrom(data, memspaceIn, memspaceOut); } - /** - * @brief Copy vector data from input array. - * - * This function allocates (if necessary) and copies the data. - * + /** + * @brief Copy vector data from input array. + * + * This function allocates (if necessary) and copies the data. + * * @param[in] data - Data that is to be copied - * @param[in] memspaceIn - Memory space of the incoming data (HOST or DEVICE) - * @param[in] memspaceOut - Memory space the data will be copied to (HOST or DEVICE) + * @param[in] memspaceIn - Memory space of the incoming data (HOST or DEVICE) + * @param[in] memspaceOut - Memory space the data will be copied to (HOST or DEVICE) * * @return 0 if successful, -1 otherwise. */ @@ -183,7 +183,7 @@ namespace ReSolve { namespace vector { //allocate first mem_.allocateArrayOnDevice(&d_data_, n_capacity_ * k_); owns_gpu_data_ = true; - } + } switch(control) { case 0: //cpu->cpu @@ -212,33 +212,33 @@ namespace ReSolve { namespace vector { return 0; } - /** + /** * @brief get a pointer to HOST or DEVICE vector data. - * - * @param[in] memspace - Memory space of the pointer (HOST or DEVICE) + * + * @param[in] memspace - Memory space of the pointer (HOST or DEVICE) * * @return pointer to the vector data (HOST or DEVICE). In case of multivectors, vectors are stored column-wise. - * + * * @note This function gives you access to the pointer, not to a copy. - * If you change the values using the pointer, the vector values will change too. + * If you change the values using the pointer, the vector values will change too. */ real_type* Vector::getData(memory::MemorySpace memspace) { return getData(0, memspace); } - /** + /** * @brief get a pointer to HOST or DEVICE data of a particular vector in a multivector. - * - * @param[in] i - Index of a vector in multivector - * @param[in] memspace - Memory space of the pointer (HOST or DEVICE) + * + * @param[in] i - Index of a vector in multivector + * @param[in] memspace - Memory space of the pointer (HOST or DEVICE) * * @return pointer to the _i_th vector data (HOST or DEVICE) within a multivector. - * + * * @pre _i_ < _k_ i.e,, _i_ is smaller than the total number of vectors in multivector. - * - * @note This function gives you access to the pointer, not to a copy. - * If you change the values using the pointer, the vector values will change too. + * + * @note This function gives you access to the pointer, not to a copy. + * If you change the values using the pointer, the vector values will change too. */ real_type* Vector::getData(index_type i, memory::MemorySpace memspace) { @@ -314,13 +314,13 @@ namespace ReSolve { namespace vector { return 0; } - /** - * @brief allocate vector data for HOST or DEVICE - * - * @param[in] memspace - Memory space of the data to be allocated + /** + * @brief allocate vector data for HOST or DEVICE + * + * @param[in] memspace - Memory space of the data to be allocated * */ - void Vector::allocate(memory::MemorySpace memspace) + void Vector::allocate(memory::MemorySpace memspace) { using namespace ReSolve::memory; switch (memspace) { @@ -337,13 +337,13 @@ namespace ReSolve { namespace vector { } } - /** + /** * @brief set vector data to zero. In case of multivectors, entire multivector is set to zero. - * + * * @param[in] memspace - Memory space of the data to be set to 0 (HOST or DEVICE) * */ - void Vector::setToZero(memory::MemorySpace memspace) + void Vector::setToZero(memory::MemorySpace memspace) { using namespace ReSolve::memory; switch (memspace) { @@ -364,15 +364,15 @@ namespace ReSolve { namespace vector { } } - /** + /** * @brief set the data of a single vector in a multivector to zero. - * + * * @param[in] i - Index of a vector in a multivector * @param[in] memspace - Memory space of the data to be set to 0 (HOST or DEVICE) * * @pre _i_ < _k_ i.e,, _i_ is smaller than the total number of vectors in multivector. */ - void Vector::setToZero(index_type j, memory::MemorySpace memspace) + void Vector::setToZero(index_type j, memory::MemorySpace memspace) { using namespace ReSolve::memory; switch (memspace) { @@ -394,16 +394,16 @@ namespace ReSolve { namespace vector { } } - /** + /** * @brief set vector data to a given constant. - * + * * In case of multivectors, entire multivector is set to the constant. - * + * * @param[in] C - Constant (real number) * @param[in] memspace - Memory space of the data to be set to 0 (HOST or DEVICE) * */ - void Vector::setToConst(real_type C, memory::MemorySpace memspace) + void Vector::setToConst(real_type C, memory::MemorySpace memspace) { using namespace ReSolve::memory; switch (memspace) { @@ -424,16 +424,16 @@ namespace ReSolve { namespace vector { } } - /** + /** * @brief set the data of a single vector in a multivector to a given constant. - * + * * @param[in] j - Index of a vector in a multivector * @param[in] C - Constant (real number) * @param[in] memspace - Memory space of the data to be set to 0 (HOST or DEVICE) * * @pre _j_ < _k_ i.e,, _j_ is smaller than the total number of vectors in multivector. */ - void Vector::setToConst(index_type j, real_type C, memory::MemorySpace memspace) + void Vector::setToConst(index_type j, real_type C, memory::MemorySpace memspace) { using namespace ReSolve::memory; switch (memspace) { @@ -458,14 +458,14 @@ namespace ReSolve { namespace vector { * @brief Get a pointer to HOST or DEVICE data of a specified vector in a multivector. * * @param[in] i - Index of a vector in a multivector - * @param[in] memspace - Memory space of the pointer (HOST or DEVICE) + * @param[in] memspace - Memory space of the pointer (HOST or DEVICE) * * @return A pointer to the `i`th vector data (HOST or DEVICE). * * @pre `i` < `k_`, i.e. `i` is smaller than the total number of vectors in multivector. * * @note This function gives you access to the pointer, not to a copy. - * If you change the values using the pointer, the vector values will change too. + * If you change the values using the pointer, the vector values will change too. */ real_type* Vector::getVectorData(index_type i, memory::MemorySpace memspace) { @@ -506,10 +506,10 @@ namespace ReSolve { namespace vector { } } - /** + /** * @brief copy HOST or DEVICE data of a specified vector in a multivector - * to _dest_. - * + * to _dest_. + * * @param[out] dest - Pointer to the memory to which data is copied * @param[in] i - Index of a vector in a multivector * @param[in] memspace - Memory space (HOST or DEVICE) to copy from and to @@ -542,13 +542,13 @@ namespace ReSolve { namespace vector { } return 0; } - } + } - /** + /** * @brief copy HOST or DEVICE vector data to _dest_. - * - * In case of multivector, all data (size _k_ * _n_) is copied. - * + * + * In case of multivector, all data (size _k_ * _n_) is copied. + * * @param[out] dest - Pointer to the memory to which data is copied * @param[in] memspace - Memory space (HOST or DEVICE) to copy from * From 95cbcfe51383715e7959800dc16deed11a42c4ba Mon Sep 17 00:00:00 2001 From: shakedregev Date: Wed, 21 May 2025 20:51:14 +0000 Subject: [PATCH 2/8] created vector, need to add new verify function --- tests/unit/matrix/MatrixHandlerTests.hpp | 35 ++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp index 2f8aad032..b550817d5 100644 --- a/tests/unit/matrix/MatrixHandlerTests.hpp +++ b/tests/unit/matrix/MatrixHandlerTests.hpp @@ -150,6 +150,19 @@ class MatrixHandlerTests : TestBase return status.report(testname.c_str()); } + TestOutcome leftDiagScale(index_type n, index_type m) + { + TestStatus status; + std::string testname(__func__); + matrix::Csr* A = createRectangularCsrMatrix(n, m); + vector::Vector* diag = createIncrementingVector(n); + handler_.leftDiagonalScale(diag, A, memspace_); + status *= verifyCsrMatrix(A, 2.0); + delete A; + delete diag; + return status.report(testname.c_str()); + } + private: ReSolve::MatrixHandler& handler_; memory::MemorySpace memspace_{memory::HOST}; @@ -481,6 +494,28 @@ class MatrixHandlerTests : TestBase return A; } + + /** + * @brief create a vector with increasing values, starting with 1.0 + * + * The values are set to 1.0, 2.0, ..., n + * + * @param[in] n number of elements + */ + vector::Vector* createIncrementingVector(const index_type n) + { + vector::Vector* vec = new vector::Vector(n); + vec->allocate(memory::HOST); + real_type* data = vec->getData(memory::HOST); + for (index_type i = 0; i < n; ++i) { + data[i] = static_cast(i + 1.0); + } + vec->setDataUpdated(memory::HOST); + if (memspace_ == memory::DEVICE) { + vec->syncData(memspace_); + } + return vec; + } }; // class MatrixHandlerTests }} // namespace ReSolve::tests From 6e62c5a57254887ef6a7811d109a65abb59019ee Mon Sep 17 00:00:00 2001 From: shakedregev Date: Wed, 21 May 2025 21:34:18 +0000 Subject: [PATCH 3/8] cpu working, cuda not --- tests/unit/matrix/MatrixHandlerTests.hpp | 71 ++++++++++++++++++++- tests/unit/matrix/runMatrixHandlerTests.cpp | 1 + 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp index b550817d5..b96a9ae0f 100644 --- a/tests/unit/matrix/MatrixHandlerTests.hpp +++ b/tests/unit/matrix/MatrixHandlerTests.hpp @@ -157,7 +157,7 @@ class MatrixHandlerTests : TestBase matrix::Csr* A = createRectangularCsrMatrix(n, m); vector::Vector* diag = createIncrementingVector(n); handler_.leftDiagonalScale(diag, A, memspace_); - status *= verifyCsrMatrix(A, 2.0); + status *= verifyLeftScaledCsrMatrix(A); delete A; delete diag; return status.report(testname.c_str()); @@ -516,6 +516,75 @@ class MatrixHandlerTests : TestBase } return vec; } + + /** + * @brief Verify structure of a CSR matrix with preset pattern that is left-scaled + * + * The sparsity structure corresponds to the CSR representation of a rectangular matrix + * created by createRectangularCsrMatrix, left-scaled by a diagonal matrix + * represented by a vector with values 1.0, 2.0, ..., n + * + * @pre A is a valid, allocated CSR matrix + * @invariant A + */ + bool verifyLeftScaledCsrMatrix(matrix::Csr* A) + { + // Check if the matrix is valid + if (A == nullptr) { + return false; + } + + // Get matrix dimensions + const index_type n = A->getNumRows(); + const index_type m = A->getNumColumns(); + + // Create the original unscaled matrix to compare against + matrix::Csr* originalA = createRectangularCsrMatrix(n, m); + + // Get data from both matrices + index_type* origRowptr = originalA->getRowData(memory::HOST); + index_type* origColidx = originalA->getColData(memory::HOST); + real_type* origVal = originalA->getValues(memory::HOST); + + index_type* scaledRowptr = A->getRowData(memory::HOST); + index_type* scaledColidx = A->getColData(memory::HOST); + real_type* scaledVal = A->getValues(memory::HOST); + + // Verify that row pointers and column indices are the same + for (index_type i = 0; i <= n; ++i) { + if (origRowptr[i] != scaledRowptr[i]) { + delete originalA; + return false; + } + } + + for (index_type i = 0; i < A->getNnz(); ++i) { + if (origColidx[i] != scaledColidx[i]) { + delete originalA; + return false; + } + } + + // Verify values - each element in row i should be scaled by (i+1.0) + for (index_type i = 0; i < n; ++i) { + real_type rowScale = static_cast(i + 1); + for (index_type j = origRowptr[i]; j < origRowptr[i + 1]; ++j) { + // For integer values, exact comparison is sufficient + if (scaledVal[j] != origVal[j] * rowScale) { + std::cout << "Mismatch at row " << i << ", index " << j + << ": scaledVal = " << scaledVal[j] + << ", expected = " << origVal[j] * rowScale << "\n"; + delete originalA; + return false; + } + } + } + + // Clean up the original matrix + delete originalA; + + return true; + } }; // class MatrixHandlerTests }} // namespace ReSolve::tests diff --git a/tests/unit/matrix/runMatrixHandlerTests.cpp b/tests/unit/matrix/runMatrixHandlerTests.cpp index dcaaed5ca..aa43a6807 100644 --- a/tests/unit/matrix/runMatrixHandlerTests.cpp +++ b/tests/unit/matrix/runMatrixHandlerTests.cpp @@ -45,6 +45,7 @@ void runTests(const std::string& backend, ReSolve::tests::TestingResults& result result += test.transpose(2048, 1024); result += test.transpose(1024, 1200); result += test.transpose(1200, 1024); + result += test.leftDiagScale(2, 2); std::cout << "\n"; } From 9b49a86cec9ea0555510995082452fa65341f5d8 Mon Sep 17 00:00:00 2001 From: shakedregev Date: Thu, 22 May 2025 21:50:36 +0000 Subject: [PATCH 4/8] left diagonal scale test passing on cpu and cuda --- resolve/cuda/cudaKernels.cu | 3 +-- resolve/matrix/MatrixHandlerCuda.cpp | 2 ++ tests/unit/matrix/MatrixHandlerTests.hpp | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/resolve/cuda/cudaKernels.cu b/resolve/cuda/cudaKernels.cu index 1bfa485a5..f522186cf 100644 --- a/resolve/cuda/cudaKernels.cu +++ b/resolve/cuda/cudaKernels.cu @@ -307,9 +307,8 @@ namespace ReSolve { { // Define block size and number of blocks - const int blockSize = 256; + const int blockSize = 1; int numBlocks = (n + blockSize - 1) / blockSize; - // Launch the kernel kernels::leftDiagScale<<>>(n, a_row_ptr, a_val, d_val); } diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp index 775c557e2..85570d042 100644 --- a/resolve/matrix/MatrixHandlerCuda.cpp +++ b/resolve/matrix/MatrixHandlerCuda.cpp @@ -344,7 +344,9 @@ namespace ReSolve { index_type* a_row_ptr = A->getRowData(memory::DEVICE); real_type* a_vals = A->getValues( memory::DEVICE); index_type n = A->getNumRows(); + // check values in A and diag cudaLeftDiagScale(n, a_row_ptr, a_vals, diag_data); + A->setUpdated(memory::DEVICE); return 0; } diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp index b96a9ae0f..390e2b712 100644 --- a/tests/unit/matrix/MatrixHandlerTests.hpp +++ b/tests/unit/matrix/MatrixHandlerTests.hpp @@ -157,6 +157,9 @@ class MatrixHandlerTests : TestBase matrix::Csr* A = createRectangularCsrMatrix(n, m); vector::Vector* diag = createIncrementingVector(n); handler_.leftDiagonalScale(diag, A, memspace_); + if (memspace_ == memory::DEVICE) { + A->syncData(memory::HOST); + } status *= verifyLeftScaledCsrMatrix(A); delete A; delete diag; From 694a59a00929028e053f0929d9c167d458974091 Mon Sep 17 00:00:00 2001 From: shakedregev Date: Fri, 23 May 2025 10:44:32 -0400 Subject: [PATCH 5/8] tests pass on hip --- resolve/hip/hipKernels.h | 10 +-- resolve/hip/hipKernels.hip | 116 ++++++++++++++++++++++++++++ resolve/matrix/MatrixHandlerHip.cpp | 50 ++++++++++++ 3 files changed, 171 insertions(+), 5 deletions(-) diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h index 9ad576933..166d93c2e 100644 --- a/resolve/hip/hipKernels.h +++ b/resolve/hip/hipKernels.h @@ -23,15 +23,15 @@ namespace ReSolve { void mass_axpy(index_type n, index_type i, real_type* x, real_type* y, real_type* alpha); void hipLeftDiagScale(index_type n, - index_type* a_row_ptr, + const index_type* a_row_ptr, real_type* a_val, - real_type* diag); + const real_type* diag); void hipRightDiagScale(index_type n, - index_type* a_row_ptr, - index_type* a_col_idx, + const index_type* a_row_ptr, + const index_type* a_col_idx, real_type* a_val, - real_type* diag); + const real_type* diag); //needed for matrix inf nrm void matrix_row_sums(index_type n, diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip index bc9d4b331..3cb94992b 100644 --- a/resolve/hip/hipKernels.hip +++ b/resolve/hip/hipKernels.hip @@ -300,6 +300,73 @@ namespace ReSolve { } } + /** + * @brief Scales a csr matrix on the left by a diagonal matrix + * + * @param[in] n - number of rows in the matrix + * @param[in] a_row_ptr - row pointers (CSR storage) + * @param[in, out] a_val - values (CSR storage). Changes in place. + * @param[in] d_val - diagonal values + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + __global__ void leftDiagScale(index_type n, + const index_type* a_row_ptr, + real_type* a_val, + const real_type* d_val) + { + // Get row index from thread and block indices + index_type row = blockIdx.x * blockDim.x + threadIdx.x; + + // Check if the thread's row is within matrix bounds + if (row < n) { + // Get the start and end positions for this row in the CSR format + index_type row_start = a_row_ptr[row]; + index_type row_end = a_row_ptr[row + 1]; + + // Get the scaling factor for this row from the diagonal matrix + real_type scale = d_val[row]; + + // Scale all non-zero elements in this row + for (index_type i = row_start; i < row_end; i++) { + a_val[i] *= scale; + } + } + } + + /** + * @brief Scales a csr matrix on the right by a diagonal matrix + * + * @param[in] n - number of rows in the matrix + * @param[in] a_row_ptr - row pointers (CSR storage) + * @param[in] a_col_ind - column indices (CSR storage) + * @param[in, out] a_val - values (CSR storage). Changes in place. + * @param[in] d_val - diagonal values + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + __global__ void rightDiagScale(index_type n, + const index_type* a_row_ptr, + const index_type* a_col_ind, + real_type* a_val, + const real_type* d_val) + { + // Get row index from thread and block indices + index_type row = blockIdx.x * blockDim.x + threadIdx.x; + + // Check if the thread's row is within matrix bounds + if (row < n) { + // Get the start and end positions for this row in the CSR format + index_type row_start = a_row_ptr[row]; + index_type row_end = a_row_ptr[row + 1]; + + // Scale all non-zero elements in this row + for (index_type i = row_start; i < row_end; i++) { + a_val[i] *= d_val[a_col_ind[i]]; + } + } + } + } // namespace kernels @@ -386,6 +453,8 @@ namespace ReSolve { hipLaunchKernelGGL(kernels::matrixInfNormPart1, dim3(1000), dim3(1024), 0, 0, n, nnz, a_ia, a_val, result); } + + /** * @brief * @@ -455,5 +524,52 @@ namespace ReSolve { vec_out); } + /** + * @brief Wrapper that scales a csr matrix on the left by a diagonal matrix + * + * @param[in] n - number of rows in the matrix + * @param[in] a_row_ptr - row pointers (CSR storage) + * @param[in, out] a_val - values (CSR storage). Changes in place. + * @param[in] d_val - diagonal values + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void hipLeftDiagScale(index_type n, + const index_type* a_row_ptr, + real_type* a_val, + const real_type* d_val) + { + + // Define block size and number of blocks + const int blockSize = 1; + int numBlocks = (n + blockSize - 1) / blockSize; + // Launch the kernel + kernels::leftDiagScale<<>>(n, a_row_ptr, a_val, d_val); + } + + /** + * @brief Wrapper that scales a csr matrix on the right by a diagonal matrix + * + * @param[in] n - number of rows in the matrix + * @param[in] a_row_ptr - row pointers (CSR storage) + * @param[in] a_col_ind - column indices (CSR storage) + * @param[in, out] a_val - values (CSR storage). Changes in place. + * @param[in] d_val - diagonal values + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void hipRightDiagScale(index_type n, + const index_type* a_row_ptr, + const index_type* a_col_ind, + real_type* a_val, + const real_type* d_val) + { + // Define block size and number of blocks + const int blockSize = 256; + int numBlocks = (n + blockSize - 1) / blockSize; + // Launch the kernel + kernels::rightDiagScale<<>>(n, a_row_ptr, a_col_ind, a_val, d_val); + } + } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp index 5d48f7a59..205ac3d2d 100644 --- a/resolve/matrix/MatrixHandlerHip.cpp +++ b/resolve/matrix/MatrixHandlerHip.cpp @@ -307,4 +307,54 @@ namespace ReSolve { return 0; } + /** + * @brief Left diagonal scaling of a sparse CSR matrix in HIP + * + * @param[in] diag - vector representing the diagonal matrix + * @param[in, out] A - Sparse CSR matrix + * + * @pre The diagonal vector must be of the same size as the number of rows in the matrix. + * @pre A is unscaled and allocated + * @post A is scaled + * @invariant diag + * + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerHip::leftDiagonalScale(vector_type* diag, matrix::Csr* A) + { + real_type* diag_data = diag->getData(memory::DEVICE); + index_type* a_row_ptr = A->getRowData(memory::DEVICE); + real_type* a_vals = A->getValues( memory::DEVICE); + index_type n = A->getNumRows(); + // check values in A and diag + hipLeftDiagScale(n, a_row_ptr, a_vals, diag_data); + A->setUpdated(memory::DEVICE); + return 0; + } + + /** + * @brief Right diagonal scaling of a sparse CSR matrix in HIP + * + * @param[in] A - Sparse CSR matrix + * @param[in] diag - vector representing the diagonal matrix + * + * @pre The diagonal vector must be of the same size as the number of columns in the matrix. + * @pre A is unscaled + * @post A is scaled + * @invariant diag + * + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerHip::rightDiagonalScale(matrix::Csr* A, vector_type* diag) + { + real_type* diag_data = diag->getData(memory::DEVICE); + index_type* a_row_ptr = A->getRowData(memory::DEVICE); + index_type* a_col_idx = A->getColData(memory::DEVICE); + real_type* a_vals = A->getValues( memory::DEVICE); + index_type n = A->getNumRows(); + hipRightDiagScale(n, a_row_ptr, a_col_idx, a_vals, diag_data); + A->setUpdated(memory::DEVICE); + return 0; + } + } // namespace ReSolve From eaa87b744a81066b63817fd508d5a2a689ea10e2 Mon Sep 17 00:00:00 2001 From: shakedregev Date: Fri, 23 May 2025 11:04:50 -0400 Subject: [PATCH 6/8] fixed typo --- resolve/vector/Vector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp index b88adbc20..07aaed872 100644 --- a/resolve/vector/Vector.cpp +++ b/resolve/vector/Vector.cpp @@ -498,7 +498,7 @@ namespace ReSolve { namespace vector { if (new_n_size > n_capacity_) { out::error() << "Trying to resize vector to " << new_n_size - << " elements but memory allocated only for " << n_capacity_ << elements. << "\n"; + << " elements but memory allocated only for " << n_capacity_ << "elements." << "\n"; return 1; } else { n_size_ = new_n_size; From 490a836ec3f0834737df732eeddc1a13fdee4391 Mon Sep 17 00:00:00 2001 From: shakedregev Date: Fri, 23 May 2025 12:54:32 -0400 Subject: [PATCH 7/8] removed cuda and hip from kernel names --- resolve/cuda/cudaKernels.cu | 2 +- resolve/cuda/cudaKernels.h | 2 +- resolve/hip/hipKernels.h | 2 +- resolve/hip/hipKernels.hip | 2 +- resolve/matrix/MatrixHandlerCuda.cpp | 2 +- resolve/matrix/MatrixHandlerHip.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/resolve/cuda/cudaKernels.cu b/resolve/cuda/cudaKernels.cu index f522186cf..1b7a293eb 100644 --- a/resolve/cuda/cudaKernels.cu +++ b/resolve/cuda/cudaKernels.cu @@ -300,7 +300,7 @@ namespace ReSolve { * * @todo Decide how to allow user to configure grid and block sizes. */ - void cudaLeftDiagScale(index_type n, + void LeftDiagScale(index_type n, const index_type* a_row_ptr, real_type* a_val, const real_type* d_val) diff --git a/resolve/cuda/cudaKernels.h b/resolve/cuda/cudaKernels.h index f33153607..8c3271080 100644 --- a/resolve/cuda/cudaKernels.h +++ b/resolve/cuda/cudaKernels.h @@ -22,7 +22,7 @@ namespace ReSolve { void mass_axpy(index_type n, index_type i, const real_type* x, real_type* y, const real_type* alpha); - void cudaLeftDiagScale(index_type n, + void LeftDiagScale(index_type n, const index_type* a_row_ptr, real_type* a_val, const real_type* diag); diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h index 166d93c2e..bd6e40702 100644 --- a/resolve/hip/hipKernels.h +++ b/resolve/hip/hipKernels.h @@ -22,7 +22,7 @@ namespace ReSolve { real_type* result); void mass_axpy(index_type n, index_type i, real_type* x, real_type* y, real_type* alpha); - void hipLeftDiagScale(index_type n, + void LeftDiagScale(index_type n, const index_type* a_row_ptr, real_type* a_val, const real_type* diag); diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip index 3cb94992b..82fc6e698 100644 --- a/resolve/hip/hipKernels.hip +++ b/resolve/hip/hipKernels.hip @@ -534,7 +534,7 @@ namespace ReSolve { * * @todo Decide how to allow user to configure grid and block sizes. */ - void hipLeftDiagScale(index_type n, + void LeftDiagScale(index_type n, const index_type* a_row_ptr, real_type* a_val, const real_type* d_val) diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp index 85570d042..efb5e5c23 100644 --- a/resolve/matrix/MatrixHandlerCuda.cpp +++ b/resolve/matrix/MatrixHandlerCuda.cpp @@ -345,7 +345,7 @@ namespace ReSolve { real_type* a_vals = A->getValues( memory::DEVICE); index_type n = A->getNumRows(); // check values in A and diag - cudaLeftDiagScale(n, a_row_ptr, a_vals, diag_data); + LeftDiagScale(n, a_row_ptr, a_vals, diag_data); A->setUpdated(memory::DEVICE); return 0; } diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp index 205ac3d2d..cdf0a486b 100644 --- a/resolve/matrix/MatrixHandlerHip.cpp +++ b/resolve/matrix/MatrixHandlerHip.cpp @@ -327,7 +327,7 @@ namespace ReSolve { real_type* a_vals = A->getValues( memory::DEVICE); index_type n = A->getNumRows(); // check values in A and diag - hipLeftDiagScale(n, a_row_ptr, a_vals, diag_data); + LeftDiagScale(n, a_row_ptr, a_vals, diag_data); A->setUpdated(memory::DEVICE); return 0; } From 8e9b7955ed845a14df6b755459e17983efb07ea4 Mon Sep 17 00:00:00 2001 From: shakedregev Date: Fri, 23 May 2025 13:37:18 -0400 Subject: [PATCH 8/8] fixed typo --- resolve/cuda/cudaKernels.cu | 2 +- resolve/cuda/cudaKernels.h | 2 +- resolve/hip/hipKernels.h | 2 +- resolve/hip/hipKernels.hip | 2 +- resolve/matrix/MatrixHandlerCuda.cpp | 2 +- resolve/matrix/MatrixHandlerHip.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/resolve/cuda/cudaKernels.cu b/resolve/cuda/cudaKernels.cu index 1b7a293eb..f660c1ed8 100644 --- a/resolve/cuda/cudaKernels.cu +++ b/resolve/cuda/cudaKernels.cu @@ -300,7 +300,7 @@ namespace ReSolve { * * @todo Decide how to allow user to configure grid and block sizes. */ - void LeftDiagScale(index_type n, + void leftDiagScale(index_type n, const index_type* a_row_ptr, real_type* a_val, const real_type* d_val) diff --git a/resolve/cuda/cudaKernels.h b/resolve/cuda/cudaKernels.h index 8c3271080..a8c132433 100644 --- a/resolve/cuda/cudaKernels.h +++ b/resolve/cuda/cudaKernels.h @@ -22,7 +22,7 @@ namespace ReSolve { void mass_axpy(index_type n, index_type i, const real_type* x, real_type* y, const real_type* alpha); - void LeftDiagScale(index_type n, + void leftDiagScale(index_type n, const index_type* a_row_ptr, real_type* a_val, const real_type* diag); diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h index bd6e40702..84d7d104b 100644 --- a/resolve/hip/hipKernels.h +++ b/resolve/hip/hipKernels.h @@ -22,7 +22,7 @@ namespace ReSolve { real_type* result); void mass_axpy(index_type n, index_type i, real_type* x, real_type* y, real_type* alpha); - void LeftDiagScale(index_type n, + void leftDiagScale(index_type n, const index_type* a_row_ptr, real_type* a_val, const real_type* diag); diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip index 82fc6e698..798ef6ca8 100644 --- a/resolve/hip/hipKernels.hip +++ b/resolve/hip/hipKernels.hip @@ -534,7 +534,7 @@ namespace ReSolve { * * @todo Decide how to allow user to configure grid and block sizes. */ - void LeftDiagScale(index_type n, + void leftDiagScale(index_type n, const index_type* a_row_ptr, real_type* a_val, const real_type* d_val) diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp index efb5e5c23..cc5a437ea 100644 --- a/resolve/matrix/MatrixHandlerCuda.cpp +++ b/resolve/matrix/MatrixHandlerCuda.cpp @@ -345,7 +345,7 @@ namespace ReSolve { real_type* a_vals = A->getValues( memory::DEVICE); index_type n = A->getNumRows(); // check values in A and diag - LeftDiagScale(n, a_row_ptr, a_vals, diag_data); + leftDiagScale(n, a_row_ptr, a_vals, diag_data); A->setUpdated(memory::DEVICE); return 0; } diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp index cdf0a486b..b4a1cd18d 100644 --- a/resolve/matrix/MatrixHandlerHip.cpp +++ b/resolve/matrix/MatrixHandlerHip.cpp @@ -327,7 +327,7 @@ namespace ReSolve { real_type* a_vals = A->getValues( memory::DEVICE); index_type n = A->getNumRows(); // check values in A and diag - LeftDiagScale(n, a_row_ptr, a_vals, diag_data); + leftDiagScale(n, a_row_ptr, a_vals, diag_data); A->setUpdated(memory::DEVICE); return 0; }