ORNL · shakedregev · May 21, 2025 · May 21, 2025 · May 21, 2025 · May 22, 2025
diff --git a/resolve/cuda/cudaKernels.cu b/resolve/cuda/cudaKernels.cu
@@ -1,10 +1,10 @@
 /**
  * @file cudaKernels.cu
  * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov)
- * @brief 
+ * @brief
  * @date 2023-12-08
- * 
- * 
+ *
+ *
  */
 
 #include "cudaKernels.h"
@@ -17,22 +17,22 @@ namespace ReSolve {
     /**
      * @brief Computes v^T * [u1 u2] where v is n x k multivector
      * and u1 and u2 are n x 1 vectors.
-     * 
+     *
      * @tparam Tv5 - Size of shared memory
-     *  
+     *
      * @param[in] u1      - (n x 1) vector
      * @param[in] u2      - (n x 1) vector
      * @param[in] v       - (n x k) multivector
      * @param[out] result - (k x 2) multivector
      * @param[in] k       - dimension of the subspace
      * @param[in] N       - size of vectors u1, u2
      */
-    template <size_t Tv5 = 1024> 
-    __global__ void MassIPTwoVec(const real_type* __restrict__ u1, 
-                                 const real_type* __restrict__ u2, 
-                                 const real_type* __restrict__ v, 
+    template <size_t Tv5 = 1024>
+    __global__ void MassIPTwoVec(const real_type* __restrict__ u1,
+                                 const real_type* __restrict__ u2,
+                                 const real_type* __restrict__ v,
                                  real_type* result,
-                                 const index_type k, 
+                                 const index_type k,
                                  const index_type N)
     {
       index_type t = threadIdx.x;
@@ -119,16 +119,16 @@ namespace ReSolve {
 
     /**
      * @brief AXPY y = y - x*alpha where alpha is [k x 1], and x is [N x k] needed in 1 and 2 synch GMRES
-     * 
-     * @tparam Tmaxk 
-     * 
+     *
+     * @tparam Tmaxk
+     *
      * @param[in]  N      - number of rows in x
      * @param[in]  k      - number of columns in x
      * @param[in]  x_data - double array, size [N x k]
      * @param[out] y_data - double array, size [N x 1]
      * @param[in]  alpha  - doble array, size [k x 1]
      */
-    template <size_t Tmaxk = 1024> 
+    template <size_t Tmaxk = 1024>
     __global__ void massAxpy3(index_type N,
                               index_type k,
                               const real_type* x_data,
@@ -155,17 +155,17 @@ namespace ReSolve {
 
     /**
      * @brief Pass through matrix rows and sum each as \sum_{j=1}^m abs(a_{ij})
-     * 
+     *
      * @param[in]  n      - number of rows in the matrix.
      * @param[in]  nnz    - number of non-zero values in the matrix
      * @param[in]  a_ia   - row pointers (CSR storage)
      * @param[in]  a_val  - values (CSR storage)
      * @param[out] result - array size [n x 1] containing sums of values in each row.
      */
-    __global__ void matrixInfNormPart1(const index_type n, 
-                                       const index_type nnz, 
+    __global__ void matrixInfNormPart1(const index_type n,
+                                       const index_type nnz,
                                        const index_type* a_ia,
-                                       const real_type* a_val, 
+                                       const real_type* a_val,
                                        real_type* result)
     {
       index_type idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -178,6 +178,71 @@ namespace ReSolve {
         idx += (blockDim.x * gridDim.x);
       }
     }
+    /**
+     * @brief Scales a csr matrix on the left by a diagonal matrix
+     *
+     * @param[in]  n      - number of rows in the matrix
+     * @param[in]  a_row_ptr - row pointers (CSR storage)
+     * @param[in, out]  a_val    - values (CSR storage). Changes in place.
+     * @param[in]  d_val    - diagonal values
+     *
+     * @todo Decide how to allow user to configure grid and block sizes.
+     */
+    __global__ void leftDiagScale(index_type n,
+                      const index_type* a_row_ptr,
+                      real_type* a_val,
+                      const real_type* d_val)
+    {
+      // Get row index from thread and block indices
+      index_type row = blockIdx.x * blockDim.x + threadIdx.x;
+
+      // Check if the thread's row is within matrix bounds
+      if (row < n) {
+        // Get the start and end positions for this row in the CSR format
+        index_type row_start = a_row_ptr[row];
+        index_type row_end = a_row_ptr[row + 1];
+
+        // Get the scaling factor for this row from the diagonal matrix
+        real_type scale = d_val[row];
+
+        // Scale all non-zero elements in this row
+        for (index_type i = row_start; i < row_end; i++) {
+          a_val[i] *= scale;
+        }
+      }
+    }
+    /**
+     * @brief Scales a csr matrix on the right by a diagonal matrix
+     *
+     * @param[in]  n      - number of rows in the matrix
+     * @param[in]  a_row_ptr - row pointers (CSR storage)
+     * @param[in]  a_col_ind - column indices (CSR storage)
+     * @param[in, out]  a_val    - values (CSR storage). Changes in place.
+     * @param[in]  d_val    - diagonal values
+     *
+     * @todo Decide how to allow user to configure grid and block sizes.
+     */
+    __global__ void rightDiagScale(index_type n,
+                      const index_type* a_row_ptr,
+                      const index_type* a_col_ind,
+                      real_type* a_val,
+                      const real_type* d_val)
+    {
+      // Get row index from thread and block indices
+      index_type row = blockIdx.x * blockDim.x + threadIdx.x;
+
+      // Check if the thread's row is within matrix bounds
+      if (row < n) {
+        // Get the start and end positions for this row in the CSR format
+        index_type row_start = a_row_ptr[row];
+        index_type row_end = a_row_ptr[row + 1];
+
+        // Scale all non-zero elements in this row
+        for (index_type i = row_start; i < row_end; i++) {
+          a_val[i] *= d_val[a_col_ind[i]];
+        }
+      }
+    }
 
   } // namespace kernels
 
@@ -188,35 +253,35 @@ namespace ReSolve {
 
   /**
    * @brief Computes result = mvec^T * [vec1 vec2]
-   * 
+   *
    * @param n      - size of vectors vec1, vec2
-   * @param i      - 
-   * @param vec1   - (n x 1) vector 
+   * @param i      -
+   * @param vec1   - (n x 1) vector
    * @param vec2   - (n x 1) vector
    * @param mvec   - (n x (i+1)) multivector
    * @param result - ((i+1) x 2) multivector
-   * 
+   *
    * @todo Input data should be `const`.
    * @todo Is it coincidence that the block size is equal to the default
    * value of Tv5?
    * @todo Should we use dynamic shared memory here instead?
    */
-  void mass_inner_product_two_vectors(index_type n, 
-                                      index_type i, 
-                                      const real_type* vec1, 
-                                      const real_type* vec2, 
-                                      const real_type* mvec, 
+  void mass_inner_product_two_vectors(index_type n,
+                                      index_type i,
+                                      const real_type* vec1,
+                                      const real_type* vec2,
+                                      const real_type* mvec,
                                       real_type* result)
   {
     kernels::MassIPTwoVec<<<i, 1024>>>(vec1, vec2, mvec, result, i, n);
   }
 
   /**
    * @brief Computes y := y - x*alpha
-   * 
+   *
    * @param[in]  n     - vector size
    * @param[in]  i     - number of vectors in the multivector
-   * @param[in]  x     - (n x (i+1)) multivector 
+   * @param[in]  x     - (n x (i+1)) multivector
    * @param[out] y     - (n x (i+1)) multivector
    * @param[in]  alpha - ((i+1) x 1) vector
    */
@@ -226,20 +291,69 @@ namespace ReSolve {
   }
 
   /**
-   * @brief 
-   * 
+   * @brief Wrapper that scales a csr matrix on the left by a diagonal matrix
+   *
+   * @param[in]  n      - number of rows in the matrix
+   * @param[in]  a_row_ptr - row pointers (CSR storage)
+   * @param[in, out]  a_val    - values (CSR storage). Changes in place.
+   * @param[in]  d_val    - diagonal values
+   *
+   * @todo Decide how to allow user to configure grid and block sizes.
+   */
+  void leftDiagScale(index_type n,
+                       const index_type* a_row_ptr,
+                       real_type* a_val,
+                       const real_type* d_val)
+  {
+
+    // Define block size and number of blocks
+    const int blockSize = 1;
+    int numBlocks = (n + blockSize - 1) / blockSize;
+    // Launch the kernel
+    kernels::leftDiagScale<<<numBlocks, blockSize>>>(n, a_row_ptr, a_val, d_val);
+  }
+
+  /**
+   * @brief Wrapper that scales a csr matrix on the right by a diagonal matrix
+   *
+   * @param[in]  n      - number of rows in the matrix
+   * @param[in]  a_row_ptr - row pointers (CSR storage)
+   * @param[in]  a_col_ind - column indices (CSR storage)
+   * @param[in, out]  a_val    - values (CSR storage). Changes in place.
+   * @param[in]  d_val    - diagonal values
+   *
+   * @todo Decide how to allow user to configure grid and block sizes.
+   */
+  void cudaRightDiagScale(index_type n,
+                       const index_type* a_row_ptr,
+                       const index_type* a_col_ind,
+                       real_type* a_val,
+                       const real_type* d_val)
+  {
+    // Define block size and number of blocks
+    const int blockSize = 256;
+    int numBlocks = (n + blockSize - 1) / blockSize;
+    // Launch the kernel
+    kernels::rightDiagScale<<<numBlocks, blockSize>>>(n, a_row_ptr, a_col_ind, a_val, d_val);
+  }
+
+
+
+  /**
+   * @brief
+   *
    * @param[in]  n      -
    * @param[in]  nnz    -
-   * @param[in]  a_ia   - 
+   * @param[in]  a_ia   -
    * @param[in]  a_val  -
    * @param[out] result -
-   * 
+   *
    * @todo Decide how to allow user to configure grid and block sizes.
    */
-  void matrix_row_sums(index_type n, 
-                       index_type nnz, 
+  void matrix_row_sums(index_type n,
+                       index_type nnz,
                        const index_type* a_ia,
-                       const real_type* a_val, 
+                       const real_type* a_val,
                        real_type* result)
   {
     kernels::matrixInfNormPart1<<<1000, 1024>>>(n, nnz, a_ia, a_val, result);

diff --git a/resolve/cuda/cudaKernels.h b/resolve/cuda/cudaKernels.h
@@ -1,32 +1,43 @@
 /**
  * @file cudaKernels.h
  * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov)
- * 
+ *
  * @brief Contains prototypes of CUDA kernels.
- * 
+ *
  * @note These kernels will be used in CUDA specific code, only.
- * 
+ *
  */
 #pragma once
 
 #include <resolve/Common.hpp>
 
 namespace ReSolve {
 
-  void mass_inner_product_two_vectors(index_type n, 
-                                      index_type i, 
-                                      const real_type* vec1, 
-                                      const real_type* vec2, 
-                                      const real_type* mvec, 
+  void mass_inner_product_two_vectors(index_type n,
+                                      index_type i,
+                                      const real_type* vec1,
+                                      const real_type* vec2,
+                                      const real_type* mvec,
                                       real_type* result);
 
   void mass_axpy(index_type n, index_type i, const real_type* x, real_type* y, const real_type* alpha);
 
+  void leftDiagScale(index_type n,
+                      const index_type* a_row_ptr,
+                      real_type* a_val,
+                      const real_type* diag);
+
+  void cudaRightDiagScale(index_type n,
+                       const index_type* a_row_ptr,
+                       const index_type* a_col_idx,
+                       real_type* a_val,
+                       const real_type* diag);
+
   //needed for matrix inf nrm
-  void matrix_row_sums(index_type n, 
-                       index_type nnz, 
+  void matrix_row_sums(index_type n,
+                       index_type nnz,
                        const index_type* a_ia,
-                       const real_type* a_val, 
+                       const real_type* a_val,
                        real_type* result);
 
 } // namespace ReSolve