Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 150 additions & 36 deletions resolve/cuda/cudaKernels.cu
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
/**
* @file cudaKernels.cu
* @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov)
* @brief
* @brief
* @date 2023-12-08
*
*
*
*
*/

#include "cudaKernels.h"
Expand All @@ -17,22 +17,22 @@ namespace ReSolve {
/**
* @brief Computes v^T * [u1 u2] where v is n x k multivector
* and u1 and u2 are n x 1 vectors.
*
*
* @tparam Tv5 - Size of shared memory
*
*
* @param[in] u1 - (n x 1) vector
* @param[in] u2 - (n x 1) vector
* @param[in] v - (n x k) multivector
* @param[out] result - (k x 2) multivector
* @param[in] k - dimension of the subspace
* @param[in] N - size of vectors u1, u2
*/
template <size_t Tv5 = 1024>
__global__ void MassIPTwoVec(const real_type* __restrict__ u1,
const real_type* __restrict__ u2,
const real_type* __restrict__ v,
template <size_t Tv5 = 1024>
__global__ void MassIPTwoVec(const real_type* __restrict__ u1,
const real_type* __restrict__ u2,
const real_type* __restrict__ v,
real_type* result,
const index_type k,
const index_type k,
const index_type N)
{
index_type t = threadIdx.x;
Expand Down Expand Up @@ -119,16 +119,16 @@ namespace ReSolve {

/**
* @brief AXPY y = y - x*alpha where alpha is [k x 1], and x is [N x k] needed in 1 and 2 synch GMRES
*
* @tparam Tmaxk
*
*
* @tparam Tmaxk
*
* @param[in] N - number of rows in x
* @param[in] k - number of columns in x
* @param[in] x_data - double array, size [N x k]
* @param[out] y_data - double array, size [N x 1]
* @param[in] alpha - doble array, size [k x 1]
*/
template <size_t Tmaxk = 1024>
template <size_t Tmaxk = 1024>
__global__ void massAxpy3(index_type N,
index_type k,
const real_type* x_data,
Expand All @@ -155,17 +155,17 @@ namespace ReSolve {

/**
* @brief Pass through matrix rows and sum each as \sum_{j=1}^m abs(a_{ij})
*
*
* @param[in] n - number of rows in the matrix.
* @param[in] nnz - number of non-zero values in the matrix
* @param[in] a_ia - row pointers (CSR storage)
* @param[in] a_val - values (CSR storage)
* @param[out] result - array size [n x 1] containing sums of values in each row.
*/
__global__ void matrixInfNormPart1(const index_type n,
const index_type nnz,
__global__ void matrixInfNormPart1(const index_type n,
const index_type nnz,
const index_type* a_ia,
const real_type* a_val,
const real_type* a_val,
real_type* result)
{
index_type idx = blockIdx.x * blockDim.x + threadIdx.x;
Expand All @@ -178,6 +178,71 @@ namespace ReSolve {
idx += (blockDim.x * gridDim.x);
}
}
/**
* @brief Scales a csr matrix on the left by a diagonal matrix
*
* @param[in] n - number of rows in the matrix
* @param[in] a_row_ptr - row pointers (CSR storage)
* @param[in, out] a_val - values (CSR storage). Changes in place.
* @param[in] d_val - diagonal values
*
* @todo Decide how to allow user to configure grid and block sizes.
*/
__global__ void leftDiagScale(index_type n,
const index_type* a_row_ptr,
real_type* a_val,
const real_type* d_val)
{
// Get row index from thread and block indices
index_type row = blockIdx.x * blockDim.x + threadIdx.x;

// Check if the thread's row is within matrix bounds
if (row < n) {
// Get the start and end positions for this row in the CSR format
index_type row_start = a_row_ptr[row];
index_type row_end = a_row_ptr[row + 1];

// Get the scaling factor for this row from the diagonal matrix
real_type scale = d_val[row];

// Scale all non-zero elements in this row
for (index_type i = row_start; i < row_end; i++) {
a_val[i] *= scale;
}
}
}
/**
* @brief Scales a csr matrix on the right by a diagonal matrix
*
* @param[in] n - number of rows in the matrix
* @param[in] a_row_ptr - row pointers (CSR storage)
* @param[in] a_col_ind - column indices (CSR storage)
* @param[in, out] a_val - values (CSR storage). Changes in place.
* @param[in] d_val - diagonal values
*
* @todo Decide how to allow user to configure grid and block sizes.
*/
__global__ void rightDiagScale(index_type n,
const index_type* a_row_ptr,
const index_type* a_col_ind,
real_type* a_val,
const real_type* d_val)
{
// Get row index from thread and block indices
index_type row = blockIdx.x * blockDim.x + threadIdx.x;

// Check if the thread's row is within matrix bounds
if (row < n) {
// Get the start and end positions for this row in the CSR format
index_type row_start = a_row_ptr[row];
index_type row_end = a_row_ptr[row + 1];

// Scale all non-zero elements in this row
for (index_type i = row_start; i < row_end; i++) {
a_val[i] *= d_val[a_col_ind[i]];
}
}
}

} // namespace kernels

Expand All @@ -188,35 +253,35 @@ namespace ReSolve {

/**
* @brief Computes result = mvec^T * [vec1 vec2]
*
*
* @param n - size of vectors vec1, vec2
* @param i -
* @param vec1 - (n x 1) vector
* @param i -
* @param vec1 - (n x 1) vector
* @param vec2 - (n x 1) vector
* @param mvec - (n x (i+1)) multivector
* @param result - ((i+1) x 2) multivector
*
*
* @todo Input data should be `const`.
* @todo Is it coincidence that the block size is equal to the default
* value of Tv5?
* @todo Should we use dynamic shared memory here instead?
*/
void mass_inner_product_two_vectors(index_type n,
index_type i,
const real_type* vec1,
const real_type* vec2,
const real_type* mvec,
void mass_inner_product_two_vectors(index_type n,
index_type i,
const real_type* vec1,
const real_type* vec2,
const real_type* mvec,
real_type* result)
{
kernels::MassIPTwoVec<<<i, 1024>>>(vec1, vec2, mvec, result, i, n);
}

/**
* @brief Computes y := y - x*alpha
*
*
* @param[in] n - vector size
* @param[in] i - number of vectors in the multivector
* @param[in] x - (n x (i+1)) multivector
* @param[in] x - (n x (i+1)) multivector
* @param[out] y - (n x (i+1)) multivector
* @param[in] alpha - ((i+1) x 1) vector
*/
Expand All @@ -226,20 +291,69 @@ namespace ReSolve {
}

/**
* @brief
*
* @brief Wrapper that scales a csr matrix on the left by a diagonal matrix
*
* @param[in] n - number of rows in the matrix
* @param[in] a_row_ptr - row pointers (CSR storage)
* @param[in, out] a_val - values (CSR storage). Changes in place.
* @param[in] d_val - diagonal values
*
* @todo Decide how to allow user to configure grid and block sizes.
*/
void leftDiagScale(index_type n,
const index_type* a_row_ptr,
real_type* a_val,
const real_type* d_val)
{

// Define block size and number of blocks
const int blockSize = 1;
int numBlocks = (n + blockSize - 1) / blockSize;
// Launch the kernel
kernels::leftDiagScale<<<numBlocks, blockSize>>>(n, a_row_ptr, a_val, d_val);
}

/**
* @brief Wrapper that scales a csr matrix on the right by a diagonal matrix
*
* @param[in] n - number of rows in the matrix
* @param[in] a_row_ptr - row pointers (CSR storage)
* @param[in] a_col_ind - column indices (CSR storage)
* @param[in, out] a_val - values (CSR storage). Changes in place.
* @param[in] d_val - diagonal values
*
* @todo Decide how to allow user to configure grid and block sizes.
*/
void cudaRightDiagScale(index_type n,
const index_type* a_row_ptr,
const index_type* a_col_ind,
real_type* a_val,
const real_type* d_val)
{
// Define block size and number of blocks
const int blockSize = 256;
int numBlocks = (n + blockSize - 1) / blockSize;
// Launch the kernel
kernels::rightDiagScale<<<numBlocks, blockSize>>>(n, a_row_ptr, a_col_ind, a_val, d_val);
}



/**
* @brief
*
* @param[in] n -
* @param[in] nnz -
* @param[in] a_ia -
* @param[in] a_ia -
* @param[in] a_val -
* @param[out] result -
*
*
* @todo Decide how to allow user to configure grid and block sizes.
*/
void matrix_row_sums(index_type n,
index_type nnz,
void matrix_row_sums(index_type n,
index_type nnz,
const index_type* a_ia,
const real_type* a_val,
const real_type* a_val,
real_type* result)
{
kernels::matrixInfNormPart1<<<1000, 1024>>>(n, nnz, a_ia, a_val, result);
Expand Down
33 changes: 22 additions & 11 deletions resolve/cuda/cudaKernels.h
Original file line number Diff line number Diff line change
@@ -1,32 +1,43 @@
/**
* @file cudaKernels.h
* @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov)
*
*
* @brief Contains prototypes of CUDA kernels.
*
*
* @note These kernels will be used in CUDA specific code, only.
*
*
*/
#pragma once

#include <resolve/Common.hpp>

namespace ReSolve {

void mass_inner_product_two_vectors(index_type n,
index_type i,
const real_type* vec1,
const real_type* vec2,
const real_type* mvec,
void mass_inner_product_two_vectors(index_type n,
index_type i,
const real_type* vec1,
const real_type* vec2,
const real_type* mvec,
real_type* result);

void mass_axpy(index_type n, index_type i, const real_type* x, real_type* y, const real_type* alpha);

void leftDiagScale(index_type n,
const index_type* a_row_ptr,
real_type* a_val,
const real_type* diag);

void cudaRightDiagScale(index_type n,
const index_type* a_row_ptr,
const index_type* a_col_idx,
real_type* a_val,
const real_type* diag);

//needed for matrix inf nrm
void matrix_row_sums(index_type n,
index_type nnz,
void matrix_row_sums(index_type n,
index_type nnz,
const index_type* a_ia,
const real_type* a_val,
const real_type* a_val,
real_type* result);

} // namespace ReSolve
Loading