diff --git a/CMakePresets.json b/CMakePresets.json
index 8942b80a2..02f57d3cb 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -14,7 +14,8 @@
             "installDir": "${sourceDir}/install",
             "generator": "Unix Makefiles",
             "cacheVariables": {
-                "RESOLVE_USE_CUDA": "ON"
+                "RESOLVE_USE_CUDA": "ON",
+		"CMAKE_CUDA_ARCHITECTURES": "60"
             }
         },
         {
@@ -45,7 +46,8 @@
             "description": "Custom changes specific for Ascent",
             "cacheVariables": {
                   "CMAKE_C_COMPILER": "$env{OLCF_GCC_ROOT}/bin/gcc",
-                  "CMAKE_CXX_COMPILER": "$env{OLCF_GCC_ROOT}/bin/g++"
+                  "CMAKE_CXX_COMPILER": "$env{OLCF_GCC_ROOT}/bin/g++",
+		  "CMAKE_CUDA_ARCHITECTURES": "70"
 	          }
         },
         {
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 5fb26a284..0148a7e17 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -36,7 +36,10 @@ if(RESOLVE_USE_CUDA)
   # Build example where matrix data is updated 
   add_executable(klu_glu_values_update.exe r_KLU_GLU_matrix_values_update.cpp)
   target_link_libraries(klu_glu_values_update.exe PRIVATE ReSolve)
-
+  
+  #rand solver
+  add_executable(gmres_cusparse_rand.exe r_randGMRES_CUDA.cpp)
+  target_link_libraries(gmres_cusparse_rand.exe PRIVATE ReSolve)
 endif(RESOLVE_USE_CUDA)
 
 # Create HIP examples
@@ -53,17 +56,21 @@ if(RESOLVE_USE_HIP)
   add_executable(klu_rocsolverrf_check_redo.exe r_KLU_rocsolverrf_redo_factorization.cpp)
   target_link_libraries(klu_rocsolverrf_check_redo.exe PRIVATE ReSolve)
 
+
+  # Rand GMRES test with rocsparse
+  add_executable(gmres_rocsparse_rand.exe r_randGMRES.cpp)
+  target_link_libraries(gmres_rocsparse_rand.exe PRIVATE ReSolve)
 endif(RESOLVE_USE_HIP)
 
 # Install all examples in bin directory
 set(installable_executables klu_klu.exe klu_klu_standalone.exe)
 
 if(RESOLVE_USE_CUDA)
-  set(installable_executables ${installable_executables} klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe)        
+  set(installable_executables ${installable_executables} klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe gmres_cusparse_rand.exe)        
 endif(RESOLVE_USE_CUDA)
 
 if(RESOLVE_USE_HIP)
-  set(installable_executables ${installable_executables} klu_rocsolverrf.exe klu_rocsolverrf_fgmres.exe klu_rocsolverrf_check_redo.exe)        
+  set(installable_executables ${installable_executables} klu_rocsolverrf.exe klu_rocsolverrf_fgmres.exe klu_rocsolverrf_check_redo.exe gmres_rocsparse_rand.exe)        
 endif(RESOLVE_USE_HIP)
 
 install(TARGETS ${installable_executables} 
diff --git a/examples/r_randGMRES.cpp b/examples/r_randGMRES.cpp
new file mode 100644
index 000000000..b3f85cc95
--- /dev/null
+++ b/examples/r_randGMRES.cpp
@@ -0,0 +1,128 @@
+#include <string>
+#include <iostream>
+#include <iomanip>
+
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/io.hpp>
+#include <resolve/matrix/MatrixHandler.hpp>
+#include <resolve/vector/VectorHandler.hpp>
+#include <resolve/LinSolverDirectKLU.hpp>
+#include <resolve/LinSolverDirectRocSparseILU0.hpp>
+#include <resolve/LinSolverIterativeRandFGMRES.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+
+using namespace ReSolve::constants;
+
+int main(int argc, char *argv[])
+{
+  // Use the same data types as those you specified in ReSolve build.
+  using real_type  = ReSolve::real_type;
+  using vector_type = ReSolve::vector::Vector;
+
+  (void) argc; // TODO: Check if the number of input parameters is correct.
+  std::string  matrixFileName = argv[1];
+  std::string  rhsFileName = argv[2];
+
+
+  ReSolve::matrix::Coo* A_coo;
+  ReSolve::matrix::Csr* A;
+  ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP();
+  workspace_HIP->initializeHandles();
+  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_HIP);
+  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_HIP);
+  real_type* rhs = nullptr;
+  real_type* x   = nullptr;
+
+  vector_type* vec_rhs;
+  vector_type* vec_x;
+  vector_type* vec_r;
+
+  ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2);
+
+  ReSolve::LinSolverDirectRocSparseILU0* Rf = new ReSolve::LinSolverDirectRocSparseILU0(workspace_HIP);
+  ReSolve::LinSolverIterativeRandFGMRES* FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::cs, GS, "hip");
+
+  std::cout << std::endl << std::endl << std::endl;
+  std::cout << "========================================================================================================================"<<std::endl;
+  std::cout << "Reading: " << matrixFileName<< std::endl;
+  std::cout << "========================================================================================================================"<<std::endl;
+  std::cout << std::endl;
+  std::ifstream mat_file(matrixFileName);
+  if(!mat_file.is_open())
+  {
+    std::cout << "Failed to open file " << matrixFileName << "\n";
+    return -1;
+  }
+  std::ifstream rhs_file(rhsFileName);
+  if(!rhs_file.is_open())
+  {
+    std::cout << "Failed to open file " << rhsFileName << "\n";
+    return -1;
+  }
+  A_coo = ReSolve::io::readMatrixFromFile(mat_file);
+  A = new ReSolve::matrix::Csr(A_coo->getNumRows(),
+                               A_coo->getNumColumns(),
+                               A_coo->getNnz(),
+                               A_coo->symmetric(),
+                               A_coo->expanded());
+
+  rhs = ReSolve::io::readRhsFromFile(rhs_file);
+  x = new real_type[A->getNumRows()];
+  vec_rhs = new vector_type(A->getNumRows());
+  vec_x = new vector_type(A->getNumRows());
+  vec_x->allocate(ReSolve::memory::HOST);
+  //iinit guess is 0U
+  vec_x->allocate(ReSolve::memory::DEVICE);
+  vec_x->setToZero(ReSolve::memory::DEVICE);
+  vec_r = new vector_type(A->getNumRows());
+  std::cout<<"Finished reading the matrix and rhs, size: "<<A->getNumRows()<<" x "<<A->getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<<A->symmetric()<< ", Expanded? "<<A->expanded()<<std::endl;
+  mat_file.close();
+  rhs_file.close();
+
+  matrix_handler->coo2csr(A_coo,A, "hip");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  //Now call direct solver
+  real_type norm_b;
+  matrix_handler->setValuesChanged(true, "hip");
+
+  Rf->setup(A);
+  FGMRES->setRestart(150);
+  FGMRES->setMaxit(2500);
+  FGMRES->setTol(1e-12);
+  FGMRES->setup(A);
+  GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); 
+
+  //matrix_handler->setValuesChanged(true, "hip");
+  FGMRES->resetMatrix(A);
+  FGMRES->setupPreconditioner("LU", Rf);
+  FGMRES->setFlexible(1); 
+
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  FGMRES->solve(vec_rhs, vec_x);
+
+  norm_b = vector_handler->dot(vec_rhs, vec_rhs, "hip");
+  norm_b = sqrt(norm_b);
+  std::cout << "FGMRES: init nrm: " 
+    << std::scientific << std::setprecision(16) 
+    << FGMRES->getInitResidualNorm()/norm_b
+    << " final nrm: "
+    << FGMRES->getFinalResidualNorm()/norm_b
+    << " iter: " << FGMRES->getNumIter() << "\n";
+
+
+  delete A;
+  delete A_coo;
+  delete Rf;
+  delete [] x;
+  delete [] rhs;
+  delete vec_r;
+  delete vec_x;
+  delete workspace_HIP;
+  delete matrix_handler;
+  delete vector_handler;
+
+  return 0;
+}
diff --git a/examples/r_randGMRES_CUDA.cpp b/examples/r_randGMRES_CUDA.cpp
new file mode 100644
index 000000000..814ca2d4f
--- /dev/null
+++ b/examples/r_randGMRES_CUDA.cpp
@@ -0,0 +1,128 @@
+#include <string>
+#include <iostream>
+#include <iomanip>
+
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/io.hpp>
+#include <resolve/matrix/MatrixHandler.hpp>
+#include <resolve/vector/VectorHandler.hpp>
+#include <resolve/LinSolverDirectKLU.hpp>
+#include <resolve/LinSolverDirectCuSparseILU0.hpp>
+#include <resolve/LinSolverIterativeRandFGMRES.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+
+using namespace ReSolve::constants;
+
+int main(int argc, char *argv[])
+{
+  // Use the same data types as those you specified in ReSolve build.
+  using real_type  = ReSolve::real_type;
+  using vector_type = ReSolve::vector::Vector;
+
+  (void) argc; // TODO: Check if the number of input parameters is correct.
+  std::string  matrixFileName = argv[1];
+  std::string  rhsFileName = argv[2];
+
+
+  ReSolve::matrix::Coo* A_coo;
+  ReSolve::matrix::Csr* A;
+  ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA();
+  workspace_CUDA->initializeHandles();
+  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_CUDA);
+  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_CUDA);
+  real_type* rhs = nullptr;
+  real_type* x   = nullptr;
+
+  vector_type* vec_rhs;
+  vector_type* vec_x;
+  vector_type* vec_r;
+
+  ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2);
+
+  ReSolve::LinSolverDirectCuSparseILU0* Rf = new ReSolve::LinSolverDirectCuSparseILU0(workspace_CUDA);
+  ReSolve::LinSolverIterativeRandFGMRES* FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::fwht, GS, "cuda");
+
+  std::cout << std::endl << std::endl << std::endl;
+  std::cout << "========================================================================================================================"<<std::endl;
+  std::cout << "Reading: " << matrixFileName<< std::endl;
+  std::cout << "========================================================================================================================"<<std::endl;
+  std::cout << std::endl;
+  std::ifstream mat_file(matrixFileName);
+  if(!mat_file.is_open())
+  {
+    std::cout << "Failed to open file " << matrixFileName << "\n";
+    return -1;
+  }
+  std::ifstream rhs_file(rhsFileName);
+  if(!rhs_file.is_open())
+  {
+    std::cout << "Failed to open file " << rhsFileName << "\n";
+    return -1;
+  }
+  A_coo = ReSolve::io::readMatrixFromFile(mat_file);
+  A = new ReSolve::matrix::Csr(A_coo->getNumRows(),
+                               A_coo->getNumColumns(),
+                               A_coo->getNnz(),
+                               A_coo->symmetric(),
+                               A_coo->expanded());
+
+  rhs = ReSolve::io::readRhsFromFile(rhs_file);
+  x = new real_type[A->getNumRows()];
+  vec_rhs = new vector_type(A->getNumRows());
+  vec_x = new vector_type(A->getNumRows());
+  vec_x->allocate(ReSolve::memory::HOST);
+  //iinit guess is 0U
+  vec_x->allocate(ReSolve::memory::DEVICE);
+  vec_x->setToZero(ReSolve::memory::DEVICE);
+  vec_r = new vector_type(A->getNumRows());
+  std::cout<<"Finished reading the matrix and rhs, size: "<<A->getNumRows()<<" x "<<A->getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<<A->symmetric()<< ", Expanded? "<<A->expanded()<<std::endl;
+  mat_file.close();
+  rhs_file.close();
+
+  matrix_handler->coo2csr(A_coo,A, "cuda");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  //Now call direct solver
+  real_type norm_b;
+  matrix_handler->setValuesChanged(true, "cuda");
+
+  Rf->setup(A, nullptr, nullptr, nullptr, nullptr, vec_rhs);
+  FGMRES->setRestart(800);
+  FGMRES->setMaxit(800);
+  FGMRES->setTol(1e-12);
+  FGMRES->setup(A);
+  GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); 
+
+  //matrix_handler->setValuesChanged(true, "cuda");
+  FGMRES->resetMatrix(A);
+  FGMRES->setupPreconditioner("LU", Rf);
+  FGMRES->setFlexible(1); 
+
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  FGMRES->solve(vec_rhs, vec_x);
+
+  norm_b = vector_handler->dot(vec_rhs, vec_rhs, "cuda");
+  norm_b = sqrt(norm_b);
+  std::cout << "FGMRES: init nrm: " 
+    << std::scientific << std::setprecision(16) 
+    << FGMRES->getInitResidualNorm()/norm_b
+    << " final nrm: "
+    << FGMRES->getFinalResidualNorm()/norm_b
+    << " iter: " << FGMRES->getNumIter() << "\n";
+
+
+  delete A;
+  delete A_coo;
+  delete Rf;
+  delete [] x;
+  delete [] rhs;
+  delete vec_r;
+  delete vec_x;
+  delete workspace_CUDA;
+  delete matrix_handler;
+  delete vector_handler;
+
+  return 0;
+}
diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt
index b98c82344..501b0609d 100644
--- a/resolve/CMakeLists.txt
+++ b/resolve/CMakeLists.txt
@@ -18,16 +18,22 @@ set(ReSolve_SRC
 set(ReSolve_GPU_SRC
     GramSchmidt.cpp
     LinSolverIterativeFGMRES.cpp
-)
+    LinSolverIterativeRandFGMRES.cpp
+    RandSketchingManager.cpp
+    RandSketchingCountSketch.cpp
+    RandSketchingFWHT.cpp
+    )
 
 # C++ code that links to CUDA SDK libraries
 set(ReSolve_CUDASDK_SRC
     LinSolverDirectCuSolverGLU.cpp
     LinSolverDirectCuSolverRf.cpp
+    LinSolverDirectCuSparseILU0.cpp
 )
 # HIP files
 set(ReSolve_ROCM_SRC
   LinSolverDirectRocSolverRf.cpp
+  LinSolverDirectRocSparseILU0.cpp
 )
 # Header files to be installed
 set(ReSolve_HEADER_INSTALL
@@ -37,13 +43,15 @@ set(ReSolve_HEADER_INSTALL
     LinSolverDirectCuSolverGLU.hpp
     LinSolverDirectCuSolverRf.hpp
     LinSolverDirectRocSolverRf.hpp
+    LinSolverDirectRocSparseILU0.hpp
+    LinSolverDirectCuSparseILU0.hpp
     LinSolverDirectKLU.hpp
     LinSolverIterativeFGMRES.hpp
     RefactorizationSolver.hpp
     SystemSolver.hpp
     GramSchmidt.hpp
     MemoryUtils.hpp
-)
+    RandSketchingManager.hpp)
 
 # Now, build workspaces
 add_subdirectory(workspace)
diff --git a/resolve/GramSchmidt.cpp b/resolve/GramSchmidt.cpp
index 7a6572b3b..8385d9d39 100644
--- a/resolve/GramSchmidt.cpp
+++ b/resolve/GramSchmidt.cpp
@@ -151,32 +151,38 @@ namespace ReSolve
           vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V,  vec_v_, vec_Hcolumn_, memspace);
           // V(:,i+1) = V(:, i+1) -  V(:,1:i)*Hcol
           vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, memspace );  
-
+          mem_.deviceSynchronize();
+          
           // copy H_col to aux, we will need it later
           vec_Hcolumn_->setDataUpdated(memory::DEVICE);
           vec_Hcolumn_->setCurrentSize(i + 1);
           vec_Hcolumn_->deepCopyVectorData(h_aux_, 0, memory::HOST);
+          mem_.deviceSynchronize();
 
           //Hcol = V(:,1:i)^T*V(:,i+1);
           vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V,  vec_v_, vec_Hcolumn_, memspace);
+          mem_.deviceSynchronize();
 
           // V(:,i+1) = V(:, i+1) -  V(:,1:i)*Hcol
           vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, memspace );  
+          mem_.deviceSynchronize();
 
           // copy H_col to H
           vec_Hcolumn_->setDataUpdated(memory::DEVICE);
           vec_Hcolumn_->deepCopyVectorData(&H[ idxmap(i, 0, num_vecs_ + 1)], 0, memory::HOST);
+          mem_.deviceSynchronize();
 
           // add both pieces together (unstable otherwise, careful here!!)
           t = 0.0;
           for(int j = 0; j <= i; ++j) {
-            H[ idxmap(i, j, num_vecs_ + 1)] += h_aux_[j]; 
+            H[ idxmap(i, j, num_vecs_ + 1)] += h_aux_[j];
           }
 
           t = vector_handler_->dot(vec_v_, vec_v_, memspace);  
           //set the last entry in Hessenberg matrix
           t = sqrt(t);
           H[ idxmap(i, i + 1, num_vecs_ + 1) ] = t; 
+
           if(fabs(t) > EPSILON) {
             t = 1.0/t;
             vector_handler_->scal(&t, vec_v_, memspace);  
diff --git a/resolve/GramSchmidt.hpp b/resolve/GramSchmidt.hpp
index 7d7b93bef..68ea8b09b 100644
--- a/resolve/GramSchmidt.hpp
+++ b/resolve/GramSchmidt.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include "Common.hpp"
 #include <resolve/vector/VectorHandler.hpp>
+#include <resolve/MemoryUtils.hpp>
 #include <iostream>
 #include <cassert>
 namespace ReSolve 
@@ -42,6 +43,8 @@ namespace ReSolve
 
       vector_type* vec_v_{nullptr}; // aux variable
       vector_type* vec_w_{nullptr}; // aux variable
+    
+      MemoryHandler mem_; ///< Device memory manager object
   };
 
 }//namespace
diff --git a/resolve/LinSolverDirectCuSparseILU0.cpp b/resolve/LinSolverDirectCuSparseILU0.cpp
new file mode 100644
index 000000000..cf3ea005b
--- /dev/null
+++ b/resolve/LinSolverDirectCuSparseILU0.cpp
@@ -0,0 +1,312 @@
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include "LinSolverDirectCuSparseILU0.hpp"
+#include <algorithm>
+
+namespace ReSolve 
+{
+  LinSolverDirectCuSparseILU0::LinSolverDirectCuSparseILU0(LinAlgWorkspaceCUDA* workspace)
+  {
+    workspace_ = workspace;
+  }
+
+  LinSolverDirectCuSparseILU0::~LinSolverDirectCuSparseILU0()
+  {
+    mem_.deleteOnDevice(d_aux1_);
+    mem_.deleteOnDevice(d_aux2_);
+    mem_.deleteOnDevice(d_ILU_vals_);
+  }
+
+  int LinSolverDirectCuSparseILU0::setup(matrix::Sparse* A,
+                                         matrix::Sparse*,
+                                         matrix::Sparse*,
+                                         index_type*,
+                                         index_type*,
+                                         vector_type* )
+  {
+    //remember - P and Q are generally CPU variables
+    int error_sum = 0;
+    this->A_ = (matrix::Csr*) A;
+    index_type n = A_->getNumRows();
+
+    index_type nnz = A_->getNnzExpanded();
+    mem_.allocateArrayOnDevice(&d_ILU_vals_,nnz); 
+    //copy A values to a buffer first
+    mem_.copyArrayDeviceToDevice(d_ILU_vals_, A_->getValues(ReSolve::memory::DEVICE), nnz);
+
+    mem_.allocateArrayOnDevice(&d_aux1_,n); 
+    mem_.allocateArrayOnDevice(&d_aux2_,n); 
+    cudaMemset(d_aux1_, 1, n*sizeof(double));
+    cusparseCreateDnVec(&vec_X_, n, d_aux1_, CUDA_R_64F);
+    cusparseCreateDnVec(&vec_Y_, n, d_aux2_, CUDA_R_64F);
+
+    //set up descriptors
+
+    // Create matrix descriptor for A
+    cusparseCreateMatDescr(&descr_A_);
+    cusparseSetMatIndexBase(descr_A_, CUSPARSE_INDEX_BASE_ZERO);
+    cusparseSetMatType(descr_A_, CUSPARSE_MATRIX_TYPE_GENERAL);
+
+    // Create matrix descriptor for L and U
+    cusparseSpSV_createDescr(&descr_spsv_L_);
+    cusparseSpSV_createDescr(&descr_spsv_U_);
+
+    cusparseCreateCsr(&mat_L_,
+                      n,
+                      n,
+                      nnz,
+                      A_->getRowData(ReSolve::memory::DEVICE), 
+                      A_->getColData(ReSolve::memory::DEVICE), 
+                      d_ILU_vals_, //vals_, 
+                      CUSPARSE_INDEX_32I,
+                      CUSPARSE_INDEX_32I,
+                      CUSPARSE_INDEX_BASE_ZERO,
+                      CUDA_R_64F);
+
+    cusparseCreateCsr(&mat_U_,
+                      n,
+                      n,
+                      nnz,
+                      A_->getRowData(ReSolve::memory::DEVICE), 
+                      A_->getColData(ReSolve::memory::DEVICE), 
+                      d_ILU_vals_, //vals_, 
+                      CUSPARSE_INDEX_32I,
+                      CUSPARSE_INDEX_32I,
+                      CUSPARSE_INDEX_BASE_ZERO,
+                      CUDA_R_64F);
+
+
+    // Create matrix info structure
+    cusparseCreateCsrilu02Info(&info_A_);
+
+    int buffer_size_A;
+    size_t buffer_size_L;
+    size_t buffer_size_U;
+
+    status_cusparse_ = cusparseDcsrilu02_bufferSize(workspace_->getCusparseHandle(), 
+                                                    n, 
+                                                    nnz, 
+                                                    descr_A_,
+                                                    d_ILU_vals_, //vals_, 
+                                                    A_->getRowData(ReSolve::memory::DEVICE), 
+                                                    A_->getColData(ReSolve::memory::DEVICE), 
+                                                    info_A_, 
+                                                    &buffer_size_A);
+
+    mem_.allocateBufferOnDevice(&buffer_,(size_t) buffer_size_A);
+    error_sum += status_cusparse_;
+    // Now analysis
+    status_cusparse_ = cusparseDcsrilu02_analysis(workspace_->getCusparseHandle(), 
+                                                  n, 
+                                                  nnz, 
+                                                  descr_A_,
+                                                  d_ILU_vals_, //vals_, 
+                                                  A_->getRowData(ReSolve::memory::DEVICE), 
+                                                  A_->getColData(ReSolve::memory::DEVICE), 
+                                                  info_A_,
+                                                  CUSPARSE_SOLVE_POLICY_USE_LEVEL,
+                                                  buffer_);
+
+    error_sum += status_cusparse_;
+    // and now the actual decomposition
+
+    // Compute incomplete LU factorization
+    status_cusparse_ = cusparseDcsrilu02(workspace_->getCusparseHandle(), 
+                                         n, 
+                                         nnz, 
+                                         descr_A_,
+                                         d_ILU_vals_, //vals_ 
+                                         A_->getRowData(ReSolve::memory::DEVICE), 
+                                         A_->getColData(ReSolve::memory::DEVICE), 
+                                         info_A_,
+                                         CUSPARSE_SOLVE_POLICY_USE_LEVEL,
+                                         buffer_);
+
+    error_sum += status_cusparse_;
+
+    // now take care of LU solve 
+
+    // now create actual Sparse matrix  OBJECTS for L and U
+
+    cusparseFillMode_t fillmodeL = CUSPARSE_FILL_MODE_LOWER;
+    cusparseFillMode_t fillmodeU = CUSPARSE_FILL_MODE_UPPER;
+
+    cusparseDiagType_t diagtypeL = CUSPARSE_DIAG_TYPE_UNIT; 
+    cusparseDiagType_t diagtypeU = CUSPARSE_DIAG_TYPE_NON_UNIT; 
+
+    cusparseSpMatSetAttribute(mat_L_, 
+                              CUSPARSE_SPMAT_FILL_MODE,
+                              &fillmodeL, 
+                              sizeof(fillmodeL));  
+
+    cusparseSpMatSetAttribute(mat_U_, 
+                              CUSPARSE_SPMAT_FILL_MODE,
+                              &fillmodeU, 
+                              sizeof(fillmodeU));
+
+    cusparseSpMatSetAttribute(mat_L_, 
+                              CUSPARSE_SPMAT_DIAG_TYPE,
+                              &diagtypeL, 
+                              sizeof(diagtypeL));  
+
+    cusparseSpMatSetAttribute(mat_U_, 
+                              CUSPARSE_SPMAT_DIAG_TYPE,
+                              &diagtypeU, 
+                              sizeof(diagtypeU));  
+
+    status_cusparse_ = cusparseSpSV_bufferSize(workspace_->getCusparseHandle(), 
+                                               CUSPARSE_OPERATION_NON_TRANSPOSE, 
+                                               &(constants::ONE), 
+                                               mat_L_,
+                                               vec_X_,
+                                               vec_Y_, 
+                                               CUDA_R_64F,
+                                               CUSPARSE_SPSV_ALG_DEFAULT,
+                                               descr_spsv_L_, 
+                                               &buffer_size_L);
+    error_sum += status_cusparse_;
+
+    mem_.allocateBufferOnDevice(&buffer_L_, buffer_size_L);
+    status_cusparse_ = cusparseSpSV_bufferSize(workspace_->getCusparseHandle(), 
+                                               CUSPARSE_OPERATION_NON_TRANSPOSE, 
+                                               &(constants::ONE), 
+                                               mat_U_,
+                                               vec_X_,
+                                               vec_Y_, 
+                                               CUDA_R_64F,
+                                               CUSPARSE_SPSV_ALG_DEFAULT,
+                                               descr_spsv_U_, 
+                                               &buffer_size_U);
+    error_sum += status_cusparse_;
+
+    mem_.allocateBufferOnDevice(&buffer_U_, buffer_size_U);
+
+    status_cusparse_ = cusparseSpSV_analysis(workspace_->getCusparseHandle(), 
+                                             CUSPARSE_OPERATION_NON_TRANSPOSE, 
+                                             &(constants::ONE), 
+                                             mat_L_,
+                                             vec_X_,
+                                             vec_Y_,
+                                             CUDA_R_64F,
+                                             CUSPARSE_SPSV_ALG_DEFAULT,
+                                             descr_spsv_L_,
+                                             buffer_L_);
+    error_sum += status_cusparse_;
+
+
+    status_cusparse_ = cusparseSpSV_analysis(workspace_->getCusparseHandle(), 
+                                             CUSPARSE_OPERATION_NON_TRANSPOSE, 
+                                             &(constants::ONE), 
+                                             mat_U_,
+                                             vec_X_,
+                                             vec_Y_,
+                                             CUDA_R_64F,
+                                             CUSPARSE_SPSV_ALG_DEFAULT,
+                                             descr_spsv_U_,
+                                             buffer_U_);
+
+    error_sum += status_cusparse_;
+    cusparseDestroyDnVec(vec_X_);
+    cusparseDestroyDnVec(vec_Y_);
+    return error_sum;
+  }
+
+  int LinSolverDirectCuSparseILU0::reset(matrix::Sparse* A)
+  {
+    int error_sum = 0;
+    this->A_ = A;
+    index_type n = A_->getNumRows();
+    index_type nnz = A_->getNnzExpanded();
+    mem_.copyArrayDeviceToDevice(d_ILU_vals_, A_->getValues(ReSolve::memory::DEVICE), nnz);
+
+    status_cusparse_ = cusparseDcsrilu02(workspace_->getCusparseHandle(), 
+                                         n, 
+                                         nnz, 
+                                         descr_A_,
+                                         d_ILU_vals_, //vals_ 
+                                         A_->getRowData(ReSolve::memory::DEVICE), 
+                                         A_->getColData(ReSolve::memory::DEVICE), 
+                                         info_A_,
+                                         CUSPARSE_SOLVE_POLICY_USE_LEVEL,
+                                         buffer_);
+    //rerun tri solve analysis - to be updated
+    error_sum += status_cusparse_;
+
+    return error_sum;
+  }
+  // solution is returned in RHS
+  int LinSolverDirectCuSparseILU0::solve(vector_type* rhs)
+  {
+    int error_sum = 0;
+   
+    cusparseCreateDnVec(&vec_X_, A_->getNumRows(), rhs->getData(ReSolve::memory::DEVICE), CUDA_R_64F);
+    cusparseCreateDnVec(&vec_Y_, A_->getNumRows(), d_aux1_, CUDA_R_64F);
+   
+    status_cusparse_ = cusparseSpSV_solve(workspace_->getCusparseHandle(), 
+                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                          &(constants::ONE), 
+                                          mat_L_,
+                                          vec_X_,
+                                          vec_Y_,
+                                          CUDA_R_64F,
+                                          CUSPARSE_SPSV_ALG_DEFAULT,
+                                          descr_spsv_L_);
+    error_sum += status_cusparse_;
+
+    status_cusparse_ = cusparseSpSV_solve(workspace_->getCusparseHandle(), 
+                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                          &(constants::ONE), 
+                                          mat_U_,
+                                          vec_Y_,
+                                          vec_X_,
+                                          CUDA_R_64F,
+                                          CUSPARSE_SPSV_ALG_DEFAULT,
+                                          descr_spsv_U_);
+    error_sum += status_cusparse_;
+   
+    rhs->setDataUpdated(ReSolve::memory::DEVICE);
+   
+    cusparseDestroyDnVec(vec_X_);
+    cusparseDestroyDnVec(vec_Y_);
+
+    return error_sum;
+  }
+
+  int LinSolverDirectCuSparseILU0::solve(vector_type* rhs, vector_type* x)
+  {
+    int error_sum = 0;
+    
+    cusparseCreateDnVec(&vec_X_, A_->getNumRows(), rhs->getData(ReSolve::memory::DEVICE), CUDA_R_64F);
+    cusparseCreateDnVec(&vec_Y_, A_->getNumRows(), d_aux1_, CUDA_R_64F);
+    
+    status_cusparse_ = cusparseSpSV_solve(workspace_->getCusparseHandle(), 
+                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                          &(constants::ONE), 
+                                          mat_L_,
+                                          vec_X_,
+                                          vec_Y_,
+                                          CUDA_R_64F,
+                                          CUSPARSE_SPSV_ALG_DEFAULT,
+                                          descr_spsv_L_);
+    error_sum += status_cusparse_;
+
+    cusparseCreateDnVec(&vec_X_, A_->getNumRows(), x->getData(ReSolve::memory::DEVICE), CUDA_R_64F);
+    status_cusparse_ = cusparseSpSV_solve(workspace_->getCusparseHandle(), 
+                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                          &(constants::ONE), 
+                                          mat_U_,
+                                          vec_Y_,
+                                          vec_X_,
+                                          CUDA_R_64F,
+                                          CUSPARSE_SPSV_ALG_DEFAULT,
+                                          descr_spsv_U_);
+    error_sum += status_cusparse_;
+   
+    x->setDataUpdated(ReSolve::memory::DEVICE);
+   
+    cusparseDestroyDnVec(vec_X_);
+    cusparseDestroyDnVec(vec_Y_);
+
+    return error_sum;
+  }
+}// namespace resolve
diff --git a/resolve/LinSolverDirectCuSparseILU0.hpp b/resolve/LinSolverDirectCuSparseILU0.hpp
new file mode 100644
index 000000000..1807bb991
--- /dev/null
+++ b/resolve/LinSolverDirectCuSparseILU0.hpp
@@ -0,0 +1,73 @@
+#pragma once
+#include "Common.hpp"
+#include "LinSolver.hpp"
+#include <resolve/MemoryUtils.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+
+#include <cusparse.h>
+#include <cuda_runtime.h>
+
+namespace ReSolve 
+{
+  // Forward declaration of vector::Vector class
+  namespace vector
+  {
+    class Vector;
+  }
+
+  // Forward declaration of matrix::Sparse class
+  namespace matrix
+  {
+    class Sparse;
+  }
+
+  class LinSolverDirectCuSparseILU0 : public LinSolverDirect 
+  {
+    using vector_type = vector::Vector;
+    
+    public: 
+      LinSolverDirectCuSparseILU0(LinAlgWorkspaceCUDA* workspace);
+      ~LinSolverDirectCuSparseILU0();
+      
+      int setup(matrix::Sparse* A,
+                matrix::Sparse* L = nullptr,
+                matrix::Sparse* U = nullptr,
+                index_type*     P = nullptr,
+                index_type*     Q = nullptr,
+                vector_type* rhs  = nullptr);
+      // if values of A change, but the nnz pattern does not, redo the analysis only (reuse buffers though)
+      int reset(matrix::Sparse* A);
+       
+      int solve(vector_type* rhs, vector_type* x);
+      int solve(vector_type* rhs);// the solutuon is returned IN RHS (rhs is overwritten)
+    
+
+    private:
+      cusparseStatus_t status_cusparse_;
+
+      MemoryHandler mem_; ///< Device memory manager object
+      LinAlgWorkspaceCUDA* workspace_; 
+
+      cusparseMatDescr_t descr_A_{nullptr};
+      
+      cusparseSpMatDescr_t mat_L_; 
+      cusparseSpMatDescr_t mat_U_; 
+      
+      cusparseSpSVDescr_t descr_spsv_L_{nullptr};
+      cusparseSpSVDescr_t descr_spsv_U_{nullptr};
+      csrilu02Info_t  info_A_{nullptr};
+      
+      void* buffer_; 
+      void* buffer_L_; 
+      void* buffer_U_; 
+      
+      real_type* d_aux1_;
+      real_type* d_aux2_;
+      
+      cusparseDnVecDescr_t vec_X_;
+      cusparseDnVecDescr_t vec_Y_;
+
+      // since ILU OVERWRITES THE MATRIX values, we need a buffer to keep the values of ILU decomposition. 
+      real_type* d_ILU_vals_;
+  };
+}// namespace
diff --git a/resolve/LinSolverDirectRocSparseILU0.cpp b/resolve/LinSolverDirectRocSparseILU0.cpp
new file mode 100644
index 000000000..8f15e0579
--- /dev/null
+++ b/resolve/LinSolverDirectRocSparseILU0.cpp
@@ -0,0 +1,293 @@
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include "LinSolverDirectRocSparseILU0.hpp"
+
+#include <resolve/utilities/logger/Logger.hpp>
+
+namespace ReSolve 
+{
+  LinSolverDirectRocSparseILU0::LinSolverDirectRocSparseILU0(LinAlgWorkspaceHIP* workspace)
+  {
+    workspace_ = workspace;
+  }
+
+  LinSolverDirectRocSparseILU0::~LinSolverDirectRocSparseILU0()
+  {
+    mem_.deleteOnDevice(d_aux1_);
+    mem_.deleteOnDevice(d_ILU_vals_);
+  }
+
+  int LinSolverDirectRocSparseILU0::setup(matrix::Sparse* A,
+                                          matrix::Sparse*,
+                                          matrix::Sparse*,
+                                          index_type*,
+                                          index_type*,
+                                          vector_type* )
+  {
+    //remember - P and Q are generally CPU variables
+    int error_sum = 0;
+    this->A_ = (matrix::Csr*) A;
+    index_type n = A_->getNumRows();
+
+    index_type nnz = A_->getNnzExpanded();
+    mem_.allocateArrayOnDevice(&d_ILU_vals_,nnz); 
+    //copy A values to a buffer first
+    mem_.copyArrayDeviceToDevice(d_ILU_vals_, A_->getValues(ReSolve::memory::DEVICE), nnz);
+
+    //set up descriptors
+
+    // Create matrix descriptor for A
+    rocsparse_create_mat_descr(&descr_A_);
+
+    // Create matrix descriptor for L
+    rocsparse_create_mat_descr(&descr_L_);
+    rocsparse_set_mat_fill_mode(descr_L_, rocsparse_fill_mode_lower);
+    rocsparse_set_mat_diag_type(descr_L_, rocsparse_diag_type_unit);
+
+    // Create matrix descriptor for U
+    rocsparse_create_mat_descr(&descr_U_);
+    rocsparse_set_mat_fill_mode(descr_U_, rocsparse_fill_mode_upper);
+    rocsparse_set_mat_diag_type(descr_U_, rocsparse_diag_type_non_unit);
+
+    // Create matrix info structure
+    rocsparse_create_mat_info(&info_A_);
+
+    size_t buffer_size_A;
+    size_t buffer_size_L;
+    size_t buffer_size_U;
+
+    status_rocsparse_ = rocsparse_dcsrilu0_buffer_size(workspace_->getRocsparseHandle(), 
+                                                       n, 
+                                                       nnz, 
+                                                       descr_A_,
+                                                       d_ILU_vals_, //vals_, 
+                                                       A_->getRowData(ReSolve::memory::DEVICE), 
+                                                       A_->getColData(ReSolve::memory::DEVICE), 
+                                                       info_A_, 
+                                                       &buffer_size_A);
+    if (status_rocsparse_ != 0) { 
+        io::Logger::warning() << "Buffer size estimate for ILU0 failed with code: " <<status_rocsparse_<<" \n"; 
+    } 
+
+
+    error_sum += status_rocsparse_;
+    status_rocsparse_ = rocsparse_dcsrsv_buffer_size(workspace_->getRocsparseHandle(), 
+                                                     rocsparse_operation_none, 
+                                                     n, 
+                                                     nnz, 
+                                                     descr_L_,
+                                                     d_ILU_vals_, //vals_, 
+                                                     A_->getRowData(ReSolve::memory::DEVICE), 
+                                                     A_->getColData(ReSolve::memory::DEVICE), 
+                                                     info_A_, 
+                                                     &buffer_size_L);
+    if (status_rocsparse_ != 0) { 
+        io::Logger::warning() << "Buffer size estimate for L solve failed with code: " <<status_rocsparse_<<" \n"; 
+    } 
+
+    error_sum += status_rocsparse_;
+
+    status_rocsparse_ = rocsparse_dcsrsv_buffer_size(workspace_->getRocsparseHandle(), 
+                                                     rocsparse_operation_none, 
+                                                     n, 
+                                                     nnz, 
+                                                     descr_U_,
+                                                     d_ILU_vals_, //vals_, 
+                                                     A_->getRowData(ReSolve::memory::DEVICE), 
+                                                     A_->getColData(ReSolve::memory::DEVICE), 
+                                                     info_A_,
+                                                     &buffer_size_U);
+    if (status_rocsparse_ != 0) { 
+        io::Logger::warning() << "Buffer size estimate for U solve failed with code: " <<status_rocsparse_<<" \n"; 
+    } 
+    error_sum += status_rocsparse_;
+
+    size_t buffer_size = std::max(buffer_size_A, std::max(buffer_size_L, buffer_size_U));
+
+    mem_.allocateBufferOnDevice(&buffer_, buffer_size);
+    
+    // Now analysis
+    status_rocsparse_ = rocsparse_dcsrilu0_analysis(workspace_->getRocsparseHandle(), 
+                                                    n, 
+                                                    nnz, 
+                                                    descr_A_,
+                                                    d_ILU_vals_, //vals_, 
+                                                    A_->getRowData(ReSolve::memory::DEVICE), 
+                                                    A_->getColData(ReSolve::memory::DEVICE), 
+                                                    info_A_,
+                                                    rocsparse_analysis_policy_reuse,
+                                                    rocsparse_solve_policy_auto,
+                                                    buffer_);
+
+    if (status_rocsparse_ != 0) { 
+        io::Logger::warning() << "ILU0 decomposition analysis failed with code: " <<status_rocsparse_<<" \n"; 
+    } 
+    error_sum += status_rocsparse_;
+
+    status_rocsparse_ = rocsparse_dcsrsv_analysis(workspace_->getRocsparseHandle(), 
+                                                  rocsparse_operation_none, 
+                                                  n, 
+                                                  nnz, 
+                                                  descr_L_,
+                                                  d_ILU_vals_, //vals_, 
+                                                  A_->getRowData(ReSolve::memory::DEVICE), 
+                                                  A_->getColData(ReSolve::memory::DEVICE), 
+                                                  info_A_,   
+                                                  rocsparse_analysis_policy_reuse,
+                                                  rocsparse_solve_policy_auto,
+                                                  buffer_);
+    if (status_rocsparse_ != 0) { 
+        io::Logger::warning() << "Solve analysis for L solve failed with code: " <<status_rocsparse_<<" \n"; 
+    } 
+    error_sum += status_rocsparse_;
+
+
+    status_rocsparse_ = rocsparse_dcsrsv_analysis(workspace_->getRocsparseHandle(), 
+                                                  rocsparse_operation_none, 
+                                                  n, 
+                                                  nnz, 
+                                                  descr_U_,
+                                                  d_ILU_vals_, //vals_, 
+                                                  A_->getRowData(ReSolve::memory::DEVICE), 
+                                                  A_->getColData(ReSolve::memory::DEVICE), 
+                                                  info_A_,
+                                                   rocsparse_analysis_policy_reuse,
+                                                  rocsparse_solve_policy_auto,
+                                                  buffer_);
+    if (status_rocsparse_ != 0) { 
+        io::Logger::warning() << "Solve analysis for U solve failed with code: " <<status_rocsparse_<<" \n"; 
+    } 
+
+    error_sum += status_rocsparse_;
+    //allocate aux data
+
+    // and now the actual decomposition
+
+    // Compute incomplete LU factorization
+
+    status_rocsparse_ = rocsparse_dcsrilu0(workspace_->getRocsparseHandle(), 
+                                           n, 
+                                           nnz, 
+                                           descr_A_,
+                                           d_ILU_vals_, //vals_ 
+                                           A_->getRowData(ReSolve::memory::DEVICE), 
+                                           A_->getColData(ReSolve::memory::DEVICE), 
+                                           info_A_,
+                                           rocsparse_solve_policy_auto,
+                                           buffer_);
+    if (status_rocsparse_ != 0) { 
+        io::Logger::warning() << "ILU0 decomposition failed with code: " <<status_rocsparse_<<" \n"; 
+    } 
+
+    error_sum += status_rocsparse_;
+
+    mem_.allocateArrayOnDevice(&d_aux1_,n); 
+    return error_sum;
+  }
+
+  int LinSolverDirectRocSparseILU0::reset(matrix::Sparse* A)
+  {
+    int error_sum = 0;
+    this->A_ = A;
+    index_type n = A_->getNumRows();
+    index_type nnz = A_->getNnzExpanded();
+    mem_.copyArrayDeviceToDevice(d_ILU_vals_, A_->getValues(ReSolve::memory::DEVICE), nnz);
+
+    status_rocsparse_ = rocsparse_dcsrilu0(workspace_->getRocsparseHandle(), 
+                                           n, 
+                                           nnz, 
+                                           descr_A_,
+                                           d_ILU_vals_, //vals_, 
+                                           A_->getRowData(ReSolve::memory::DEVICE), 
+                                           A_->getColData(ReSolve::memory::DEVICE), 
+                                           info_A_,
+                                           rocsparse_solve_policy_auto,
+                                           buffer_);
+
+    error_sum += status_rocsparse_;
+
+    return error_sum;
+  }
+  // solution is returned in RHS
+  int LinSolverDirectRocSparseILU0::solve(vector_type* rhs)
+  {
+    int error_sum = 0;
+    status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), 
+                                               rocsparse_operation_none,
+                                               A_->getNumRows(),
+                                               A_->getNnzExpanded(), 
+                                               &(constants::ONE), 
+                                               descr_L_,
+                                               d_ILU_vals_, //vals_, 
+                                               A_->getRowData(ReSolve::memory::DEVICE), 
+                                               A_->getColData(ReSolve::memory::DEVICE), 
+                                               info_A_,
+                                               rhs->getData(ReSolve::memory::DEVICE),
+                                               d_aux1_, //result
+                                               rocsparse_solve_policy_auto,
+                                               buffer_);
+    error_sum += status_rocsparse_;
+
+    status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), 
+                                               rocsparse_operation_none,
+                                               A_->getNumRows(),
+                                               A_->getNnzExpanded(), 
+                                               &(constants::ONE), 
+                                               descr_U_,
+                                               d_ILU_vals_, //vals_, 
+                                               A_->getRowData(ReSolve::memory::DEVICE), 
+                                               A_->getColData(ReSolve::memory::DEVICE), 
+                                               info_A_,
+                                               d_aux1_, //input
+                                               rhs->getData(ReSolve::memory::DEVICE),//result
+                                               rocsparse_solve_policy_auto,
+                                               buffer_);
+    error_sum += status_rocsparse_;
+    rhs->setDataUpdated(ReSolve::memory::DEVICE);
+
+    return error_sum;
+  }
+
+  int LinSolverDirectRocSparseILU0::solve(vector_type* rhs, vector_type* x)
+  {
+    int error_sum = 0;
+
+
+      
+          status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), 
+                                               rocsparse_operation_none,
+                                               A_->getNumRows(),
+                                               A_->getNnzExpanded(), 
+                                               &(constants::ONE), 
+                                               descr_L_,
+                                               d_ILU_vals_, //vals_, 
+                                               A_->getRowData(ReSolve::memory::DEVICE), 
+                                               A_->getColData(ReSolve::memory::DEVICE), 
+                                               info_A_,
+                                               rhs->getData(ReSolve::memory::DEVICE),
+                                               d_aux1_, //result
+                                               rocsparse_solve_policy_auto,
+                                               buffer_);
+    error_sum += status_rocsparse_;
+
+    status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), 
+                                               rocsparse_operation_none,
+                                               A_->getNumRows(),
+                                               A_->getNnzExpanded(), 
+                                               &(constants::ONE), 
+                                               descr_U_,
+                                               d_ILU_vals_, //vals_, 
+                                               A_->getRowData(ReSolve::memory::DEVICE), 
+                                               A_->getColData(ReSolve::memory::DEVICE), 
+                                               info_A_,
+                                               d_aux1_, //input
+                                               x->getData(ReSolve::memory::DEVICE),//result
+                                               rocsparse_solve_policy_auto,
+                                               buffer_);
+    error_sum += status_rocsparse_;
+    
+    x->setDataUpdated(ReSolve::memory::DEVICE);
+
+    return error_sum;
+  }
+}// namespace resolve
diff --git a/resolve/LinSolverDirectRocSparseILU0.hpp b/resolve/LinSolverDirectRocSparseILU0.hpp
new file mode 100644
index 000000000..789e8e2cc
--- /dev/null
+++ b/resolve/LinSolverDirectRocSparseILU0.hpp
@@ -0,0 +1,65 @@
+
+#pragma once
+#include "Common.hpp"
+#include "LinSolver.hpp"
+#include <resolve/MemoryUtils.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+
+#include <rocsparse/rocsparse.h>
+//#include <rocblas/rocblas.h>
+#include <rocsolver/rocsolver.h>
+#include <hip/hip_runtime.h>
+namespace ReSolve 
+{
+  // Forward declaration of vector::Vector class
+  namespace vector
+  {
+    class Vector;
+  }
+
+  // Forward declaration of matrix::Sparse class
+  namespace matrix
+  {
+    class Sparse;
+  }
+
+  class LinSolverDirectRocSparseILU0 : public LinSolverDirect 
+  {
+    using vector_type = vector::Vector;
+    
+    public: 
+      LinSolverDirectRocSparseILU0(LinAlgWorkspaceHIP* workspace);
+      ~LinSolverDirectRocSparseILU0();
+      
+      int setup(matrix::Sparse* A,
+                matrix::Sparse* L = nullptr,
+                matrix::Sparse* U = nullptr,
+                index_type*     P = nullptr,
+                index_type*     Q = nullptr,
+                vector_type* rhs  = nullptr);
+      // if values of A change, but the nnz pattern does not, redo the analysis only (reuse buffers though)
+      int reset(matrix::Sparse* A);
+       
+      int solve(vector_type* rhs, vector_type* x);
+      int solve(vector_type* rhs);// the solutuon is returned IN RHS (rhs is overwritten)
+    
+
+    private:
+      rocsparse_status status_rocsparse_;
+
+      MemoryHandler mem_; ///< Device memory manager object
+      LinAlgWorkspaceHIP* workspace_; 
+
+      rocsparse_mat_descr descr_A_{nullptr};
+      rocsparse_mat_descr descr_L_{nullptr};
+      rocsparse_mat_descr descr_U_{nullptr};
+
+      rocsparse_mat_info  info_A_{nullptr};
+      
+      void* buffer_; 
+      
+      real_type* d_aux1_;
+      // since ILU OVERWRITES THE MATRIX values, we need a buffer to keep the values of ILU decomposition. 
+      real_type* d_ILU_vals_;
+  };
+}// namespace
diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp
index 40fdb22c1..c0dcf48d5 100644
--- a/resolve/LinSolverIterativeFGMRES.cpp
+++ b/resolve/LinSolverIterativeFGMRES.cpp
@@ -19,6 +19,7 @@ namespace ReSolve
     maxit_= 100; //default
     restart_ = 10;
     conv_cond_ = 0;//default
+    flexible_ = 1;
 
     d_V_ = nullptr;
     d_Z_ = nullptr;
@@ -38,6 +39,7 @@ namespace ReSolve
     maxit_= 100; //default
     restart_ = 10;
     conv_cond_ = 0;//default
+    flexible_ = 1;
 
     d_V_ = nullptr;
     d_Z_ = nullptr;
@@ -61,6 +63,7 @@ namespace ReSolve
     maxit_= maxit; 
     restart_ = restart;
     conv_cond_ = conv_cond;
+    flexible_ = 1;
 
     d_V_ = nullptr;
     d_Z_ = nullptr;
@@ -88,7 +91,12 @@ namespace ReSolve
 
     d_V_ = new vector_type(n_, restart_ + 1);
     d_V_->allocate(memory::DEVICE);      
-    d_Z_ = new vector_type(n_, restart_ + 1);
+    if (flexible_) {
+      d_Z_ = new vector_type(n_, restart_ + 1);
+    } else {
+      // otherwise Z is just a one vector, not multivector and we dont keep it
+      d_Z_ = new vector_type(n_);
+    }
     d_Z_->allocate(memory::DEVICE);      
     h_H_  = new real_type[restart_ * (restart_ + 1)];
     h_c_  = new real_type[restart_];      // needed for givens
@@ -127,7 +135,6 @@ namespace ReSolve
     rnorm = 0.0;
     bnorm = vector_handler_->dot(rhs, rhs, memspace_);
     rnorm = vector_handler_->dot(d_V_, d_V_, memspace_);
-
     //rnorm = ||V_1||
     rnorm = sqrt(rnorm);
     bnorm = sqrt(bnorm);
@@ -173,9 +180,12 @@ namespace ReSolve
         it++;
 
         // Z_i = (LU)^{-1}*V_i
-
         vec_v->setData( d_V_->getVectorData(i, memory::DEVICE), memory::DEVICE);
-        vec_z->setData( d_Z_->getVectorData(i, memory::DEVICE), memory::DEVICE);
+        if (flexible_) {
+          vec_z->setData( d_Z_->getVectorData(i, memory::DEVICE), memory::DEVICE);
+        } else {
+          vec_z->setData( d_Z_->getVectorData(0, memory::DEVICE), memory::DEVICE);
+        }
         this->precV(vec_v, vec_z);
         mem_.deviceSynchronize();
 
@@ -234,9 +244,24 @@ namespace ReSolve
       }
 
       // get solution
-      for(j = 0; j <= i; j++) {
-        vec_z->setData( d_Z_->getVectorData(j, memory::DEVICE), memory::DEVICE);
-        vector_handler_->axpy(&h_rs_[j], vec_z, x, memspace_);
+      if (flexible_) {
+        for(j = 0; j <= i; j++) {
+          vec_z->setData( d_Z_->getVectorData(j, memory::DEVICE), memory::DEVICE);
+          vector_handler_->axpy(&h_rs_[j], vec_z, x, memspace_);
+        }
+      } else {
+        mem_.setZeroArrayOnDevice(d_Z_->getData(memory::DEVICE), d_Z_->getSize());
+        vec_z->setData( d_Z_->getVectorData(0, memory::DEVICE), memory::DEVICE);
+        for(j = 0; j <= i; j++) {
+          vec_v->setData( d_V_->getVectorData(j, memory::DEVICE), memory::DEVICE);
+          vector_handler_->axpy(&h_rs_[j], vec_v, vec_z, memspace_);
+        }
+        // now multiply d_Z by precon
+
+        vec_v->setData( d_V_->getData(memory::DEVICE), memory::DEVICE);
+        this->precV(vec_z, vec_v);
+        // and add to x 
+        vector_handler_->axpy(&ONE, vec_v, x, memspace_);
       }
 
       /* test solution */
@@ -292,6 +317,11 @@ namespace ReSolve
     return conv_cond_;
   }
 
+  bool  LinSolverIterativeFGMRES::getFlexible()
+  {
+    return flexible_;
+  }
+
   void  LinSolverIterativeFGMRES::setTol(real_type new_tol)
   {
     this->tol_ = new_tol;
@@ -311,6 +341,11 @@ namespace ReSolve
   {
     this->conv_cond_ = new_conv_cond;
   }
+  
+  void  LinSolverIterativeFGMRES::setFlexible(bool new_flex)
+  {
+    this->flexible_ = new_flex;
+  }
 
   int  LinSolverIterativeFGMRES::resetMatrix(matrix::Sparse* new_matrix)
   {
diff --git a/resolve/LinSolverIterativeFGMRES.hpp b/resolve/LinSolverIterativeFGMRES.hpp
index a9fc5058d..c9c140f9a 100644
--- a/resolve/LinSolverIterativeFGMRES.hpp
+++ b/resolve/LinSolverIterativeFGMRES.hpp
@@ -37,11 +37,13 @@ namespace ReSolve
     index_type getMaxit();
     index_type getRestart();
     index_type getConvCond();
+    bool getFlexible();
 
     void setTol(real_type new_tol);
     void setMaxit(index_type new_maxit);
     void setRestart(index_type new_restart);
     void setConvCond(index_type new_conv_cond);
+    void setFlexible(bool new_flexible);
 
     real_type getFinalResidualNorm();
     real_type getInitResidualNorm();
@@ -57,7 +59,7 @@ namespace ReSolve
     index_type restart_;
     std::string orth_option_;
     index_type conv_cond_;
-
+    bool flexible_{true}; // if can be run as "normal" GMRES if needed, set flexible_ to false. Default is true of course.
     vector_type* d_V_{nullptr};
     vector_type* d_Z_{nullptr};
 
diff --git a/resolve/LinSolverIterativeRandFGMRES.cpp b/resolve/LinSolverIterativeRandFGMRES.cpp
new file mode 100644
index 000000000..40cd08acc
--- /dev/null
+++ b/resolve/LinSolverIterativeRandFGMRES.cpp
@@ -0,0 +1,502 @@
+#include <iostream>
+#include <cassert>
+#include <cmath>
+#include <iomanip>
+
+#include <resolve/utilities/logger/Logger.hpp>
+#include <resolve/matrix/MatrixHandler.hpp>
+#include "LinSolverIterativeRandFGMRES.hpp"
+#include <resolve/RandSketchingCountSketch.hpp> 
+#include <resolve/RandSketchingFWHT.hpp> 
+
+namespace ReSolve
+{
+  using out = io::Logger;
+ 
+  LinSolverIterativeRandFGMRES::LinSolverIterativeRandFGMRES(std::string memspace)
+  {
+    memspace_ = memspace;
+    this->matrix_handler_ = nullptr;
+    this->vector_handler_ = nullptr;
+    this->rand_method_ = cs; 
+
+    tol_ = 1e-12; //default
+    maxit_= 100; //default
+    restart_ = 10;
+    conv_cond_ = 0;//default
+    flexible_ = 1;
+
+    d_V_ = nullptr;
+    d_Z_ = nullptr;
+  }
+
+  LinSolverIterativeRandFGMRES::LinSolverIterativeRandFGMRES(MatrixHandler* matrix_handler,
+                                                             VectorHandler* vector_handler, 
+                                                             SketchingMethod rand_method, 
+                                                             GramSchmidt*   gs,
+                                                             std::string memspace)
+  {
+    memspace_ = memspace;
+    this->matrix_handler_ = matrix_handler;
+    this->vector_handler_ = vector_handler;
+    this->rand_method_ = rand_method;
+    this->GS_ = gs;
+
+    tol_ = 1e-14; //default
+    maxit_= 100; //default
+    restart_ = 10;
+    conv_cond_ = 0;//default
+    flexible_ = 1;
+
+    d_V_ = nullptr;
+    d_Z_ = nullptr;
+  }
+
+  LinSolverIterativeRandFGMRES::LinSolverIterativeRandFGMRES(index_type restart, 
+                                                             real_type  tol,
+                                                             index_type maxit,
+                                                             index_type conv_cond,
+                                                             MatrixHandler* matrix_handler,
+                                                             VectorHandler* vector_handler,
+                                                             SketchingMethod rand_method, 
+                                                             GramSchmidt*   gs,
+                                                             std::string memspace)
+  {
+    memspace_ = memspace;
+    this->matrix_handler_ = matrix_handler;
+    this->vector_handler_ = vector_handler;
+    this->rand_method_ = rand_method;
+    this->GS_ = gs;
+
+    tol_ = tol; 
+    maxit_= maxit; 
+    restart_ = restart;
+    conv_cond_ = conv_cond;
+    flexible_ = 1;
+
+    d_V_ = nullptr;
+    d_Z_ = nullptr;
+
+  }
+
+  LinSolverIterativeRandFGMRES::~LinSolverIterativeRandFGMRES()
+  {
+    if (d_V_ != nullptr) {
+      // cudaFree(d_V_);
+      delete d_V_;   
+    }
+
+    if (d_Z_ != nullptr) {
+      //      cudaFree(d_Z_);
+      delete d_Z_;   
+    }
+
+  }
+
+  int LinSolverIterativeRandFGMRES::setup(matrix::Sparse* A)
+  {
+    this->A_ = A;
+    n_ = A_->getNumRows();
+
+    d_V_ = new vector_type(n_, restart_ + 1);
+    d_V_->allocate(memory::DEVICE);      
+    if (flexible_) {
+      d_Z_ = new vector_type(n_, restart_ + 1);
+    } else {
+      // otherwise Z is just a one vector, not multivector and we dont keep it
+      d_Z_ = new vector_type(n_);
+    }
+    d_Z_->allocate(memory::DEVICE);      
+    h_H_  = new real_type[restart_ * (restart_ + 1)];
+    h_c_  = new real_type[restart_];      // needed for givens
+    h_s_  = new real_type[restart_];      // same
+    h_rs_ = new real_type[restart_ + 1]; // for residual norm history
+
+    mem_.allocateArrayOnDevice(&d_aux_, restart_ + 1); 
+    // rand method
+    k_rand_ = n_;
+    switch(rand_method_) {
+      case cs:
+        if(ceil(restart_ * log(n_)) < k_rand_) {
+          k_rand_ = static_cast<index_type>(std::ceil(restart_ * std::log(static_cast<real_type>(n_))));
+        }
+        rand_manager_ = new RandSketchingCountSketch();
+        //set k and n 
+        break;
+      case fwht:
+        if ( ceil(2.0 * restart_ * log(n_) / log(restart_)) < k_rand_) {
+          k_rand_ = static_cast<index_type>(std::ceil(2.0 * restart_ * std::log(n_) / std::log(restart_)));
+        }
+        rand_manager_ = new RandSketchingFWHT();
+        break;
+      default:
+        io::Logger::warning() << "Wrong sketching method, setting to default (CountSketch)\n"; 
+        rand_method_ = cs;
+        if(ceil(restart_ * log(n_)) < k_rand_) {
+          k_rand_ = static_cast<index_type>(std::ceil(restart_ * std::log(n_)));
+        }
+        rand_manager_ = new RandSketchingCountSketch();
+        break;
+    }
+
+    rand_manager_->setup(n_, k_rand_); 
+
+    one_over_k_ = 1.0 / sqrt((real_type) k_rand_);
+
+    d_S_ = new vector_type(k_rand_, restart_ + 1);
+    d_S_->allocate(memory::DEVICE);      
+    if (rand_method_ == cs) {
+      d_S_->setToZero(memory::DEVICE);
+    }
+    return 0;
+  }
+
+  int  LinSolverIterativeRandFGMRES::solve(vector_type* rhs, vector_type* x)
+  {
+    using namespace constants;
+
+    //io::Logger::setVerbosity(io::Logger::EVERYTHING);
+    
+    int outer_flag = 1;
+    int notconv = 1; 
+    int i = 0;
+    int it = 0;
+    int j;
+    int k;
+    int k1;
+
+    real_type t;
+    real_type rnorm;
+    real_type bnorm;
+    // real_type rnorm_aux;
+    real_type tolrel;
+    vector_type* vec_v = new vector_type(n_);
+    vector_type* vec_z = new vector_type(n_);
+    vector_type* vec_s = new vector_type(k_rand_);
+    //V[0] = b-A*x_0
+    //debug
+    d_Z_->setToZero(memory::DEVICE);
+    d_V_->setToZero(memory::DEVICE);
+
+    rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE);  
+    matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE, "csr", memspace_); 
+
+    vec_v->setData( d_V_->getVectorData(0, memory::DEVICE), memory::DEVICE);
+    vec_s->setData( d_S_->getVectorData(0, memory::DEVICE), memory::DEVICE);
+
+    rand_manager_->Theta(vec_v, vec_s);
+
+    if (rand_method_ == fwht){
+      //  cublasDscal(cublas_handle, k_rand, &oneOverK, d_S, 1); 
+      vector_handler_->scal(&one_over_k_, vec_s, memspace_);
+    }
+    mem_.deviceSynchronize();
+
+    rnorm = 0.0;
+    bnorm = vector_handler_->dot(rhs, rhs, memspace_);
+    rnorm = vector_handler_->dot(vec_s, vec_s, memspace_);
+    // double rnorm_true = vector_handler_->dot(vec_v, vec_v, memspace_);
+    //rnorm = ||V_1||
+    rnorm = sqrt(rnorm);
+    bnorm = sqrt(bnorm);
+    io::Logger::misc() << "it 0: norm of residual "
+                       << std::scientific << std::setprecision(16) 
+                       << rnorm << " Norm of rhs: " << bnorm << "\n";
+    initial_residual_norm_ = rnorm;
+    while(outer_flag) {
+      // check if maybe residual is already small enough?
+      if(it == 0) {
+        tolrel = tol_ * rnorm;
+        if(fabs(tolrel) < 1e-16) {
+          tolrel = 1e-16;
+        }
+      }
+      int exit_cond = 0;
+      if (conv_cond_ == 0) {
+        exit_cond =  ((fabs(rnorm - ZERO) <= EPSILON));
+      } else if (conv_cond_ == 1) {
+        exit_cond =  ((fabs(rnorm - ZERO) <= EPSILON) || (rnorm < tol_));
+      } else if (conv_cond_ == 2) {
+        exit_cond =  ((fabs(rnorm - ZERO) <= EPSILON) || (rnorm < (tol_*bnorm)));
+      }
+      if (exit_cond) {
+        outer_flag = 0;
+        final_residual_norm_ = rnorm;
+        initial_residual_norm_ = rnorm;
+        fgmres_iters_ = 0;
+        break;
+      }
+
+      // normalize first vector
+      t = 1.0 / rnorm;
+      vector_handler_->scal(&t, d_V_, memspace_);
+      vector_handler_->scal(&t, d_S_, memspace_);
+
+      mem_.deviceSynchronize();
+      // initialize norm history
+      h_rs_[0] = rnorm;
+      i = -1;
+      notconv = 1;
+
+      while((notconv) && (it < maxit_)) {
+        i++;
+        it++;
+
+        // Z_i = (LU)^{-1}*V_i
+        vec_v->setData( d_V_->getVectorData(i, memory::DEVICE), memory::DEVICE);
+        if (flexible_) {
+          vec_z->setData( d_Z_->getVectorData(i, memory::DEVICE), memory::DEVICE);
+        } else {
+          vec_z->setData( d_Z_->getVectorData(0, memory::DEVICE), memory::DEVICE);
+        }
+        this->precV(vec_v, vec_z);
+
+        mem_.deviceSynchronize();
+
+        // V_{i+1}=A*Z_i
+
+        vec_v->setData( d_V_->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
+
+        matrix_handler_->matvec(A_, vec_z, vec_v, &ONE, &ZERO,"csr", memspace_); 
+
+        // orthogonalize V[i+1], form a column of h_H_
+        // this is where it differs from normal solver GS
+        vec_s->setData( d_S_->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
+        rand_manager_->Theta(vec_v, vec_s); 
+        if (rand_method_ == fwht){
+          //  cublasDscal(cublas_handle, k_rand, &oneOverK, d_S, 1); 
+          vector_handler_->scal(&one_over_k_, vec_s, memspace_);
+        }
+        mem_.deviceSynchronize();
+        GS_->orthogonalize(k_rand_, d_S_, h_H_, i, memspace_);  
+        // now post-process
+        //checkCudaErrors(cudaMemcpy(d_Hcolumn, &h_H[i * (restart + 1)], sizeof(double) * (i + 1), cudaMemcpyHostToDevice));
+        mem_.copyArrayHostToDevice(d_aux_, &h_H_[i * (restart_ + 1)], i + 2);
+        vec_z->setData(d_aux_, memory::DEVICE);
+        vec_z->setCurrentSize(i + 1);
+        //V(:, i+1) =w-V(:, 1:i)*d_H_col = V(:, i+1)-d_H_col * V(:,1:i); 
+        //checkCudaErrors( cublasDgemv(cublas_handle, CUBLAS_OP_N, n, i + 1, &minusone, d_V, n, d_Hcolumn, 1,&one , &d_V[n * (i + 1)], 1));
+
+        vector_handler_->gemv("N", n_, i + 1, &MINUSONE, &ONE, d_V_, vec_z, vec_v, memspace_ );  
+
+        vec_z->setCurrentSize(n_);
+        t = 1.0 / h_H_[i * (restart_ + 1) + i + 1];
+        vector_handler_->scal(&t, vec_v, memspace_);  
+        mem_.deviceSynchronize();
+        vec_s->setData( d_S_->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
+
+        if (i != 0) {
+          for (int k = 1; k <= i; k++) {
+            k1 = k - 1;
+            t = h_H_[i * (restart_ + 1) + k1];
+            h_H_[i * (restart_ + 1) + k1] = h_c_[k1] * t + h_s_[k1] * h_H_[i * (restart_ + 1) + k];
+            h_H_[i * (restart_ + 1) + k] = -h_s_[k1] * t + h_c_[k1] * h_H_[i * (restart_ + 1) + k];
+          }
+        } // if i!=0
+        double Hii = h_H_[i * (restart_ + 1) + i];
+        double Hii1 = h_H_[(i) * (restart_ + 1) + i + 1];
+        double gam = sqrt(Hii * Hii + Hii1 * Hii1);
+
+        if(fabs(gam - ZERO) <= EPSILON) {
+          gam = EPSMAC;
+        }
+
+        /* next Given's rotation */
+        h_c_[i] = Hii / gam;
+        h_s_[i] = Hii1 / gam;
+        h_rs_[i + 1] = -h_s_[i] * h_rs_[i];
+        h_rs_[i] = h_c_[i] * h_rs_[i];
+
+        h_H_[(i) * (restart_ + 1) + (i)] = h_c_[i] * Hii + h_s_[i] * Hii1;
+        h_H_[(i) * (restart_ + 1) + (i + 1)] = h_c_[i] * Hii1 - h_s_[i] * Hii;
+
+        // residual norm estimate
+        rnorm = fabs(h_rs_[i + 1]);
+        
+        io::Logger::misc() << "it: "<<it<< " --> norm of the residual "
+                           << std::scientific << std::setprecision(16)
+                           << rnorm << "\n";
+        // check convergence
+        if (i + 1 >= restart_ || rnorm <= tolrel || it >= maxit_) {
+          notconv = 0;
+        }
+      } // inner while
+
+      io::Logger::misc() << "End of cycle, ESTIMATED norm of residual "
+                         << std::scientific << std::setprecision(16)
+                         << rnorm << "\n";
+      // solve tri system
+      h_rs_[i] = h_rs_[i] / h_H_[i * (restart_ + 1) + i];
+      for (int ii = 2; ii <= i + 1; ii++) {
+        k = i - ii + 1;
+        k1 = k + 1;
+        t = h_rs_[k];
+        for(j = k1; j <= i; j++) {
+          t -= h_H_[j * (restart_ + 1) + k] * h_rs_[j];
+        }
+        h_rs_[k] = t / h_H_[k * (restart_ + 1) + k];
+      }
+
+      // get solution
+      if (flexible_) {
+        for (j = 0; j <= i; j++) {
+          vec_z->setData( d_Z_->getVectorData(j, memory::DEVICE), memory::DEVICE);
+          vector_handler_->axpy(&h_rs_[j], vec_z, x, memspace_);
+        }
+      } else {
+        mem_.setZeroArrayOnDevice(d_Z_->getData(memory::DEVICE), d_Z_->getSize());
+        vec_z->setData( d_Z_->getVectorData(0, memory::DEVICE), memory::DEVICE);
+        for(j = 0; j <= i; j++) {
+          vec_v->setData( d_V_->getVectorData(j, memory::DEVICE), memory::DEVICE);
+          vector_handler_->axpy(&h_rs_[j], vec_v, vec_z, memspace_);
+
+        }
+        // now multiply d_Z by precon
+
+        vec_v->setData( d_V_->getData(memory::DEVICE), memory::DEVICE);
+        this->precV(vec_z, vec_v);
+        // and add to x 
+        vector_handler_->axpy(&ONE, vec_v, x, memspace_);
+      }
+
+      /* test solution */
+      if(rnorm <= tolrel || it >= maxit_) {
+        // rnorm_aux = rnorm;
+        outer_flag = 0;
+      }
+
+      rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE);  
+      matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE,"csr", memspace_); 
+      if (outer_flag) {
+
+        rand_manager_->reset();
+
+        if (rand_method_ == cs) {
+          mem_.setZeroArrayOnDevice(d_S_->getData(memory::DEVICE), d_S_->getSize() * d_S_->getNumVectors());
+        }
+        vec_v->setData( d_V_->getVectorData(0, memory::DEVICE), memory::DEVICE);
+        vec_s->setData( d_S_->getVectorData(0, memory::DEVICE), memory::DEVICE);
+        rand_manager_->Theta(vec_v, vec_s);
+        if (rand_method_ == fwht){
+          //  cublasDscal(cublas_handle, k_rand, &oneOverK, d_S, 1); 
+          vector_handler_->scal(&one_over_k_, vec_s, memspace_);
+        }
+        mem_.deviceSynchronize();
+        rnorm = vector_handler_->dot(d_S_, d_S_, memspace_);
+        // rnorm = ||S_0||
+        rnorm = sqrt(rnorm);
+      }
+
+      if (!outer_flag) {
+        rnorm = vector_handler_->dot(d_V_, d_V_, memspace_);
+        // rnorm = ||V_0||
+        rnorm = sqrt(rnorm);
+        
+        io::Logger::misc() << "End of cycle, COMPUTED norm of residual "
+                          << std::scientific << std::setprecision(16)
+                          << rnorm << "\n";
+        
+        final_residual_norm_ = rnorm;
+        fgmres_iters_ = it;
+      }
+    } // outer while
+    return 0;
+  }
+
+  int  LinSolverIterativeRandFGMRES::setupPreconditioner(std::string type, LinSolverDirect* LU_solver)
+  {
+    if (type != "LU") {
+      out::warning() << "Only cusolverRf tri solve can be used as a preconditioner at this time." << std::endl;
+      return 1;
+    } else {
+      LU_solver_ = LU_solver;  
+      return 0;
+    }
+
+  }
+
+  real_type  LinSolverIterativeRandFGMRES::getTol()
+  {
+    return tol_;
+  }
+
+  index_type  LinSolverIterativeRandFGMRES::getMaxit()
+  {
+    return maxit_;
+  }
+
+  index_type  LinSolverIterativeRandFGMRES::getRestart()
+  {
+    return restart_;
+  }
+
+  index_type  LinSolverIterativeRandFGMRES::getConvCond()
+  {
+    return conv_cond_;
+  }
+
+  bool  LinSolverIterativeRandFGMRES::getFlexible()
+  {
+    return flexible_;
+  }
+
+  index_type  LinSolverIterativeRandFGMRES::getKrand()
+  {
+    return k_rand_;
+  }
+
+  void  LinSolverIterativeRandFGMRES::setTol(real_type new_tol)
+  {
+    this->tol_ = new_tol;
+  }
+
+  void  LinSolverIterativeRandFGMRES::setMaxit(index_type new_maxit)
+  {
+    this->maxit_ = new_maxit;
+  }
+
+  void  LinSolverIterativeRandFGMRES::setRestart(index_type new_restart)
+  {
+    this->restart_ = new_restart;
+  }
+
+  void  LinSolverIterativeRandFGMRES::setConvCond(index_type new_conv_cond)
+  {
+    this->conv_cond_ = new_conv_cond;
+  }
+
+  void  LinSolverIterativeRandFGMRES::setFlexible(bool new_flex)
+  {
+    this->flexible_ = new_flex;
+  }
+
+  int  LinSolverIterativeRandFGMRES::resetMatrix(matrix::Sparse* new_matrix)
+  {
+    A_ = new_matrix;
+    matrix_handler_->setValuesChanged(true, memspace_);
+    return 0;
+  }
+
+
+
+  void  LinSolverIterativeRandFGMRES::precV(vector_type* rhs, vector_type* x)
+  { 
+    LU_solver_->solve(rhs, x);
+    //  x->update(rhs->getData(memory::DEVICE), memory::DEVICE, memory::DEVICE);
+  }
+
+  real_type LinSolverIterativeRandFGMRES::getFinalResidualNorm()
+  {
+    return final_residual_norm_;
+  }
+
+  real_type LinSolverIterativeRandFGMRES::getInitResidualNorm()
+  {
+    return initial_residual_norm_;
+  }
+
+  index_type LinSolverIterativeRandFGMRES::getNumIter()
+  {
+    return fgmres_iters_;
+  }
+} // namespace ReSolve
diff --git a/resolve/LinSolverIterativeRandFGMRES.hpp b/resolve/LinSolverIterativeRandFGMRES.hpp
new file mode 100644
index 000000000..f779f257b
--- /dev/null
+++ b/resolve/LinSolverIterativeRandFGMRES.hpp
@@ -0,0 +1,102 @@
+#pragma once
+#include "Common.hpp"
+#include <resolve/matrix/Sparse.hpp>
+#include <resolve/vector/Vector.hpp>
+#include "LinSolver.hpp"
+#include "GramSchmidt.hpp"
+#include "RandSketchingManager.hpp"
+
+namespace ReSolve 
+{
+
+  class LinSolverIterativeRandFGMRES : public LinSolverIterative
+  {
+    using vector_type = vector::Vector;
+
+    public:
+   
+    enum SketchingMethod { cs = 0, // count sketch 
+                           fwht = 1}; // fast Walsh-Hadamard transform
+   
+    LinSolverIterativeRandFGMRES(std::string memspace = "cuda");
+
+    LinSolverIterativeRandFGMRES( MatrixHandler* matrix_handler,
+                                  VectorHandler* vector_handler,
+                                  SketchingMethod rand_method, 
+                                  GramSchmidt*   gs,
+                                  std::string memspace = "cuda");
+
+    LinSolverIterativeRandFGMRES(index_type restart,
+                                 real_type  tol,
+                                 index_type maxit,
+                                 index_type conv_cond,
+                                 MatrixHandler* matrix_handler,
+                                 VectorHandler* vector_handler, 
+                                 SketchingMethod rand_method, 
+                                 GramSchmidt*   gs,
+                                 std::string memspace = "cuda");
+    ~LinSolverIterativeRandFGMRES();
+
+    int solve(vector_type* rhs, vector_type* x);
+    int setup(matrix::Sparse* A);
+    int resetMatrix(matrix::Sparse* new_A); 
+    int setupPreconditioner(std::string name, LinSolverDirect* LU_solver);
+
+    real_type getTol();
+    index_type getMaxit();
+    index_type getRestart();
+    index_type getConvCond();
+    bool getFlexible();
+    std::string getRandSketchingMethod();
+    index_type getKrand();
+
+    void setTol(real_type new_tol);
+    void setMaxit(index_type new_maxit);
+    void setRestart(index_type new_restart);
+    void setConvCond(index_type new_conv_cond);
+    void setFlexible(bool new_flexible);
+    void getRandSketchingMethod(std::string rand_method);
+
+    real_type getFinalResidualNorm();
+    real_type getInitResidualNorm();
+    index_type getNumIter();
+
+    private:
+    //remember matrix handler and vector handler are inherited.
+
+    std::string memspace_;
+
+    real_type tol_;
+    index_type maxit_;
+    index_type restart_;
+    std::string orth_option_;
+    index_type conv_cond_;
+    bool flexible_{1}; // if can be run as "normal" GMRES if needed, set flexible_ to 0. Default is 1 of course.
+
+    vector_type* d_V_{nullptr};
+    vector_type* d_Z_{nullptr};
+    // for performing Gram-Schmidt
+    vector_type* d_S_{nullptr};
+
+    real_type* h_H_{nullptr};
+    real_type* h_c_{nullptr};
+    real_type* h_s_{nullptr};
+    real_type* h_rs_{nullptr};
+    real_type* d_aux_{nullptr};
+
+
+    GramSchmidt* GS_;     
+    void precV(vector_type* rhs, vector_type* x); //multiply the vector by preconditioner
+    LinSolverDirect* LU_solver_;
+    index_type n_;// for simplicity
+    real_type final_residual_norm_;
+    real_type initial_residual_norm_;
+    index_type fgmres_iters_;
+    real_type one_over_k_{1.0};
+
+    index_type k_rand_{0}; // size of sketch space, we need to know it so we can allocate S!
+    MemoryHandler mem_; ///< Device memory manager object
+    RandSketchingManager* rand_manager_;
+    SketchingMethod rand_method_; 
+  };
+}
diff --git a/resolve/RandSketchingCountSketch.cpp b/resolve/RandSketchingCountSketch.cpp
new file mode 100644
index 000000000..7ce615505
--- /dev/null
+++ b/resolve/RandSketchingCountSketch.cpp
@@ -0,0 +1,104 @@
+#include "RandSketchingCountSketch.hpp"
+#include <resolve/MemoryUtils.hpp>
+#include <resolve/vector/Vector.hpp>
+
+#ifdef RESOLVE_USE_HIP
+#include <resolve/hip/hipKernels.h>
+#endif
+#ifdef RESOLVE_USE_CUDA
+#include <resolve/cuda/cudaKernels.h>
+#endif
+#include <resolve/RandSketchingCountSketch.hpp> 
+
+namespace ReSolve 
+{
+  RandSketchingCountSketch::RandSketchingCountSketch()
+  {
+    h_labels_ = nullptr;
+    h_flip_ = nullptr;
+
+    d_labels_ = nullptr;
+    d_flip_ = nullptr;
+  }
+
+  // destructor
+  RandSketchingCountSketch::~RandSketchingCountSketch()
+  {
+    delete h_labels_;
+    delete h_flip_;
+    mem_.deleteOnDevice(d_labels_);
+    mem_.deleteOnDevice(d_flip_);
+  }
+
+  // Actual sketching process
+  int RandSketchingCountSketch::Theta(vector_type* input, vector_type* output)
+  {
+    mem_.deviceSynchronize();
+    count_sketch_theta(n_,
+                       k_rand_,
+                       d_labels_,
+                       d_flip_,
+                       input->getData(ReSolve::memory::DEVICE), 
+                       output->getData(ReSolve::memory::DEVICE));
+    mem_.deviceSynchronize();
+
+
+    return 0;
+  }
+
+  // Setup the parameters, sampling matrices, permuations, etc
+  int RandSketchingCountSketch::setup(index_type n, index_type k)
+  {
+    k_rand_ = k;
+    n_ = n;
+    srand(static_cast<unsigned>(time(nullptr)));
+    //allocate labeling scheme vector and move to GPU
+
+    h_labels_ = new int[n_];
+    //allocate sgn - a vector of flip signs
+    h_flip_  = new int[n_];
+
+    //populate labeling scheme (can be done on the gpu really)
+    //to be fixed, this can be done on the GPU
+    for (int i=0; i<n; ++i) {
+      h_labels_[i] = rand() % k_rand_;
+      int r = rand()%100;
+      if (r < 50){
+        h_flip_[i] = -1;
+      } else { 
+        h_flip_[i] = 1;
+      }
+    }
+
+    mem_.allocateArrayOnDevice(&d_labels_, n_); 
+    mem_.allocateArrayOnDevice(&d_flip_, n_); 
+
+    //then copy
+
+    mem_.copyArrayHostToDevice(d_labels_, h_labels_, n_);
+    mem_.copyArrayHostToDevice(d_flip_, h_flip_, n_);
+
+    mem_.deviceSynchronize();
+    return 0;
+  }
+
+  //to be fixed, this can be done on the GPU
+  int RandSketchingCountSketch::reset() // if needed can be reset (like when Krylov method restarts)
+  {
+    for (int i = 0; i < n_; ++i) {
+      h_labels_[i] = rand() % k_rand_;
+
+      int r = rand()%100;
+      if (r < 50){
+        h_flip_[i] = -1;
+      } else { 
+        h_flip_[i] = 1;
+      }
+    }
+    mem_.copyArrayHostToDevice(d_labels_, h_labels_, n_);
+    mem_.copyArrayHostToDevice(d_flip_, h_flip_, n_);
+
+    mem_.deviceSynchronize();
+    return 0;
+  }
+}
diff --git a/resolve/RandSketchingCountSketch.hpp b/resolve/RandSketchingCountSketch.hpp
new file mode 100644
index 000000000..a796f12cd
--- /dev/null
+++ b/resolve/RandSketchingCountSketch.hpp
@@ -0,0 +1,39 @@
+#pragma once
+#include <resolve/Common.hpp>
+#include <resolve/RandSketchingManager.hpp>
+
+namespace ReSolve {
+
+  // Forward declaration of vector::Vector class
+  namespace vector
+  {
+    class Vector;
+  }
+
+  class RandSketchingCountSketch : public RandSketchingManager
+  {
+
+    using vector_type = vector::Vector;
+    public: 
+
+      // constructor
+      RandSketchingCountSketch();
+
+      // destructor
+      virtual ~RandSketchingCountSketch();
+
+      // Actual sketching process
+      virtual int Theta(vector_type* input, vector_type* output);
+
+      // Setup the parameters, sampling matrices, permuations, etc
+      virtual int setup(index_type n, index_type k);
+      virtual int reset(); // if needed can be reset (like when Krylov method restarts)
+
+    private:
+      index_type* h_labels_;
+      index_type* h_flip_;
+
+      index_type* d_labels_;
+      index_type* d_flip_;
+  };
+}
diff --git a/resolve/RandSketchingFWHT.cpp b/resolve/RandSketchingFWHT.cpp
new file mode 100644
index 000000000..76e916740
--- /dev/null
+++ b/resolve/RandSketchingFWHT.cpp
@@ -0,0 +1,163 @@
+#include <cmath>
+#include <limits>
+
+#include "RandSketchingFWHT.hpp"
+#include <resolve/MemoryUtils.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/utilities/logger/Logger.hpp>
+#ifdef RESOLVE_USE_HIP
+#include <resolve/hip/hipKernels.h>
+#endif
+#ifdef RESOLVE_USE_CUDA
+#include <resolve/cuda/cudaKernels.h>
+#endif
+#include <resolve/RandSketchingFWHT.hpp> 
+namespace ReSolve 
+{
+  using out = io::Logger;
+
+  RandSketchingFWHT::RandSketchingFWHT()
+  {
+    h_seq_ = nullptr;
+    h_D_ = nullptr;
+    h_perm_ = nullptr;
+
+    d_D_ = nullptr;
+    d_perm_ = nullptr;
+    d_aux_ = nullptr; 
+  }
+
+  // destructor
+  RandSketchingFWHT::~RandSketchingFWHT()
+  {
+    delete h_seq_;
+    delete h_D_;
+    delete h_perm_;
+
+    mem_.deleteOnDevice(d_D_);
+    mem_.deleteOnDevice(d_perm_);
+    mem_.deleteOnDevice(d_aux_);
+  }
+
+  // Actual sketching process
+  int RandSketchingFWHT::Theta(vector_type* input, vector_type* output)
+  {
+    mem_.setZeroArrayOnDevice(d_aux_, N_);
+    FWHT_scaleByD(n_, 
+                  d_D_,
+                  input->getData(ReSolve::memory::DEVICE), 
+                  d_aux_);  
+
+    mem_.deviceSynchronize();
+    FWHT(1, log2N_, d_aux_);
+
+    mem_.deviceSynchronize();
+    FWHT_select(k_rand_, 
+                d_perm_, 
+                d_aux_, 
+                output->getData(ReSolve::memory::DEVICE)); 
+    mem_.deviceSynchronize();
+    // remember - scaling is the solver's problem 
+    return 0;
+  }
+
+  // Setup the parameters, sampling matrices, permuations, etc
+  int RandSketchingFWHT::setup(index_type n, index_type k)
+  {
+    k_rand_ = k;
+    n_ = n;
+    // pad to the nearest power of 2
+    real_type N_real = std::pow(2.0, std::ceil(std::log(n_)/std::log(2.0)));
+    if (N_real > static_cast<real_type>(std::numeric_limits<index_type>::max())) {
+      out::error() << "Exceeded numerical limits of index_type ...\n";
+      return 1;
+    }
+    N_ = static_cast<index_type>(N_real);
+    log2N_ = static_cast<index_type>(std::log2(N_real));
+    one_over_k_ = 1.0/std::sqrt(static_cast<real_type>(k_rand_));
+
+    srand(static_cast<unsigned>(time(nullptr)));
+
+    h_seq_  = new int[N_];
+    h_perm_  = new int[k_rand_];
+    h_D_  = new int[n_];
+
+    int r;
+    int temp;
+
+    for (int i = 0; i < N_; ++i) {
+      h_seq_[i] = i;
+    } 
+    //Fisher-Yates
+    for (int i = N_ - 1; i > 0; i--) {
+      r = rand() % i; 
+      temp = h_seq_[i];
+      h_seq_[i] = h_seq_[r];
+      h_seq_[r] = temp; 
+    }
+    for (int i = 0; i < k_rand_; ++i) {
+      h_perm_[i] = h_seq_[i];
+    }
+
+    // and D
+    for (int i = 0; i < n_; ++i){
+      r = rand() % 100;
+      if (r < 50){
+        h_D_[i] = -1;
+      } else { 
+        h_D_[i] = 1;
+      }
+    }
+
+    mem_.allocateArrayOnDevice(&d_perm_, k_rand_); 
+    mem_.allocateArrayOnDevice(&d_D_, n_); 
+    mem_.allocateArrayOnDevice(&d_aux_, N_); 
+
+    //then copy
+
+    mem_.copyArrayHostToDevice(d_perm_, h_perm_, k_rand_);
+    mem_.copyArrayHostToDevice(d_D_, h_D_, n_);
+
+    return 0;
+  }
+
+  //to be fixed, this can be done on the GPU
+  int RandSketchingFWHT::reset() // if needed can be reset (like when Krylov method restarts)
+  {
+    srand(static_cast<unsigned>(time(nullptr)));
+
+    int r;
+    int temp;
+
+    for (int i = 0; i < N_; ++i) {
+      h_seq_[i] = i;
+    } 
+    //Fisher-Yates
+    for (int i = N_ - 1; i > 0; i--) {
+      r = rand() % i; 
+      temp = h_seq_[i];
+      h_seq_[i] = h_seq_[r];
+      h_seq_[r] = temp; 
+    }
+    for (int i = 0; i < k_rand_; ++i) {
+      h_perm_[i] = h_seq_[i];
+    }
+
+    // and D
+    for (int i = 0; i < n_; ++i){
+      r = rand() % 100;
+      if (r < 50){
+        h_D_[i] = -1;
+      } else { 
+        h_D_[i] = 1;
+      }
+    }
+
+    //and copy
+
+    mem_.copyArrayHostToDevice(d_perm_, h_perm_, k_rand_);
+    mem_.copyArrayHostToDevice(d_D_, h_D_, n_);
+
+    return 0;
+  }
+}
diff --git a/resolve/RandSketchingFWHT.hpp b/resolve/RandSketchingFWHT.hpp
new file mode 100644
index 000000000..4cab9e510
--- /dev/null
+++ b/resolve/RandSketchingFWHT.hpp
@@ -0,0 +1,44 @@
+#pragma once
+#include <resolve/Common.hpp>
+#include <resolve/RandSketchingManager.hpp>
+
+namespace ReSolve {
+  // Forward declaration of vector::Vector class
+  namespace vector
+  {
+    class Vector;
+  }
+  
+  class RandSketchingFWHT : public RandSketchingManager
+  {
+
+    using vector_type = vector::Vector;
+    public: 
+      // constructor
+
+      RandSketchingFWHT();
+
+      // destructor
+      virtual ~RandSketchingFWHT();
+
+      // Actual sketching process
+      virtual int Theta(vector_type* input, vector_type* output);
+
+      // Setup the parameters, sampling matrices, permuations, etc
+      virtual int setup(index_type n, index_type k);
+      virtual int reset(); // if needed can be reset (like when Krylov method restarts)
+
+    private:
+      index_type* h_seq_;
+      index_type* h_D_;
+      index_type* h_perm_;
+
+      index_type* d_D_;
+      index_type* d_perm_;
+      real_type* d_aux_;
+      
+      index_type N_;
+      index_type log2N_;
+      real_type one_over_k_;
+  };
+}
diff --git a/resolve/RandSketchingManager.cpp b/resolve/RandSketchingManager.cpp
new file mode 100644
index 000000000..3cb426706
--- /dev/null
+++ b/resolve/RandSketchingManager.cpp
@@ -0,0 +1,33 @@
+// this is a virtual class
+#include "RandSketchingManager.hpp"
+
+namespace ReSolve {
+  
+  // constructor
+   RandSketchingManager::RandSketchingManager()
+   {
+     k_rand_ = 0;
+     n_ = 0;
+     N_ = 0;
+   }
+
+  // destructor
+  RandSketchingManager::~RandSketchingManager()
+  {
+  }
+
+  index_type RandSketchingManager::getVectorSize()
+  {
+    return n_;
+  }
+  
+  index_type RandSketchingManager::getSketchSize()
+  {
+    return k_rand_;
+  }
+
+  index_type RandSketchingManager::getPaddedSize()
+  {
+    return N_;
+  }
+}
diff --git a/resolve/RandSketchingManager.hpp b/resolve/RandSketchingManager.hpp
new file mode 100644
index 000000000..baff7b0ed
--- /dev/null
+++ b/resolve/RandSketchingManager.hpp
@@ -0,0 +1,36 @@
+// this is a virtual class
+#pragma once
+#include <resolve/Common.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/MemoryUtils.hpp>
+
+namespace ReSolve { 
+  class RandSketchingManager {
+    using vector_type = vector::Vector;
+    
+    public: 
+      // constructor
+      RandSketchingManager();
+
+      // destructor
+      virtual ~RandSketchingManager();
+
+      // Actual sketching process
+      virtual int Theta(vector_type* input, vector_type* output) = 0;
+
+      // Setup the parameters, sampling matrices, permuations, etc
+      virtual int setup(index_type n, index_type k) = 0;
+      virtual int reset() = 0;
+
+      virtual index_type getVectorSize();
+      virtual index_type getSketchSize();
+      virtual index_type getPaddedSize();
+
+    protected:
+      index_type n_;// size of base vector
+      index_type k_rand_; // size of sketched vector
+      index_type N_; // padded n -- generally N_ > n_
+    
+      MemoryHandler mem_; ///< Device memory manager object
+  };
+} // namespace ReSolve
diff --git a/resolve/cuda/cudaKernels.cu b/resolve/cuda/cudaKernels.cu
index 023e114f3..4773464c9 100644
--- a/resolve/cuda/cudaKernels.cu
+++ b/resolve/cuda/cudaKernels.cu
@@ -1,4 +1,5 @@
 #include "cudaKernels.h"
+#include <cooperative_groups.h>
 #define maxk 1024
 #define Tv5 1024
 //computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1
@@ -138,6 +139,173 @@ __global__ void matrixInfNormPart1(const int n,
   }
 }
 
+// for count sketch sketching random method
+__global__ void  count_sketch_kernel(const int n,
+                                     const int k, 
+                                     const int* labels,
+                                     const int* flip,
+                                     const double* input,
+                                     double* output){
+
+  int idx = blockIdx.x * blockDim.x + threadIdx.x; 
+  while (idx < n){
+    //printf("in place %d, I am putting input[perm[%d]] = input[%d] = %f \n", idx,idx, perm[idx], input[perm[idx]] );
+    double val = input[idx];
+    if (flip[idx] != 1){
+      val *= -1.0;
+    }    
+    atomicAdd(&output[labels[idx]], val);
+    idx += blockDim.x * gridDim.x;
+  }
+}
+
+// for Walsh-Hadamard transform
+
+//kernel 1
+__global__ void select_kernel(const int k, 
+                              const int* perm,
+                              const double* input,
+                              double* output){
+
+  int idx = blockIdx.x * blockDim.x + threadIdx.x; 
+  while (idx < k){
+    //printf("in place %d, I am putting input[perm[%d]] = input[%d] = %f \n", idx,idx, perm[idx], input[perm[idx]] );
+    output[idx] = input[perm[idx]];
+    idx += blockDim.x * gridDim.x;
+  }
+}
+
+//kernel 2
+__global__ void scaleByD_kernel(const int n,
+                                const int* D,
+                                const double* x,
+                                double* y){
+  int idx = blockIdx.x * blockDim.x + threadIdx.x; 
+
+  while (idx < n){
+
+    if (D[idx] == 1) y[idx]=x[idx];
+    else y[idx]= (-1.0)*x[idx];
+
+    idx += blockDim.x * gridDim.x;
+  }
+}
+
+//kernels 3 and 4
+
+
+#define ELEMENTARY_LOG2SIZE 11
+namespace cg = cooperative_groups;
+////////////////////////////////////////////////////////////////////////////////
+// Single in-global memory radix-4 Fast Walsh Transform pass
+// (for strides exceeding elementary vector size)
+////////////////////////////////////////////////////////////////////////////////
+
+__global__ void fwtBatch2Kernel(double* d_Output, double* d_Input, int stride) 
+{
+  const int pos = blockIdx.x * blockDim.x + threadIdx.x;
+  const int N = blockDim.x * gridDim.x * 4;
+
+  double* d_Src = d_Input + blockIdx.y * N;
+  double* d_Dst = d_Output + blockIdx.y * N;
+
+  int lo = pos & (stride - 1);
+  int i0 = ((pos - lo) << 2) + lo;
+  int i1 = i0 + stride;
+  int i2 = i1 + stride;
+  int i3 = i2 + stride;
+
+  double D0 = d_Src[i0];
+  double D1 = d_Src[i1];
+  double D2 = d_Src[i2];
+  double D3 = d_Src[i3];
+
+  double T;
+  T = D0;
+  D0 = D0 + D2;
+  D2 = T - D2;
+  T = D1;
+  D1 = D1 + D3;
+  D3 = T - D3;
+  T = D0;
+  d_Dst[i0] = D0 + D1;
+  d_Dst[i1] = T - D1;
+  T = D2;
+  d_Dst[i2] = D2 + D3;
+  d_Dst[i3] = T - D3;
+}
+
+
+__global__ void fwtBatch1Kernel(double* d_Output, double* d_Input, int log2N) 
+{
+  // Handle to thread block group
+
+  cg::thread_block cta = cg::this_thread_block();
+  const int N = 1 << log2N;
+  const int base = blockIdx.x << log2N;
+
+  //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80
+  extern __shared__ double s_data[];
+  double* d_Src = d_Input + base;
+  double* d_Dst = d_Output + base;
+
+  for (int pos = threadIdx.x; pos < N; pos += blockDim.x) {
+    s_data[pos] = d_Src[pos];
+  }
+
+  // Main radix-4 stages
+  const int pos = threadIdx.x;
+
+  for (int stride = N >> 2; stride > 0; stride >>= 2) {
+    int lo = pos & (stride - 1);
+    int i0 = ((pos - lo) << 2) + lo;
+    int i1 = i0 + stride;
+    int i2 = i1 + stride;
+    int i3 = i2 + stride;
+
+    cg::sync(cta);
+    double D0 = s_data[i0];
+    double D1 = s_data[i1];
+    double D2 = s_data[i2];
+    double D3 = s_data[i3];
+
+    double T;
+    T = D0;
+    D0 = D0 + D2;
+    D2 = T - D2;
+    T = D1;
+    D1 = D1 + D3;
+    D3 = T - D3;
+    T = D0;
+    s_data[i0] = D0 + D1;
+    s_data[i1] = T - D1;
+    T = D2;
+    s_data[i2] = D2 + D3;
+    s_data[i3] = T - D3;
+  }
+
+  // Do single radix-2 stage for odd power of two
+  if (log2N & 1) {
+
+    cg::sync(cta);
+
+    for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) {
+      int i0 = pos << 1;
+      int i1 = i0 + 1;
+
+      double D0 = s_data[i0];
+      double D1 = s_data[i1];
+      s_data[i0] = D0 + D1;
+      s_data[i1] = D0 - D1;
+    }
+  }
+
+  cg::sync(cta);
+
+  for (int pos = threadIdx.x; pos < N; pos += blockDim.x) {
+    d_Dst[pos] = s_data[pos];
+  }
+}
 
 void mass_inner_product_two_vectors(int n, 
                                     int i, 
@@ -148,6 +316,7 @@ void mass_inner_product_two_vectors(int n,
 {
   MassIPTwoVec_kernel<<<i + 1, 1024>>>(vec1, vec2, mvec, result, i + 1, n);
 }
+
 void mass_axpy(int n, int i, double* x, double* y, double* alpha)
 {
   massAxpy3_kernel<<<(n + 384 - 1) / 384, 384>>>(n, i + 1, x, y, alpha);
@@ -160,4 +329,44 @@ void matrix_row_sums(int n,
                      double* result)
 {
   matrixInfNormPart1<<<1000,1024>>>(n, nnz, a_ia, a_val, result);
-}
\ No newline at end of file
+}
+
+void  count_sketch_theta(int n,
+                         int k,
+                         int* labels,
+                         int* flip,
+                         double* input,
+                         double* output)
+{
+
+  count_sketch_kernel<<<10000, 1024>>>(n, k, labels, flip, input, output);
+}
+
+void FWHT_select(int k,
+                 int* perm,
+                 double* input,
+                 double* output)
+{
+  select_kernel<<<1000,1024>>>(k, perm, input, output);
+}
+
+void FWHT_scaleByD(int n,
+                   int* D,
+                   double* x,
+                   double* y)
+{
+  scaleByD_kernel<<<1000,1024>>>(n, D, x, y);
+}
+
+void FWHT(int M, int log2N, double* d_Data) {
+
+  const int THREAD_N = 1024;
+  int N = 1 << log2N;
+  dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1);
+
+  for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) {
+    fwtBatch2Kernel<<<grid, THREAD_N>>>(d_Data, d_Data, N / 4);
+  }
+
+  fwtBatch1Kernel<<<M, N / 4, N * sizeof(double)>>>(d_Data, d_Data, log2N);
+}
diff --git a/resolve/cuda/cudaKernels.h b/resolve/cuda/cudaKernels.h
index 9c48783a8..460118f9d 100644
--- a/resolve/cuda/cudaKernels.h
+++ b/resolve/cuda/cudaKernels.h
@@ -12,3 +12,23 @@ void matrix_row_sums(int n,
                      int* a_ia,
                      double* a_val, 
                      double* result);
+
+// needed for rand solver
+void  count_sketch_theta(int n,
+                         int k,
+                         int* labels,
+                         int* flip,
+                         double* input,
+                         double* output);
+
+void FWHT_select(int k,
+                 int* perm,
+                 double* input,
+                 double* output);
+
+void FWHT_scaleByD(int n,
+                   int* D,
+                   double* x,
+                   double* y);
+
+void FWHT(int M, int log2N, double* d_Data); 
diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h
index 986efc841..43872be67 100644
--- a/resolve/hip/hipKernels.h
+++ b/resolve/hip/hipKernels.h
@@ -19,7 +19,28 @@ void permuteVectorP(int n,
                     int* perm_vector,
                     double* vec_in, 
                     double* vec_out);
+
 void permuteVectorQ(int n, 
                     int* perm_vector,
                     double* vec_in, 
                     double* vec_out);
+
+// needed for rand solver
+void  count_sketch_theta(int n,
+                         int k,
+                         int* labels,
+                         int* flip,
+                         double* input,
+                         double* output);
+
+void FWHT_select(int k,
+                 int* perm,
+                 double* input,
+                 double* output);
+
+void FWHT_scaleByD(int n,
+                   int* D,
+                   double* x,
+                   double* y);
+
+void FWHT(int M, int log2N, double* d_Data); 
diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip
index 3fd2a8a0d..2b4d65293 100644
--- a/resolve/hip/hipKernels.hip
+++ b/resolve/hip/hipKernels.hip
@@ -1,6 +1,7 @@
 #include "hipKernels.h"
 
 #include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
 
 //computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1
 template <size_t Tv5 = 1024>
@@ -152,7 +153,7 @@ __global__ void permuteVectorP_kernel(const int n,
   int idx = blockIdx.x*blockDim.x + threadIdx.x;
   while (idx<n) {
     vec_out[idx] = vec_in[perm_vector[idx]];
-    idx+= (blockDim.x*gridDim.x);
+    idx += (blockDim.x*gridDim.x);
   }
 }
 
@@ -165,10 +166,180 @@ __global__ void permuteVectorQ_kernel(const int n,
   int idx = blockIdx.x*blockDim.x + threadIdx.x;
   while (idx<n) {
     vec_out[perm_vector[idx]] = vec_in[idx];
-    idx+= (blockDim.x*gridDim.x);
+    idx += (blockDim.x*gridDim.x);
+  }
+}
+
+// for count sketch sketching random method
+__global__ void  count_sketch_kernel(const int n,
+                                     const int k, 
+                                     const int* labels,
+                                     const int* flip,
+                                     const double* input,
+                                     double* output){
+
+  int idx = blockIdx.x * blockDim.x + threadIdx.x; 
+  while (idx < n){
+    //printf("in place %d, I am putting input[perm[%d]] = input[%d] = %f \n", idx,idx, perm[idx], input[perm[idx]] );
+    double val = input[idx];
+    if (flip[idx] != 1){
+      val *= -1.0;
+    }    
+    atomicAdd(&output[labels[idx]], val);
+    idx += blockDim.x * gridDim.x;
+  }
+}
+
+// for Walsh-Hadamard transform
+
+//kernel 1
+__global__ void select_kernel(const int k, 
+                              const int* perm,
+                              const double* input,
+                              double* output){
+
+  int idx = blockIdx.x * blockDim.x + threadIdx.x; 
+  while (idx < k){
+    //printf("in place %d, I am putting input[perm[%d]] = input[%d] = %f \n", idx,idx, perm[idx], input[perm[idx]] );
+    output[idx] = input[perm[idx]];
+    idx += blockDim.x * gridDim.x;
+  }
+}
+
+//kernel 2
+__global__ void scaleByD_kernel(const int n,
+                                const int* D,
+                                const double* x,
+                                double* y){
+  int idx = blockIdx.x * blockDim.x + threadIdx.x; 
+
+  while (idx < n){
+
+    if (D[idx] == 1) y[idx]=x[idx];
+    else y[idx]= (-1.0)*x[idx];
+
+    idx += blockDim.x * gridDim.x;
   }
 }
 
+//kernels 3 and 4
+
+
+#define ELEMENTARY_LOG2SIZE 11
+namespace cg = cooperative_groups;
+////////////////////////////////////////////////////////////////////////////////
+// Single in-global memory radix-4 Fast Walsh Transform pass
+// (for strides exceeding elementary vector size)
+////////////////////////////////////////////////////////////////////////////////
+
+__global__ void fwtBatch2Kernel(double* d_Output, double* d_Input, int stride) 
+{
+  const int pos = blockIdx.x * blockDim.x + threadIdx.x;
+  const int N = blockDim.x * gridDim.x * 4;
+
+  double* d_Src = d_Input + blockIdx.y * N;
+  double* d_Dst = d_Output + blockIdx.y * N;
+
+  int lo = pos & (stride - 1);
+  int i0 = ((pos - lo) << 2) + lo;
+  int i1 = i0 + stride;
+  int i2 = i1 + stride;
+  int i3 = i2 + stride;
+
+  double D0 = d_Src[i0];
+  double D1 = d_Src[i1];
+  double D2 = d_Src[i2];
+  double D3 = d_Src[i3];
+
+  double T;
+  T = D0;
+  D0 = D0 + D2;
+  D2 = T - D2;
+  T = D1;
+  D1 = D1 + D3;
+  D3 = T - D3;
+  T = D0;
+  d_Dst[i0] = D0 + D1;
+  d_Dst[i1] = T - D1;
+  T = D2;
+  d_Dst[i2] = D2 + D3;
+  d_Dst[i3] = T - D3;
+}
+
+
+__global__ void fwtBatch1Kernel(double* d_Output, double* d_Input, int log2N) 
+{
+  // Handle to thread block group
+
+  cg::thread_block cta = cg::this_thread_block();
+  const int N = 1 << log2N;
+  const int base = blockIdx.x << log2N;
+
+  //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80
+  extern __shared__ double s_data[];
+  double* d_Src = d_Input + base;
+  double* d_Dst = d_Output + base;
+
+  for (int pos = threadIdx.x; pos < N; pos += blockDim.x) {
+    s_data[pos] = d_Src[pos];
+  }
+
+  // Main radix-4 stages
+  const int pos = threadIdx.x;
+
+  for (int stride = N >> 2; stride > 0; stride >>= 2) {
+    int lo = pos & (stride - 1);
+    int i0 = ((pos - lo) << 2) + lo;
+    int i1 = i0 + stride;
+    int i2 = i1 + stride;
+    int i3 = i2 + stride;
+
+    cg::sync(cta);
+    double D0 = s_data[i0];
+    double D1 = s_data[i1];
+    double D2 = s_data[i2];
+    double D3 = s_data[i3];
+
+    double T;
+    T = D0;
+    D0 = D0 + D2;
+    D2 = T - D2;
+    T = D1;
+    D1 = D1 + D3;
+    D3 = T - D3;
+    T = D0;
+    s_data[i0] = D0 + D1;
+    s_data[i1] = T - D1;
+    T = D2;
+    s_data[i2] = D2 + D3;
+    s_data[i3] = T - D3;
+  }
+
+  // Do single radix-2 stage for odd power of two
+  if (log2N & 1) {
+
+    cg::sync(cta);
+
+    for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) {
+      int i0 = pos << 1;
+      int i1 = i0 + 1;
+
+      double D0 = s_data[i0];
+      double D1 = s_data[i1];
+      s_data[i0] = D0 + D1;
+      s_data[i1] = D0 - D1;
+    }
+  }
+
+  cg::sync(cta);
+
+  for (int pos = threadIdx.x; pos < N; pos += blockDim.x) {
+    d_Dst[pos] = s_data[pos];
+  }
+}
+
+
+
 void mass_inner_product_two_vectors(int n, 
                                     int i, 
                                     double* vec1, 
@@ -208,3 +379,43 @@ void permuteVectorQ(int n,
 {
   hipLaunchKernelGGL(permuteVectorQ_kernel, dim3(1000), dim3(1024), 0, 0, n, perm_vector, vec_in, vec_out);
 }
+
+void  count_sketch_theta(int n,
+                         int k,
+                         int* labels,
+                         int* flip,
+                         double* input,
+                         double* output)
+{
+
+  hipLaunchKernelGGL(count_sketch_kernel, dim3(10000), dim3(1024), 0, 0, n, k, labels, flip, input, output);
+}
+
+void FWHT_select(int k,
+                 int* perm,
+                 double* input,
+                 double* output)
+{
+  hipLaunchKernelGGL(select_kernel, dim3(1000), dim3(1024), 0, 0,k, perm, input, output);
+}
+
+void FWHT_scaleByD(int n,
+                   int* D,
+                   double* x,
+                   double* y)
+{
+  hipLaunchKernelGGL(scaleByD_kernel, dim3(1000), dim3(1024), 0, 0, n, D, x, y);
+}
+
+void FWHT(int M, int log2N, double* d_Data) {
+
+  const int THREAD_N = 1024;
+  int N = 1 << log2N;
+  dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1);
+
+  for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) {
+    hipLaunchKernelGGL(fwtBatch2Kernel, grid, dim3(THREAD_N), 0, 0,  d_Data, d_Data, N / 4);
+  }
+
+  hipLaunchKernelGGL(fwtBatch1Kernel, dim3(M), dim3(N / 4), N * sizeof(double), 0, d_Data, d_Data, log2N);
+}
diff --git a/resolve/vector/VectorHandlerHip.cpp b/resolve/vector/VectorHandlerHip.cpp
index 1e1195fc8..9689a2bb3 100644
--- a/resolve/vector/VectorHandlerHip.cpp
+++ b/resolve/vector/VectorHandlerHip.cpp
@@ -51,6 +51,7 @@ namespace ReSolve {
     rocblas_handle  handle_rocblas =  workspaceHIP->getRocblasHandle();
     double nrm = 0.0;
     rocblas_status st= rocblas_ddot (handle_rocblas,  x->getSize(), x->getData(memory::DEVICE), 1, y->getData(memory::DEVICE), 1, &nrm);
+    
     if (st!=0) {printf("dot product crashed with code %d \n", st);}
     return nrm;
   }
@@ -68,6 +69,7 @@ namespace ReSolve {
     LinAlgWorkspaceHIP* workspaceHIP = workspace_;
     rocblas_handle handle_rocblas =  workspaceHIP->getRocblasHandle();
     rocblas_status st = rocblas_dscal(handle_rocblas, x->getSize(), alpha, x->getData(memory::DEVICE), 1);
+    
     if (st!=0) {
       ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n";
     }
@@ -94,6 +96,7 @@ namespace ReSolve {
                   1,
                   y->getData(memory::DEVICE),
                   1);
+    
   }
 
   /** 
@@ -153,6 +156,7 @@ namespace ReSolve {
                     x->getData(memory::DEVICE),
                     1);
     }
+    
   }
 
   /** 
@@ -172,6 +176,7 @@ namespace ReSolve {
     using namespace constants;
     if (k < 200) {
       mass_axpy(size, k, x->getData(memory::DEVICE), y->getData(memory::DEVICE),alpha->getData(memory::DEVICE));
+    
     } else {
       LinAlgWorkspaceHIP* workspaceHIP = workspace_;
       rocblas_handle handle_rocblas =  workspaceHIP->getRocblasHandle();
@@ -189,6 +194,7 @@ namespace ReSolve {
                     &ONE,
                     y->getData(memory::DEVICE),          // c
                     size);      // ldc     
+    
     }
   }
 
@@ -213,6 +219,7 @@ namespace ReSolve {
 
     if (k < 200) {
       mass_inner_product_two_vectors(size, k, x->getData(memory::DEVICE) , x->getData(1, memory::DEVICE), V->getData(memory::DEVICE), res->getData(memory::DEVICE));
+    
     } else {
       LinAlgWorkspaceHIP* workspaceHIP = workspace_;
       rocblas_handle handle_rocblas =  workspaceHIP->getRocblasHandle();
@@ -230,6 +237,7 @@ namespace ReSolve {
                     &ZERO,
                     res->getData(memory::DEVICE),     //c
                     k + 1);  //ldc 
+    
     }
   }
 
diff --git a/tests/functionality/CMakeLists.txt b/tests/functionality/CMakeLists.txt
index 85b47fd72..7d2aaa1b7 100644
--- a/tests/functionality/CMakeLists.txt
+++ b/tests/functionality/CMakeLists.txt
@@ -23,7 +23,10 @@ if(RESOLVE_USE_CUDA)
   # Build KLU+GLU test
   add_executable(klu_glu_test.exe testKLU_GLU.cpp)
   target_link_libraries(klu_glu_test.exe PRIVATE ReSolve)
-
+  
+  # Build randSolver test
+  add_executable(rand_gmres_cuda_test.exe testRandGMRES_Cuda.cpp)
+  target_link_libraries(rand_gmres_cuda_test.exe PRIVATE ReSolve)
 endif(RESOLVE_USE_CUDA)
 
 
@@ -37,6 +40,9 @@ if(RESOLVE_USE_HIP)
   add_executable(rocsolver_rf_fgmres_test.exe testKLU_RocSolver_FGMRES.cpp)
   target_link_libraries(rocsolver_rf_fgmres_test.exe PRIVATE ReSolve)
 
+  # Build randSolver test
+  add_executable(rand_gmres_hip_test.exe testRandGMRES_Rocm.cpp)
+  target_link_libraries(rand_gmres_hip_test.exe PRIVATE ReSolve)
 endif(RESOLVE_USE_HIP)
 
 # Install tests
@@ -46,13 +52,15 @@ if(RESOLVE_USE_CUDA)
   set(installable_tests ${installable_tests}
                         klu_rf_test.exe
                         klu_rf_fgmres_test.exe
-                        klu_glu_test.exe)
+                        klu_glu_test.exe
+                        rand_gmres_cuda_test.exe)
 endif(RESOLVE_USE_CUDA)
 
 if(RESOLVE_USE_HIP)
   set(installable_tests ${installable_tests}
                         rocsolver_rf_test.exe
-                        rocsolver_rf_fgmres_test.exe)
+                        rocsolver_rf_fgmres_test.exe
+                        rand_gmres_hip_test.exe)
 endif(RESOLVE_USE_HIP)
 
 install(TARGETS ${installable_tests} 
@@ -68,9 +76,11 @@ if(RESOLVE_USE_CUDA)
   add_test(NAME klu_rf_test  COMMAND $<TARGET_FILE:klu_rf_test.exe>  "${test_data_dir}")
   add_test(NAME klu_rf_fgmres_test COMMAND $<TARGET_FILE:klu_rf_fgmres_test.exe> "${test_data_dir}")
   add_test(NAME klu_glu_test COMMAND $<TARGET_FILE:klu_glu_test.exe> "${test_data_dir}")
+  add_test(NAME rand_gmres_cuda_test COMMAND $<TARGET_FILE:rand_gmres_cuda_test.exe>)
 endif(RESOLVE_USE_CUDA)
 
 if(RESOLVE_USE_HIP)
   add_test(NAME rocsolver_rf_test  COMMAND $<TARGET_FILE:rocsolver_rf_test.exe>  "${test_data_dir}")
   add_test(NAME rocsolver_rf_fgmres_test  COMMAND $<TARGET_FILE:rocsolver_rf_fgmres_test.exe>  "${test_data_dir}")
+  add_test(NAME rand_gmres_hip_test COMMAND $<TARGET_FILE:rand_gmres_hip_test.exe>)
 endif(RESOLVE_USE_HIP)
diff --git a/tests/functionality/testRandGMRES_Cuda.cpp b/tests/functionality/testRandGMRES_Cuda.cpp
new file mode 100644
index 000000000..b4cdffec0
--- /dev/null
+++ b/tests/functionality/testRandGMRES_Cuda.cpp
@@ -0,0 +1,231 @@
+#include <string>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/io.hpp>
+#include <resolve/matrix/MatrixHandler.hpp>
+#include <resolve/vector/VectorHandler.hpp>
+#include <resolve/LinSolverDirectKLU.hpp>
+#include <resolve/LinSolverDirectCuSparseILU0.hpp>
+#include <resolve/LinSolverIterativeRandFGMRES.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+
+using namespace ReSolve::constants;
+using real_type  = ReSolve::real_type;
+using index_type  = ReSolve::index_type;
+using vector_type = ReSolve::vector::Vector;
+
+ReSolve::matrix::Csr* generateMatrix(const index_type N);
+ReSolve::vector::Vector* generateRhs(const index_type N);
+
+int main(int argc, char *argv[])
+{
+  // Use the same data types as those you specified in ReSolve build.
+
+
+  //we want error sum to be 0 at the end
+  //that means PASS.
+  //otheriwse it is a FAIL.
+  int error_sum = 0;
+  int status;
+  const index_type N = (argc == 2) ? atoi(argv[1]) : 10000;
+  ReSolve::matrix::Csr* A = generateMatrix(N);
+
+  vector_type* vec_rhs = generateRhs(N);
+
+  ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA();
+  workspace_CUDA->initializeHandles();
+  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_CUDA);
+  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_CUDA);
+
+  vector_type* vec_x;
+
+  ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2);
+
+  ReSolve::LinSolverDirectCuSparseILU0* Rf = new ReSolve::LinSolverDirectCuSparseILU0(workspace_CUDA);
+  ReSolve::LinSolverIterativeRandFGMRES* FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::cs, GS, "cuda");
+
+
+  vec_x = new vector_type(A->getNumRows());
+  vec_x->allocate(ReSolve::memory::HOST);
+
+  //iinit guess is 0
+  vec_x->allocate(ReSolve::memory::DEVICE);
+  vec_x->setToZero(ReSolve::memory::DEVICE);
+
+  real_type norm_b;
+  matrix_handler->setValuesChanged(true, "cuda");
+
+  status = Rf->setup(A);
+  error_sum += status;
+
+  FGMRES->setRestart(200);
+  FGMRES->setMaxit(2500);
+  FGMRES->setTol(1e-12);
+  FGMRES->setup(A);
+
+  status = GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); 
+  error_sum += status;
+
+  //matrix_handler->setValuesChanged(true, "cuda");
+  status = FGMRES->resetMatrix(A);
+  error_sum += status;
+
+  status = FGMRES->setupPreconditioner("LU", Rf);
+  error_sum += status;
+
+  FGMRES->setFlexible(1); 
+
+  FGMRES->solve(vec_rhs, vec_x);
+
+  norm_b = vector_handler->dot(vec_rhs, vec_rhs, "cuda");
+  norm_b = std::sqrt(norm_b);
+  real_type final_norm_first =  FGMRES->getFinalResidualNorm();
+  std::cout << "Randomized FGMRES results (first run): \n"
+    << "\t Sketching method:                                    : CountSketch\n" 
+    << "\t Initial residual norm:          ||b-Ax_0||_2         : " 
+    << std::scientific << std::setprecision(16) 
+    << FGMRES->getInitResidualNorm()<<" \n"
+    << "\t Initial relative residual norm: ||b-Ax_0||_2/||b||_2 : "
+    << FGMRES->getInitResidualNorm()/norm_b<<" \n"
+    << "\t Final residual norm:            ||b-Ax||_2           : " 
+    << FGMRES->getFinalResidualNorm() <<" \n"
+    << "\t Final relative residual norm:   ||b-Ax||_2/||b||_2   : " 
+    << FGMRES->getFinalResidualNorm()/norm_b <<" \n"
+    << "\t Number of iterations                                 : " << FGMRES->getNumIter() << "\n";
+
+  delete FGMRES;
+  delete GS;
+  GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2);
+  FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::fwht, GS, "cuda");
+
+
+  FGMRES->setRestart(150);
+  FGMRES->setMaxit(2500);
+  FGMRES->setTol(1e-12);
+  FGMRES->setup(A);
+
+  status = GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); 
+  error_sum += status;
+
+  status = FGMRES->setupPreconditioner("LU", Rf);
+  error_sum += status;
+
+  vec_x->setToZero(ReSolve::memory::DEVICE);
+  FGMRES->solve(vec_rhs, vec_x);
+
+
+  std::cout << "Randomized FGMRES results (second run): \n"
+    << "\t Sketching method:                                    : FWHT\n" 
+    << "\t Initial residual norm:          ||b-Ax_0||_2         : " 
+    << std::scientific << std::setprecision(16) 
+    << FGMRES->getInitResidualNorm()<<" \n"
+    << "\t Initial relative residual norm: ||b-Ax_0||_2/||b||_2 : "
+    << FGMRES->getInitResidualNorm()/norm_b<<" \n"
+    << "\t Final residual norm:            ||b-Ax||_2           : " 
+    << FGMRES->getFinalResidualNorm() <<" \n"
+    << "\t Final relative residual norm:   ||b-Ax||_2/||b||_2   : " 
+    << FGMRES->getFinalResidualNorm()/norm_b <<" \n"
+    << "\t Number of iterations                                 : " << FGMRES->getNumIter() << "\n";
+
+  if ((error_sum == 0) && (final_norm_first/norm_b < 1e-11) && (FGMRES->getFinalResidualNorm()/norm_b < 1e-11 )) {
+    std::cout<<"Test 5 (randomized GMRES) PASSED"<<std::endl<<std::endl;;
+  } else {
+    std::cout<<"Test 5 (randomized GMRES) FAILED, error sum: "<<error_sum<<std::endl<<std::endl;;
+  }
+  delete A;
+  delete Rf;
+  delete vec_x;
+  delete workspace_CUDA;
+  delete matrix_handler;
+  delete vector_handler;
+
+  return error_sum;
+}
+
+
+ReSolve::vector::Vector* generateRhs(const index_type N)
+{
+  vector_type* vec_rhs = new vector_type(N);
+  vec_rhs->allocate(ReSolve::memory::HOST);
+  vec_rhs->allocate(ReSolve::memory::DEVICE);
+
+  real_type* data = vec_rhs->getData(ReSolve::memory::HOST);
+  for (int i = 0; i < N; ++i) {
+    if (i % 2) {
+      data[i] = 1.0;
+    } else {
+
+      data[i] = -111.0;
+    }
+  }
+  vec_rhs->copyData(ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  return vec_rhs;
+} 
+
+ReSolve::matrix::Csr* generateMatrix(const index_type N)
+{
+  std::vector<real_type> r1 = {1., 5., 7., 8., 3., 2., 4.}; // sum 30
+  std::vector<real_type> r2 = {1., 3., 2., 2., 1., 6., 7., 3., 2., 3.}; // sum 30
+  std::vector<real_type> r3 = {11., 15., 4.}; // sum 30
+  std::vector<real_type> r4 = {1., 1., 5., 1., 9., 2., 1., 2., 3., 2., 3.}; // sum 30
+  std::vector<real_type> r5 = {6., 5., 7., 3., 2., 5., 2.}; // sum 30
+
+
+  const std::vector<std::vector<real_type> > data = {r1, r2, r3, r4, r5};
+
+  // std::cout << N << "\n";
+
+  // First compute number of nonzeros
+  index_type NNZ = 0;
+  for (index_type i = 0; i < N; ++i)
+  {
+    size_t reminder = static_cast<size_t>(i%5);
+    NNZ += static_cast<index_type>(data[reminder].size());
+  }
+
+  // Allocate NxN CSR matrix with NNZ nonzeros
+  ReSolve::matrix::Csr* A = new ReSolve::matrix::Csr(N, N, NNZ);
+  A->allocateMatrixData(ReSolve::memory::HOST);
+
+  index_type* rowptr = A->getRowData(ReSolve::memory::HOST);
+  index_type* colidx = A->getColData(ReSolve::memory::HOST);
+  real_type* val     = A->getValues(ReSolve::memory::HOST); 
+
+  // Populate CSR matrix using same row pattern as for NNZ calculation
+  rowptr[0] = 0;
+  index_type where;
+  real_type what;
+  for (index_type i=0; i < N; ++i)
+  {
+    size_t reminder = static_cast<size_t>(i%5);
+    const std::vector<real_type>& row_sample = data[reminder];
+    index_type nnz_per_row = static_cast<index_type>(row_sample.size());
+
+    rowptr[i+1] = rowptr[i] + nnz_per_row;
+    bool c = false;
+    for (index_type j = rowptr[i]; j < rowptr[i+1]; ++j)
+    {
+      if (((!c) && (((j - rowptr[i]) * N/nnz_per_row + (N%(N/nnz_per_row))) >= i)) || ((!c) && (j == (rowptr[i+1] - 1)) )) {
+        c = true;
+        where = i;
+        what = 4.;
+      } else {
+        where =  (j - rowptr[i]) * N/nnz_per_row + (N%(N/nnz_per_row));
+        what = row_sample[static_cast<size_t>(j - rowptr[i])];
+      } 
+      colidx[j] = where;
+      // evenly distribute nonzeros ^^^^             ^^^^^^^^ perturb offset
+      val[j] = what;
+    }
+  }
+
+
+  A->setUpdated(ReSolve::memory::HOST);
+  A->copyData(ReSolve::memory::DEVICE);
+  return A;
+}
diff --git a/tests/functionality/testRandGMRES_Rocm.cpp b/tests/functionality/testRandGMRES_Rocm.cpp
new file mode 100644
index 000000000..a5eb2313b
--- /dev/null
+++ b/tests/functionality/testRandGMRES_Rocm.cpp
@@ -0,0 +1,231 @@
+#include <string>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/io.hpp>
+#include <resolve/matrix/MatrixHandler.hpp>
+#include <resolve/vector/VectorHandler.hpp>
+#include <resolve/LinSolverDirectKLU.hpp>
+#include <resolve/LinSolverDirectRocSparseILU0.hpp>
+#include <resolve/LinSolverIterativeRandFGMRES.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+
+using namespace ReSolve::constants;
+using real_type  = ReSolve::real_type;
+using index_type  = ReSolve::index_type;
+using vector_type = ReSolve::vector::Vector;
+
+ReSolve::matrix::Csr* generateMatrix(const index_type N);
+ReSolve::vector::Vector* generateRhs(const index_type N);
+
+int main(int argc, char *argv[])
+{
+  // Use the same data types as those you specified in ReSolve build.
+
+
+  //we want error sum to be 0 at the end
+  //that means PASS.
+  //otheriwse it is a FAIL.
+  int error_sum = 0;
+  int status;
+  const index_type N = (argc == 2) ? atoi(argv[1]) : 10000;
+  ReSolve::matrix::Csr* A = generateMatrix(N);
+
+  vector_type* vec_rhs = generateRhs(N);
+
+  ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP();
+  workspace_HIP->initializeHandles();
+  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_HIP);
+  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_HIP);
+
+  vector_type* vec_x;
+
+  ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2);
+
+  ReSolve::LinSolverDirectRocSparseILU0* Rf = new ReSolve::LinSolverDirectRocSparseILU0(workspace_HIP);
+  ReSolve::LinSolverIterativeRandFGMRES* FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::cs, GS, "hip");
+
+
+  vec_x = new vector_type(A->getNumRows());
+  vec_x->allocate(ReSolve::memory::HOST);
+
+  //iinit guess is 0
+  vec_x->allocate(ReSolve::memory::DEVICE);
+  vec_x->setToZero(ReSolve::memory::DEVICE);
+
+  real_type norm_b;
+  matrix_handler->setValuesChanged(true, "hip");
+
+  status = Rf->setup(A);
+  error_sum += status;
+
+  FGMRES->setRestart(200);
+  FGMRES->setMaxit(2500);
+  FGMRES->setTol(1e-12);
+  FGMRES->setup(A);
+
+  status = GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); 
+  error_sum += status;
+
+  //matrix_handler->setValuesChanged(true, "hip");
+  status = FGMRES->resetMatrix(A);
+  error_sum += status;
+
+  status = FGMRES->setupPreconditioner("LU", Rf);
+  error_sum += status;
+
+  FGMRES->setFlexible(1); 
+
+  FGMRES->solve(vec_rhs, vec_x);
+
+  norm_b = vector_handler->dot(vec_rhs, vec_rhs, "hip");
+  norm_b = std::sqrt(norm_b);
+  real_type final_norm_first =  FGMRES->getFinalResidualNorm();
+  std::cout << "Randomized FGMRES results (first run): \n"
+    << "\t Sketching method:                                    : CountSketch\n" 
+    << "\t Initial residual norm:          ||b-Ax_0||_2         : " 
+    << std::scientific << std::setprecision(16) 
+    << FGMRES->getInitResidualNorm()<<" \n"
+    << "\t Initial relative residual norm: ||b-Ax_0||_2/||b||_2 : "
+    << FGMRES->getInitResidualNorm()/norm_b<<" \n"
+    << "\t Final residual norm:            ||b-Ax||_2           : " 
+    << FGMRES->getFinalResidualNorm() <<" \n"
+    << "\t Final relative residual norm:   ||b-Ax||_2/||b||_2   : " 
+    << FGMRES->getFinalResidualNorm()/norm_b <<" \n"
+    << "\t Number of iterations                                 : " << FGMRES->getNumIter() << "\n";
+
+  delete FGMRES;
+  delete GS;
+  GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2);
+  FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::fwht, GS, "hip");
+
+
+  FGMRES->setRestart(150);
+  FGMRES->setMaxit(2500);
+  FGMRES->setTol(1e-12);
+  FGMRES->setup(A);
+
+  status = GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); 
+  error_sum += status;
+
+  status = FGMRES->setupPreconditioner("LU", Rf);
+  error_sum += status;
+
+  vec_x->setToZero(ReSolve::memory::DEVICE);
+  FGMRES->solve(vec_rhs, vec_x);
+
+
+  std::cout << "Randomized FGMRES results (second run): \n"
+    << "\t Sketching method:                                    : FWHT\n" 
+    << "\t Initial residual norm:          ||b-Ax_0||_2         : " 
+    << std::scientific << std::setprecision(16) 
+    << FGMRES->getInitResidualNorm()<<" \n"
+    << "\t Initial relative residual norm: ||b-Ax_0||_2/||b||_2 : "
+    << FGMRES->getInitResidualNorm()/norm_b<<" \n"
+    << "\t Final residual norm:            ||b-Ax||_2           : " 
+    << FGMRES->getFinalResidualNorm() <<" \n"
+    << "\t Final relative residual norm:   ||b-Ax||_2/||b||_2   : " 
+    << FGMRES->getFinalResidualNorm()/norm_b <<" \n"
+    << "\t Number of iterations                                 : " << FGMRES->getNumIter() << "\n";
+
+  if ((error_sum == 0) && (final_norm_first/norm_b < 1e-11) && (FGMRES->getFinalResidualNorm()/norm_b < 1e-11 )) {
+    std::cout<<"Test 5 (randomized GMRES) PASSED"<<std::endl<<std::endl;;
+  } else {
+    std::cout<<"Test 5 (randomized GMRES) FAILED, error sum: "<<error_sum<<std::endl<<std::endl;;
+  }
+  delete A;
+  delete Rf;
+  delete vec_x;
+  delete workspace_HIP;
+  delete matrix_handler;
+  delete vector_handler;
+
+  return error_sum;
+}
+
+
+ReSolve::vector::Vector* generateRhs(const index_type N)
+{
+  vector_type* vec_rhs = new vector_type(N);
+  vec_rhs->allocate(ReSolve::memory::HOST);
+  vec_rhs->allocate(ReSolve::memory::DEVICE);
+
+  real_type* data = vec_rhs->getData(ReSolve::memory::HOST);
+  for (int i = 0; i < N; ++i) {
+    if (i % 2) {
+      data[i] = 1.0;
+    } else {
+
+      data[i] = -111.0;
+    }
+  }
+  vec_rhs->copyData(ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  return vec_rhs;
+} 
+
+ReSolve::matrix::Csr* generateMatrix(const index_type N)
+{
+  std::vector<real_type> r1 = {1., 5., 7., 8., 3., 2., 4.}; // sum 30
+  std::vector<real_type> r2 = {1., 3., 2., 2., 1., 6., 7., 3., 2., 3.}; // sum 30
+  std::vector<real_type> r3 = {11., 15., 4.}; // sum 30
+  std::vector<real_type> r4 = {1., 1., 5., 1., 9., 2., 1., 2., 3., 2., 3.}; // sum 30
+  std::vector<real_type> r5 = {6., 5., 7., 3., 2., 5., 2.}; // sum 30
+
+
+  const std::vector<std::vector<real_type> > data = {r1, r2, r3, r4, r5};
+
+  // std::cout << N << "\n";
+
+  // First compute number of nonzeros
+  index_type NNZ = 0;
+  for (index_type i = 0; i < N; ++i)
+  {
+    size_t reminder = static_cast<size_t>(i%5);
+    NNZ += static_cast<index_type>(data[reminder].size());
+  }
+
+  // Allocate NxN CSR matrix with NNZ nonzeros
+  ReSolve::matrix::Csr* A = new ReSolve::matrix::Csr(N, N, NNZ);
+  A->allocateMatrixData(ReSolve::memory::HOST);
+
+  index_type* rowptr = A->getRowData(ReSolve::memory::HOST);
+  index_type* colidx = A->getColData(ReSolve::memory::HOST);
+  real_type* val     = A->getValues(ReSolve::memory::HOST); 
+
+  // Populate CSR matrix using same row pattern as for NNZ calculation
+  rowptr[0] = 0;
+  index_type where;
+  real_type what;
+  for (index_type i=0; i < N; ++i)
+  {
+    size_t reminder = static_cast<size_t>(i%5);
+    const std::vector<real_type>& row_sample = data[reminder];
+    index_type nnz_per_row = static_cast<index_type>(row_sample.size());
+
+    rowptr[i+1] = rowptr[i] + nnz_per_row;
+    bool c = false;
+    for (index_type j = rowptr[i]; j < rowptr[i+1]; ++j)
+    {
+      if (((!c) && (((j - rowptr[i]) * N/nnz_per_row + (N%(N/nnz_per_row))) >= i)) || ((!c) && (j == (rowptr[i+1] - 1)) )) {
+        c = true;
+        where = i;
+        what = 4.;
+      } else {
+        where =  (j - rowptr[i]) * N/nnz_per_row + (N%(N/nnz_per_row));
+        what = row_sample[static_cast<size_t>(j - rowptr[i])];
+      } 
+      colidx[j] = where;
+      // evenly distribute nonzeros ^^^^             ^^^^^^^^ perturb offset
+      val[j] = what;
+    }
+  }
+
+
+  A->setUpdated(ReSolve::memory::HOST);
+  A->copyData(ReSolve::memory::DEVICE);
+  return A;
+}