diff --git a/CMakePresets.json b/CMakePresets.json index 8942b80a2..02f57d3cb 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -14,7 +14,8 @@ "installDir": "${sourceDir}/install", "generator": "Unix Makefiles", "cacheVariables": { - "RESOLVE_USE_CUDA": "ON" + "RESOLVE_USE_CUDA": "ON", + "CMAKE_CUDA_ARCHITECTURES": "60" } }, { @@ -45,7 +46,8 @@ "description": "Custom changes specific for Ascent", "cacheVariables": { "CMAKE_C_COMPILER": "$env{OLCF_GCC_ROOT}/bin/gcc", - "CMAKE_CXX_COMPILER": "$env{OLCF_GCC_ROOT}/bin/g++" + "CMAKE_CXX_COMPILER": "$env{OLCF_GCC_ROOT}/bin/g++", + "CMAKE_CUDA_ARCHITECTURES": "70" } }, { diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 5fb26a284..0148a7e17 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -36,7 +36,10 @@ if(RESOLVE_USE_CUDA) # Build example where matrix data is updated add_executable(klu_glu_values_update.exe r_KLU_GLU_matrix_values_update.cpp) target_link_libraries(klu_glu_values_update.exe PRIVATE ReSolve) - + + #rand solver + add_executable(gmres_cusparse_rand.exe r_randGMRES_CUDA.cpp) + target_link_libraries(gmres_cusparse_rand.exe PRIVATE ReSolve) endif(RESOLVE_USE_CUDA) # Create HIP examples @@ -53,17 +56,21 @@ if(RESOLVE_USE_HIP) add_executable(klu_rocsolverrf_check_redo.exe r_KLU_rocsolverrf_redo_factorization.cpp) target_link_libraries(klu_rocsolverrf_check_redo.exe PRIVATE ReSolve) + + # Rand GMRES test with rocsparse + add_executable(gmres_rocsparse_rand.exe r_randGMRES.cpp) + target_link_libraries(gmres_rocsparse_rand.exe PRIVATE ReSolve) endif(RESOLVE_USE_HIP) # Install all examples in bin directory set(installable_executables klu_klu.exe klu_klu_standalone.exe) if(RESOLVE_USE_CUDA) - set(installable_executables ${installable_executables} klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe) + set(installable_executables ${installable_executables} klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe gmres_cusparse_rand.exe) endif(RESOLVE_USE_CUDA) if(RESOLVE_USE_HIP) - set(installable_executables ${installable_executables} klu_rocsolverrf.exe klu_rocsolverrf_fgmres.exe klu_rocsolverrf_check_redo.exe) + set(installable_executables ${installable_executables} klu_rocsolverrf.exe klu_rocsolverrf_fgmres.exe klu_rocsolverrf_check_redo.exe gmres_rocsparse_rand.exe) endif(RESOLVE_USE_HIP) install(TARGETS ${installable_executables} diff --git a/examples/r_randGMRES.cpp b/examples/r_randGMRES.cpp new file mode 100644 index 000000000..b3f85cc95 --- /dev/null +++ b/examples/r_randGMRES.cpp @@ -0,0 +1,128 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ReSolve::constants; + +int main(int argc, char *argv[]) +{ + // Use the same data types as those you specified in ReSolve build. + using real_type = ReSolve::real_type; + using vector_type = ReSolve::vector::Vector; + + (void) argc; // TODO: Check if the number of input parameters is correct. + std::string matrixFileName = argv[1]; + std::string rhsFileName = argv[2]; + + + ReSolve::matrix::Coo* A_coo; + ReSolve::matrix::Csr* A; + ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP(); + workspace_HIP->initializeHandles(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_HIP); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_HIP); + real_type* rhs = nullptr; + real_type* x = nullptr; + + vector_type* vec_rhs; + vector_type* vec_x; + vector_type* vec_r; + + ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2); + + ReSolve::LinSolverDirectRocSparseILU0* Rf = new ReSolve::LinSolverDirectRocSparseILU0(workspace_HIP); + ReSolve::LinSolverIterativeRandFGMRES* FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::cs, GS, "hip"); + + std::cout << std::endl << std::endl << std::endl; + std::cout << "========================================================================================================================"<getNumRows(), + A_coo->getNumColumns(), + A_coo->getNnz(), + A_coo->symmetric(), + A_coo->expanded()); + + rhs = ReSolve::io::readRhsFromFile(rhs_file); + x = new real_type[A->getNumRows()]; + vec_rhs = new vector_type(A->getNumRows()); + vec_x = new vector_type(A->getNumRows()); + vec_x->allocate(ReSolve::memory::HOST); + //iinit guess is 0U + vec_x->allocate(ReSolve::memory::DEVICE); + vec_x->setToZero(ReSolve::memory::DEVICE); + vec_r = new vector_type(A->getNumRows()); + std::cout<<"Finished reading the matrix and rhs, size: "<getNumRows()<<" x "<getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<symmetric()<< ", Expanded? "<expanded()<coo2csr(A_coo,A, "hip"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + //Now call direct solver + real_type norm_b; + matrix_handler->setValuesChanged(true, "hip"); + + Rf->setup(A); + FGMRES->setRestart(150); + FGMRES->setMaxit(2500); + FGMRES->setTol(1e-12); + FGMRES->setup(A); + GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); + + //matrix_handler->setValuesChanged(true, "hip"); + FGMRES->resetMatrix(A); + FGMRES->setupPreconditioner("LU", Rf); + FGMRES->setFlexible(1); + + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + FGMRES->solve(vec_rhs, vec_x); + + norm_b = vector_handler->dot(vec_rhs, vec_rhs, "hip"); + norm_b = sqrt(norm_b); + std::cout << "FGMRES: init nrm: " + << std::scientific << std::setprecision(16) + << FGMRES->getInitResidualNorm()/norm_b + << " final nrm: " + << FGMRES->getFinalResidualNorm()/norm_b + << " iter: " << FGMRES->getNumIter() << "\n"; + + + delete A; + delete A_coo; + delete Rf; + delete [] x; + delete [] rhs; + delete vec_r; + delete vec_x; + delete workspace_HIP; + delete matrix_handler; + delete vector_handler; + + return 0; +} diff --git a/examples/r_randGMRES_CUDA.cpp b/examples/r_randGMRES_CUDA.cpp new file mode 100644 index 000000000..814ca2d4f --- /dev/null +++ b/examples/r_randGMRES_CUDA.cpp @@ -0,0 +1,128 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ReSolve::constants; + +int main(int argc, char *argv[]) +{ + // Use the same data types as those you specified in ReSolve build. + using real_type = ReSolve::real_type; + using vector_type = ReSolve::vector::Vector; + + (void) argc; // TODO: Check if the number of input parameters is correct. + std::string matrixFileName = argv[1]; + std::string rhsFileName = argv[2]; + + + ReSolve::matrix::Coo* A_coo; + ReSolve::matrix::Csr* A; + ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA(); + workspace_CUDA->initializeHandles(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); + real_type* rhs = nullptr; + real_type* x = nullptr; + + vector_type* vec_rhs; + vector_type* vec_x; + vector_type* vec_r; + + ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2); + + ReSolve::LinSolverDirectCuSparseILU0* Rf = new ReSolve::LinSolverDirectCuSparseILU0(workspace_CUDA); + ReSolve::LinSolverIterativeRandFGMRES* FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::fwht, GS, "cuda"); + + std::cout << std::endl << std::endl << std::endl; + std::cout << "========================================================================================================================"<getNumRows(), + A_coo->getNumColumns(), + A_coo->getNnz(), + A_coo->symmetric(), + A_coo->expanded()); + + rhs = ReSolve::io::readRhsFromFile(rhs_file); + x = new real_type[A->getNumRows()]; + vec_rhs = new vector_type(A->getNumRows()); + vec_x = new vector_type(A->getNumRows()); + vec_x->allocate(ReSolve::memory::HOST); + //iinit guess is 0U + vec_x->allocate(ReSolve::memory::DEVICE); + vec_x->setToZero(ReSolve::memory::DEVICE); + vec_r = new vector_type(A->getNumRows()); + std::cout<<"Finished reading the matrix and rhs, size: "<getNumRows()<<" x "<getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<symmetric()<< ", Expanded? "<expanded()<coo2csr(A_coo,A, "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + //Now call direct solver + real_type norm_b; + matrix_handler->setValuesChanged(true, "cuda"); + + Rf->setup(A, nullptr, nullptr, nullptr, nullptr, vec_rhs); + FGMRES->setRestart(800); + FGMRES->setMaxit(800); + FGMRES->setTol(1e-12); + FGMRES->setup(A); + GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); + + //matrix_handler->setValuesChanged(true, "cuda"); + FGMRES->resetMatrix(A); + FGMRES->setupPreconditioner("LU", Rf); + FGMRES->setFlexible(1); + + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + FGMRES->solve(vec_rhs, vec_x); + + norm_b = vector_handler->dot(vec_rhs, vec_rhs, "cuda"); + norm_b = sqrt(norm_b); + std::cout << "FGMRES: init nrm: " + << std::scientific << std::setprecision(16) + << FGMRES->getInitResidualNorm()/norm_b + << " final nrm: " + << FGMRES->getFinalResidualNorm()/norm_b + << " iter: " << FGMRES->getNumIter() << "\n"; + + + delete A; + delete A_coo; + delete Rf; + delete [] x; + delete [] rhs; + delete vec_r; + delete vec_x; + delete workspace_CUDA; + delete matrix_handler; + delete vector_handler; + + return 0; +} diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt index b98c82344..501b0609d 100644 --- a/resolve/CMakeLists.txt +++ b/resolve/CMakeLists.txt @@ -18,16 +18,22 @@ set(ReSolve_SRC set(ReSolve_GPU_SRC GramSchmidt.cpp LinSolverIterativeFGMRES.cpp -) + LinSolverIterativeRandFGMRES.cpp + RandSketchingManager.cpp + RandSketchingCountSketch.cpp + RandSketchingFWHT.cpp + ) # C++ code that links to CUDA SDK libraries set(ReSolve_CUDASDK_SRC LinSolverDirectCuSolverGLU.cpp LinSolverDirectCuSolverRf.cpp + LinSolverDirectCuSparseILU0.cpp ) # HIP files set(ReSolve_ROCM_SRC LinSolverDirectRocSolverRf.cpp + LinSolverDirectRocSparseILU0.cpp ) # Header files to be installed set(ReSolve_HEADER_INSTALL @@ -37,13 +43,15 @@ set(ReSolve_HEADER_INSTALL LinSolverDirectCuSolverGLU.hpp LinSolverDirectCuSolverRf.hpp LinSolverDirectRocSolverRf.hpp + LinSolverDirectRocSparseILU0.hpp + LinSolverDirectCuSparseILU0.hpp LinSolverDirectKLU.hpp LinSolverIterativeFGMRES.hpp RefactorizationSolver.hpp SystemSolver.hpp GramSchmidt.hpp MemoryUtils.hpp -) + RandSketchingManager.hpp) # Now, build workspaces add_subdirectory(workspace) diff --git a/resolve/GramSchmidt.cpp b/resolve/GramSchmidt.cpp index 7a6572b3b..8385d9d39 100644 --- a/resolve/GramSchmidt.cpp +++ b/resolve/GramSchmidt.cpp @@ -151,32 +151,38 @@ namespace ReSolve vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_, memspace); // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, memspace ); - + mem_.deviceSynchronize(); + // copy H_col to aux, we will need it later vec_Hcolumn_->setDataUpdated(memory::DEVICE); vec_Hcolumn_->setCurrentSize(i + 1); vec_Hcolumn_->deepCopyVectorData(h_aux_, 0, memory::HOST); + mem_.deviceSynchronize(); //Hcol = V(:,1:i)^T*V(:,i+1); vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_, memspace); + mem_.deviceSynchronize(); // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, memspace ); + mem_.deviceSynchronize(); // copy H_col to H vec_Hcolumn_->setDataUpdated(memory::DEVICE); vec_Hcolumn_->deepCopyVectorData(&H[ idxmap(i, 0, num_vecs_ + 1)], 0, memory::HOST); + mem_.deviceSynchronize(); // add both pieces together (unstable otherwise, careful here!!) t = 0.0; for(int j = 0; j <= i; ++j) { - H[ idxmap(i, j, num_vecs_ + 1)] += h_aux_[j]; + H[ idxmap(i, j, num_vecs_ + 1)] += h_aux_[j]; } t = vector_handler_->dot(vec_v_, vec_v_, memspace); //set the last entry in Hessenberg matrix t = sqrt(t); H[ idxmap(i, i + 1, num_vecs_ + 1) ] = t; + if(fabs(t) > EPSILON) { t = 1.0/t; vector_handler_->scal(&t, vec_v_, memspace); diff --git a/resolve/GramSchmidt.hpp b/resolve/GramSchmidt.hpp index 7d7b93bef..68ea8b09b 100644 --- a/resolve/GramSchmidt.hpp +++ b/resolve/GramSchmidt.hpp @@ -1,6 +1,7 @@ #pragma once #include "Common.hpp" #include +#include #include #include namespace ReSolve @@ -42,6 +43,8 @@ namespace ReSolve vector_type* vec_v_{nullptr}; // aux variable vector_type* vec_w_{nullptr}; // aux variable + + MemoryHandler mem_; ///< Device memory manager object }; }//namespace diff --git a/resolve/LinSolverDirectCuSparseILU0.cpp b/resolve/LinSolverDirectCuSparseILU0.cpp new file mode 100644 index 000000000..cf3ea005b --- /dev/null +++ b/resolve/LinSolverDirectCuSparseILU0.cpp @@ -0,0 +1,312 @@ +#include +#include +#include "LinSolverDirectCuSparseILU0.hpp" +#include + +namespace ReSolve +{ + LinSolverDirectCuSparseILU0::LinSolverDirectCuSparseILU0(LinAlgWorkspaceCUDA* workspace) + { + workspace_ = workspace; + } + + LinSolverDirectCuSparseILU0::~LinSolverDirectCuSparseILU0() + { + mem_.deleteOnDevice(d_aux1_); + mem_.deleteOnDevice(d_aux2_); + mem_.deleteOnDevice(d_ILU_vals_); + } + + int LinSolverDirectCuSparseILU0::setup(matrix::Sparse* A, + matrix::Sparse*, + matrix::Sparse*, + index_type*, + index_type*, + vector_type* ) + { + //remember - P and Q are generally CPU variables + int error_sum = 0; + this->A_ = (matrix::Csr*) A; + index_type n = A_->getNumRows(); + + index_type nnz = A_->getNnzExpanded(); + mem_.allocateArrayOnDevice(&d_ILU_vals_,nnz); + //copy A values to a buffer first + mem_.copyArrayDeviceToDevice(d_ILU_vals_, A_->getValues(ReSolve::memory::DEVICE), nnz); + + mem_.allocateArrayOnDevice(&d_aux1_,n); + mem_.allocateArrayOnDevice(&d_aux2_,n); + cudaMemset(d_aux1_, 1, n*sizeof(double)); + cusparseCreateDnVec(&vec_X_, n, d_aux1_, CUDA_R_64F); + cusparseCreateDnVec(&vec_Y_, n, d_aux2_, CUDA_R_64F); + + //set up descriptors + + // Create matrix descriptor for A + cusparseCreateMatDescr(&descr_A_); + cusparseSetMatIndexBase(descr_A_, CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descr_A_, CUSPARSE_MATRIX_TYPE_GENERAL); + + // Create matrix descriptor for L and U + cusparseSpSV_createDescr(&descr_spsv_L_); + cusparseSpSV_createDescr(&descr_spsv_U_); + + cusparseCreateCsr(&mat_L_, + n, + n, + nnz, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + d_ILU_vals_, //vals_, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); + + cusparseCreateCsr(&mat_U_, + n, + n, + nnz, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + d_ILU_vals_, //vals_, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); + + + // Create matrix info structure + cusparseCreateCsrilu02Info(&info_A_); + + int buffer_size_A; + size_t buffer_size_L; + size_t buffer_size_U; + + status_cusparse_ = cusparseDcsrilu02_bufferSize(workspace_->getCusparseHandle(), + n, + nnz, + descr_A_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + &buffer_size_A); + + mem_.allocateBufferOnDevice(&buffer_,(size_t) buffer_size_A); + error_sum += status_cusparse_; + // Now analysis + status_cusparse_ = cusparseDcsrilu02_analysis(workspace_->getCusparseHandle(), + n, + nnz, + descr_A_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + CUSPARSE_SOLVE_POLICY_USE_LEVEL, + buffer_); + + error_sum += status_cusparse_; + // and now the actual decomposition + + // Compute incomplete LU factorization + status_cusparse_ = cusparseDcsrilu02(workspace_->getCusparseHandle(), + n, + nnz, + descr_A_, + d_ILU_vals_, //vals_ + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + CUSPARSE_SOLVE_POLICY_USE_LEVEL, + buffer_); + + error_sum += status_cusparse_; + + // now take care of LU solve + + // now create actual Sparse matrix OBJECTS for L and U + + cusparseFillMode_t fillmodeL = CUSPARSE_FILL_MODE_LOWER; + cusparseFillMode_t fillmodeU = CUSPARSE_FILL_MODE_UPPER; + + cusparseDiagType_t diagtypeL = CUSPARSE_DIAG_TYPE_UNIT; + cusparseDiagType_t diagtypeU = CUSPARSE_DIAG_TYPE_NON_UNIT; + + cusparseSpMatSetAttribute(mat_L_, + CUSPARSE_SPMAT_FILL_MODE, + &fillmodeL, + sizeof(fillmodeL)); + + cusparseSpMatSetAttribute(mat_U_, + CUSPARSE_SPMAT_FILL_MODE, + &fillmodeU, + sizeof(fillmodeU)); + + cusparseSpMatSetAttribute(mat_L_, + CUSPARSE_SPMAT_DIAG_TYPE, + &diagtypeL, + sizeof(diagtypeL)); + + cusparseSpMatSetAttribute(mat_U_, + CUSPARSE_SPMAT_DIAG_TYPE, + &diagtypeU, + sizeof(diagtypeU)); + + status_cusparse_ = cusparseSpSV_bufferSize(workspace_->getCusparseHandle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + &(constants::ONE), + mat_L_, + vec_X_, + vec_Y_, + CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, + descr_spsv_L_, + &buffer_size_L); + error_sum += status_cusparse_; + + mem_.allocateBufferOnDevice(&buffer_L_, buffer_size_L); + status_cusparse_ = cusparseSpSV_bufferSize(workspace_->getCusparseHandle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + &(constants::ONE), + mat_U_, + vec_X_, + vec_Y_, + CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, + descr_spsv_U_, + &buffer_size_U); + error_sum += status_cusparse_; + + mem_.allocateBufferOnDevice(&buffer_U_, buffer_size_U); + + status_cusparse_ = cusparseSpSV_analysis(workspace_->getCusparseHandle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + &(constants::ONE), + mat_L_, + vec_X_, + vec_Y_, + CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, + descr_spsv_L_, + buffer_L_); + error_sum += status_cusparse_; + + + status_cusparse_ = cusparseSpSV_analysis(workspace_->getCusparseHandle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + &(constants::ONE), + mat_U_, + vec_X_, + vec_Y_, + CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, + descr_spsv_U_, + buffer_U_); + + error_sum += status_cusparse_; + cusparseDestroyDnVec(vec_X_); + cusparseDestroyDnVec(vec_Y_); + return error_sum; + } + + int LinSolverDirectCuSparseILU0::reset(matrix::Sparse* A) + { + int error_sum = 0; + this->A_ = A; + index_type n = A_->getNumRows(); + index_type nnz = A_->getNnzExpanded(); + mem_.copyArrayDeviceToDevice(d_ILU_vals_, A_->getValues(ReSolve::memory::DEVICE), nnz); + + status_cusparse_ = cusparseDcsrilu02(workspace_->getCusparseHandle(), + n, + nnz, + descr_A_, + d_ILU_vals_, //vals_ + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + CUSPARSE_SOLVE_POLICY_USE_LEVEL, + buffer_); + //rerun tri solve analysis - to be updated + error_sum += status_cusparse_; + + return error_sum; + } + // solution is returned in RHS + int LinSolverDirectCuSparseILU0::solve(vector_type* rhs) + { + int error_sum = 0; + + cusparseCreateDnVec(&vec_X_, A_->getNumRows(), rhs->getData(ReSolve::memory::DEVICE), CUDA_R_64F); + cusparseCreateDnVec(&vec_Y_, A_->getNumRows(), d_aux1_, CUDA_R_64F); + + status_cusparse_ = cusparseSpSV_solve(workspace_->getCusparseHandle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + &(constants::ONE), + mat_L_, + vec_X_, + vec_Y_, + CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, + descr_spsv_L_); + error_sum += status_cusparse_; + + status_cusparse_ = cusparseSpSV_solve(workspace_->getCusparseHandle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + &(constants::ONE), + mat_U_, + vec_Y_, + vec_X_, + CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, + descr_spsv_U_); + error_sum += status_cusparse_; + + rhs->setDataUpdated(ReSolve::memory::DEVICE); + + cusparseDestroyDnVec(vec_X_); + cusparseDestroyDnVec(vec_Y_); + + return error_sum; + } + + int LinSolverDirectCuSparseILU0::solve(vector_type* rhs, vector_type* x) + { + int error_sum = 0; + + cusparseCreateDnVec(&vec_X_, A_->getNumRows(), rhs->getData(ReSolve::memory::DEVICE), CUDA_R_64F); + cusparseCreateDnVec(&vec_Y_, A_->getNumRows(), d_aux1_, CUDA_R_64F); + + status_cusparse_ = cusparseSpSV_solve(workspace_->getCusparseHandle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + &(constants::ONE), + mat_L_, + vec_X_, + vec_Y_, + CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, + descr_spsv_L_); + error_sum += status_cusparse_; + + cusparseCreateDnVec(&vec_X_, A_->getNumRows(), x->getData(ReSolve::memory::DEVICE), CUDA_R_64F); + status_cusparse_ = cusparseSpSV_solve(workspace_->getCusparseHandle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + &(constants::ONE), + mat_U_, + vec_Y_, + vec_X_, + CUDA_R_64F, + CUSPARSE_SPSV_ALG_DEFAULT, + descr_spsv_U_); + error_sum += status_cusparse_; + + x->setDataUpdated(ReSolve::memory::DEVICE); + + cusparseDestroyDnVec(vec_X_); + cusparseDestroyDnVec(vec_Y_); + + return error_sum; + } +}// namespace resolve diff --git a/resolve/LinSolverDirectCuSparseILU0.hpp b/resolve/LinSolverDirectCuSparseILU0.hpp new file mode 100644 index 000000000..1807bb991 --- /dev/null +++ b/resolve/LinSolverDirectCuSparseILU0.hpp @@ -0,0 +1,73 @@ +#pragma once +#include "Common.hpp" +#include "LinSolver.hpp" +#include +#include + +#include +#include + +namespace ReSolve +{ + // Forward declaration of vector::Vector class + namespace vector + { + class Vector; + } + + // Forward declaration of matrix::Sparse class + namespace matrix + { + class Sparse; + } + + class LinSolverDirectCuSparseILU0 : public LinSolverDirect + { + using vector_type = vector::Vector; + + public: + LinSolverDirectCuSparseILU0(LinAlgWorkspaceCUDA* workspace); + ~LinSolverDirectCuSparseILU0(); + + int setup(matrix::Sparse* A, + matrix::Sparse* L = nullptr, + matrix::Sparse* U = nullptr, + index_type* P = nullptr, + index_type* Q = nullptr, + vector_type* rhs = nullptr); + // if values of A change, but the nnz pattern does not, redo the analysis only (reuse buffers though) + int reset(matrix::Sparse* A); + + int solve(vector_type* rhs, vector_type* x); + int solve(vector_type* rhs);// the solutuon is returned IN RHS (rhs is overwritten) + + + private: + cusparseStatus_t status_cusparse_; + + MemoryHandler mem_; ///< Device memory manager object + LinAlgWorkspaceCUDA* workspace_; + + cusparseMatDescr_t descr_A_{nullptr}; + + cusparseSpMatDescr_t mat_L_; + cusparseSpMatDescr_t mat_U_; + + cusparseSpSVDescr_t descr_spsv_L_{nullptr}; + cusparseSpSVDescr_t descr_spsv_U_{nullptr}; + csrilu02Info_t info_A_{nullptr}; + + void* buffer_; + void* buffer_L_; + void* buffer_U_; + + real_type* d_aux1_; + real_type* d_aux2_; + + cusparseDnVecDescr_t vec_X_; + cusparseDnVecDescr_t vec_Y_; + + // since ILU OVERWRITES THE MATRIX values, we need a buffer to keep the values of ILU decomposition. + real_type* d_ILU_vals_; + }; +}// namespace diff --git a/resolve/LinSolverDirectRocSparseILU0.cpp b/resolve/LinSolverDirectRocSparseILU0.cpp new file mode 100644 index 000000000..8f15e0579 --- /dev/null +++ b/resolve/LinSolverDirectRocSparseILU0.cpp @@ -0,0 +1,293 @@ +#include +#include +#include "LinSolverDirectRocSparseILU0.hpp" + +#include + +namespace ReSolve +{ + LinSolverDirectRocSparseILU0::LinSolverDirectRocSparseILU0(LinAlgWorkspaceHIP* workspace) + { + workspace_ = workspace; + } + + LinSolverDirectRocSparseILU0::~LinSolverDirectRocSparseILU0() + { + mem_.deleteOnDevice(d_aux1_); + mem_.deleteOnDevice(d_ILU_vals_); + } + + int LinSolverDirectRocSparseILU0::setup(matrix::Sparse* A, + matrix::Sparse*, + matrix::Sparse*, + index_type*, + index_type*, + vector_type* ) + { + //remember - P and Q are generally CPU variables + int error_sum = 0; + this->A_ = (matrix::Csr*) A; + index_type n = A_->getNumRows(); + + index_type nnz = A_->getNnzExpanded(); + mem_.allocateArrayOnDevice(&d_ILU_vals_,nnz); + //copy A values to a buffer first + mem_.copyArrayDeviceToDevice(d_ILU_vals_, A_->getValues(ReSolve::memory::DEVICE), nnz); + + //set up descriptors + + // Create matrix descriptor for A + rocsparse_create_mat_descr(&descr_A_); + + // Create matrix descriptor for L + rocsparse_create_mat_descr(&descr_L_); + rocsparse_set_mat_fill_mode(descr_L_, rocsparse_fill_mode_lower); + rocsparse_set_mat_diag_type(descr_L_, rocsparse_diag_type_unit); + + // Create matrix descriptor for U + rocsparse_create_mat_descr(&descr_U_); + rocsparse_set_mat_fill_mode(descr_U_, rocsparse_fill_mode_upper); + rocsparse_set_mat_diag_type(descr_U_, rocsparse_diag_type_non_unit); + + // Create matrix info structure + rocsparse_create_mat_info(&info_A_); + + size_t buffer_size_A; + size_t buffer_size_L; + size_t buffer_size_U; + + status_rocsparse_ = rocsparse_dcsrilu0_buffer_size(workspace_->getRocsparseHandle(), + n, + nnz, + descr_A_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + &buffer_size_A); + if (status_rocsparse_ != 0) { + io::Logger::warning() << "Buffer size estimate for ILU0 failed with code: " <getRocsparseHandle(), + rocsparse_operation_none, + n, + nnz, + descr_L_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + &buffer_size_L); + if (status_rocsparse_ != 0) { + io::Logger::warning() << "Buffer size estimate for L solve failed with code: " <getRocsparseHandle(), + rocsparse_operation_none, + n, + nnz, + descr_U_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + &buffer_size_U); + if (status_rocsparse_ != 0) { + io::Logger::warning() << "Buffer size estimate for U solve failed with code: " <getRocsparseHandle(), + n, + nnz, + descr_A_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + rocsparse_analysis_policy_reuse, + rocsparse_solve_policy_auto, + buffer_); + + if (status_rocsparse_ != 0) { + io::Logger::warning() << "ILU0 decomposition analysis failed with code: " <getRocsparseHandle(), + rocsparse_operation_none, + n, + nnz, + descr_L_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + rocsparse_analysis_policy_reuse, + rocsparse_solve_policy_auto, + buffer_); + if (status_rocsparse_ != 0) { + io::Logger::warning() << "Solve analysis for L solve failed with code: " <getRocsparseHandle(), + rocsparse_operation_none, + n, + nnz, + descr_U_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + rocsparse_analysis_policy_reuse, + rocsparse_solve_policy_auto, + buffer_); + if (status_rocsparse_ != 0) { + io::Logger::warning() << "Solve analysis for U solve failed with code: " <getRocsparseHandle(), + n, + nnz, + descr_A_, + d_ILU_vals_, //vals_ + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + rocsparse_solve_policy_auto, + buffer_); + if (status_rocsparse_ != 0) { + io::Logger::warning() << "ILU0 decomposition failed with code: " <A_ = A; + index_type n = A_->getNumRows(); + index_type nnz = A_->getNnzExpanded(); + mem_.copyArrayDeviceToDevice(d_ILU_vals_, A_->getValues(ReSolve::memory::DEVICE), nnz); + + status_rocsparse_ = rocsparse_dcsrilu0(workspace_->getRocsparseHandle(), + n, + nnz, + descr_A_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + rocsparse_solve_policy_auto, + buffer_); + + error_sum += status_rocsparse_; + + return error_sum; + } + // solution is returned in RHS + int LinSolverDirectRocSparseILU0::solve(vector_type* rhs) + { + int error_sum = 0; + status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + A_->getNumRows(), + A_->getNnzExpanded(), + &(constants::ONE), + descr_L_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + rhs->getData(ReSolve::memory::DEVICE), + d_aux1_, //result + rocsparse_solve_policy_auto, + buffer_); + error_sum += status_rocsparse_; + + status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + A_->getNumRows(), + A_->getNnzExpanded(), + &(constants::ONE), + descr_U_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + d_aux1_, //input + rhs->getData(ReSolve::memory::DEVICE),//result + rocsparse_solve_policy_auto, + buffer_); + error_sum += status_rocsparse_; + rhs->setDataUpdated(ReSolve::memory::DEVICE); + + return error_sum; + } + + int LinSolverDirectRocSparseILU0::solve(vector_type* rhs, vector_type* x) + { + int error_sum = 0; + + + + status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + A_->getNumRows(), + A_->getNnzExpanded(), + &(constants::ONE), + descr_L_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + rhs->getData(ReSolve::memory::DEVICE), + d_aux1_, //result + rocsparse_solve_policy_auto, + buffer_); + error_sum += status_rocsparse_; + + status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + A_->getNumRows(), + A_->getNnzExpanded(), + &(constants::ONE), + descr_U_, + d_ILU_vals_, //vals_, + A_->getRowData(ReSolve::memory::DEVICE), + A_->getColData(ReSolve::memory::DEVICE), + info_A_, + d_aux1_, //input + x->getData(ReSolve::memory::DEVICE),//result + rocsparse_solve_policy_auto, + buffer_); + error_sum += status_rocsparse_; + + x->setDataUpdated(ReSolve::memory::DEVICE); + + return error_sum; + } +}// namespace resolve diff --git a/resolve/LinSolverDirectRocSparseILU0.hpp b/resolve/LinSolverDirectRocSparseILU0.hpp new file mode 100644 index 000000000..789e8e2cc --- /dev/null +++ b/resolve/LinSolverDirectRocSparseILU0.hpp @@ -0,0 +1,65 @@ + +#pragma once +#include "Common.hpp" +#include "LinSolver.hpp" +#include +#include + +#include +//#include +#include +#include +namespace ReSolve +{ + // Forward declaration of vector::Vector class + namespace vector + { + class Vector; + } + + // Forward declaration of matrix::Sparse class + namespace matrix + { + class Sparse; + } + + class LinSolverDirectRocSparseILU0 : public LinSolverDirect + { + using vector_type = vector::Vector; + + public: + LinSolverDirectRocSparseILU0(LinAlgWorkspaceHIP* workspace); + ~LinSolverDirectRocSparseILU0(); + + int setup(matrix::Sparse* A, + matrix::Sparse* L = nullptr, + matrix::Sparse* U = nullptr, + index_type* P = nullptr, + index_type* Q = nullptr, + vector_type* rhs = nullptr); + // if values of A change, but the nnz pattern does not, redo the analysis only (reuse buffers though) + int reset(matrix::Sparse* A); + + int solve(vector_type* rhs, vector_type* x); + int solve(vector_type* rhs);// the solutuon is returned IN RHS (rhs is overwritten) + + + private: + rocsparse_status status_rocsparse_; + + MemoryHandler mem_; ///< Device memory manager object + LinAlgWorkspaceHIP* workspace_; + + rocsparse_mat_descr descr_A_{nullptr}; + rocsparse_mat_descr descr_L_{nullptr}; + rocsparse_mat_descr descr_U_{nullptr}; + + rocsparse_mat_info info_A_{nullptr}; + + void* buffer_; + + real_type* d_aux1_; + // since ILU OVERWRITES THE MATRIX values, we need a buffer to keep the values of ILU decomposition. + real_type* d_ILU_vals_; + }; +}// namespace diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp index 40fdb22c1..c0dcf48d5 100644 --- a/resolve/LinSolverIterativeFGMRES.cpp +++ b/resolve/LinSolverIterativeFGMRES.cpp @@ -19,6 +19,7 @@ namespace ReSolve maxit_= 100; //default restart_ = 10; conv_cond_ = 0;//default + flexible_ = 1; d_V_ = nullptr; d_Z_ = nullptr; @@ -38,6 +39,7 @@ namespace ReSolve maxit_= 100; //default restart_ = 10; conv_cond_ = 0;//default + flexible_ = 1; d_V_ = nullptr; d_Z_ = nullptr; @@ -61,6 +63,7 @@ namespace ReSolve maxit_= maxit; restart_ = restart; conv_cond_ = conv_cond; + flexible_ = 1; d_V_ = nullptr; d_Z_ = nullptr; @@ -88,7 +91,12 @@ namespace ReSolve d_V_ = new vector_type(n_, restart_ + 1); d_V_->allocate(memory::DEVICE); - d_Z_ = new vector_type(n_, restart_ + 1); + if (flexible_) { + d_Z_ = new vector_type(n_, restart_ + 1); + } else { + // otherwise Z is just a one vector, not multivector and we dont keep it + d_Z_ = new vector_type(n_); + } d_Z_->allocate(memory::DEVICE); h_H_ = new real_type[restart_ * (restart_ + 1)]; h_c_ = new real_type[restart_]; // needed for givens @@ -127,7 +135,6 @@ namespace ReSolve rnorm = 0.0; bnorm = vector_handler_->dot(rhs, rhs, memspace_); rnorm = vector_handler_->dot(d_V_, d_V_, memspace_); - //rnorm = ||V_1|| rnorm = sqrt(rnorm); bnorm = sqrt(bnorm); @@ -173,9 +180,12 @@ namespace ReSolve it++; // Z_i = (LU)^{-1}*V_i - vec_v->setData( d_V_->getVectorData(i, memory::DEVICE), memory::DEVICE); - vec_z->setData( d_Z_->getVectorData(i, memory::DEVICE), memory::DEVICE); + if (flexible_) { + vec_z->setData( d_Z_->getVectorData(i, memory::DEVICE), memory::DEVICE); + } else { + vec_z->setData( d_Z_->getVectorData(0, memory::DEVICE), memory::DEVICE); + } this->precV(vec_v, vec_z); mem_.deviceSynchronize(); @@ -234,9 +244,24 @@ namespace ReSolve } // get solution - for(j = 0; j <= i; j++) { - vec_z->setData( d_Z_->getVectorData(j, memory::DEVICE), memory::DEVICE); - vector_handler_->axpy(&h_rs_[j], vec_z, x, memspace_); + if (flexible_) { + for(j = 0; j <= i; j++) { + vec_z->setData( d_Z_->getVectorData(j, memory::DEVICE), memory::DEVICE); + vector_handler_->axpy(&h_rs_[j], vec_z, x, memspace_); + } + } else { + mem_.setZeroArrayOnDevice(d_Z_->getData(memory::DEVICE), d_Z_->getSize()); + vec_z->setData( d_Z_->getVectorData(0, memory::DEVICE), memory::DEVICE); + for(j = 0; j <= i; j++) { + vec_v->setData( d_V_->getVectorData(j, memory::DEVICE), memory::DEVICE); + vector_handler_->axpy(&h_rs_[j], vec_v, vec_z, memspace_); + } + // now multiply d_Z by precon + + vec_v->setData( d_V_->getData(memory::DEVICE), memory::DEVICE); + this->precV(vec_z, vec_v); + // and add to x + vector_handler_->axpy(&ONE, vec_v, x, memspace_); } /* test solution */ @@ -292,6 +317,11 @@ namespace ReSolve return conv_cond_; } + bool LinSolverIterativeFGMRES::getFlexible() + { + return flexible_; + } + void LinSolverIterativeFGMRES::setTol(real_type new_tol) { this->tol_ = new_tol; @@ -311,6 +341,11 @@ namespace ReSolve { this->conv_cond_ = new_conv_cond; } + + void LinSolverIterativeFGMRES::setFlexible(bool new_flex) + { + this->flexible_ = new_flex; + } int LinSolverIterativeFGMRES::resetMatrix(matrix::Sparse* new_matrix) { diff --git a/resolve/LinSolverIterativeFGMRES.hpp b/resolve/LinSolverIterativeFGMRES.hpp index a9fc5058d..c9c140f9a 100644 --- a/resolve/LinSolverIterativeFGMRES.hpp +++ b/resolve/LinSolverIterativeFGMRES.hpp @@ -37,11 +37,13 @@ namespace ReSolve index_type getMaxit(); index_type getRestart(); index_type getConvCond(); + bool getFlexible(); void setTol(real_type new_tol); void setMaxit(index_type new_maxit); void setRestart(index_type new_restart); void setConvCond(index_type new_conv_cond); + void setFlexible(bool new_flexible); real_type getFinalResidualNorm(); real_type getInitResidualNorm(); @@ -57,7 +59,7 @@ namespace ReSolve index_type restart_; std::string orth_option_; index_type conv_cond_; - + bool flexible_{true}; // if can be run as "normal" GMRES if needed, set flexible_ to false. Default is true of course. vector_type* d_V_{nullptr}; vector_type* d_Z_{nullptr}; diff --git a/resolve/LinSolverIterativeRandFGMRES.cpp b/resolve/LinSolverIterativeRandFGMRES.cpp new file mode 100644 index 000000000..40cd08acc --- /dev/null +++ b/resolve/LinSolverIterativeRandFGMRES.cpp @@ -0,0 +1,502 @@ +#include +#include +#include +#include + +#include +#include +#include "LinSolverIterativeRandFGMRES.hpp" +#include +#include + +namespace ReSolve +{ + using out = io::Logger; + + LinSolverIterativeRandFGMRES::LinSolverIterativeRandFGMRES(std::string memspace) + { + memspace_ = memspace; + this->matrix_handler_ = nullptr; + this->vector_handler_ = nullptr; + this->rand_method_ = cs; + + tol_ = 1e-12; //default + maxit_= 100; //default + restart_ = 10; + conv_cond_ = 0;//default + flexible_ = 1; + + d_V_ = nullptr; + d_Z_ = nullptr; + } + + LinSolverIterativeRandFGMRES::LinSolverIterativeRandFGMRES(MatrixHandler* matrix_handler, + VectorHandler* vector_handler, + SketchingMethod rand_method, + GramSchmidt* gs, + std::string memspace) + { + memspace_ = memspace; + this->matrix_handler_ = matrix_handler; + this->vector_handler_ = vector_handler; + this->rand_method_ = rand_method; + this->GS_ = gs; + + tol_ = 1e-14; //default + maxit_= 100; //default + restart_ = 10; + conv_cond_ = 0;//default + flexible_ = 1; + + d_V_ = nullptr; + d_Z_ = nullptr; + } + + LinSolverIterativeRandFGMRES::LinSolverIterativeRandFGMRES(index_type restart, + real_type tol, + index_type maxit, + index_type conv_cond, + MatrixHandler* matrix_handler, + VectorHandler* vector_handler, + SketchingMethod rand_method, + GramSchmidt* gs, + std::string memspace) + { + memspace_ = memspace; + this->matrix_handler_ = matrix_handler; + this->vector_handler_ = vector_handler; + this->rand_method_ = rand_method; + this->GS_ = gs; + + tol_ = tol; + maxit_= maxit; + restart_ = restart; + conv_cond_ = conv_cond; + flexible_ = 1; + + d_V_ = nullptr; + d_Z_ = nullptr; + + } + + LinSolverIterativeRandFGMRES::~LinSolverIterativeRandFGMRES() + { + if (d_V_ != nullptr) { + // cudaFree(d_V_); + delete d_V_; + } + + if (d_Z_ != nullptr) { + // cudaFree(d_Z_); + delete d_Z_; + } + + } + + int LinSolverIterativeRandFGMRES::setup(matrix::Sparse* A) + { + this->A_ = A; + n_ = A_->getNumRows(); + + d_V_ = new vector_type(n_, restart_ + 1); + d_V_->allocate(memory::DEVICE); + if (flexible_) { + d_Z_ = new vector_type(n_, restart_ + 1); + } else { + // otherwise Z is just a one vector, not multivector and we dont keep it + d_Z_ = new vector_type(n_); + } + d_Z_->allocate(memory::DEVICE); + h_H_ = new real_type[restart_ * (restart_ + 1)]; + h_c_ = new real_type[restart_]; // needed for givens + h_s_ = new real_type[restart_]; // same + h_rs_ = new real_type[restart_ + 1]; // for residual norm history + + mem_.allocateArrayOnDevice(&d_aux_, restart_ + 1); + // rand method + k_rand_ = n_; + switch(rand_method_) { + case cs: + if(ceil(restart_ * log(n_)) < k_rand_) { + k_rand_ = static_cast(std::ceil(restart_ * std::log(static_cast(n_)))); + } + rand_manager_ = new RandSketchingCountSketch(); + //set k and n + break; + case fwht: + if ( ceil(2.0 * restart_ * log(n_) / log(restart_)) < k_rand_) { + k_rand_ = static_cast(std::ceil(2.0 * restart_ * std::log(n_) / std::log(restart_))); + } + rand_manager_ = new RandSketchingFWHT(); + break; + default: + io::Logger::warning() << "Wrong sketching method, setting to default (CountSketch)\n"; + rand_method_ = cs; + if(ceil(restart_ * log(n_)) < k_rand_) { + k_rand_ = static_cast(std::ceil(restart_ * std::log(n_))); + } + rand_manager_ = new RandSketchingCountSketch(); + break; + } + + rand_manager_->setup(n_, k_rand_); + + one_over_k_ = 1.0 / sqrt((real_type) k_rand_); + + d_S_ = new vector_type(k_rand_, restart_ + 1); + d_S_->allocate(memory::DEVICE); + if (rand_method_ == cs) { + d_S_->setToZero(memory::DEVICE); + } + return 0; + } + + int LinSolverIterativeRandFGMRES::solve(vector_type* rhs, vector_type* x) + { + using namespace constants; + + //io::Logger::setVerbosity(io::Logger::EVERYTHING); + + int outer_flag = 1; + int notconv = 1; + int i = 0; + int it = 0; + int j; + int k; + int k1; + + real_type t; + real_type rnorm; + real_type bnorm; + // real_type rnorm_aux; + real_type tolrel; + vector_type* vec_v = new vector_type(n_); + vector_type* vec_z = new vector_type(n_); + vector_type* vec_s = new vector_type(k_rand_); + //V[0] = b-A*x_0 + //debug + d_Z_->setToZero(memory::DEVICE); + d_V_->setToZero(memory::DEVICE); + + rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE); + matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE, "csr", memspace_); + + vec_v->setData( d_V_->getVectorData(0, memory::DEVICE), memory::DEVICE); + vec_s->setData( d_S_->getVectorData(0, memory::DEVICE), memory::DEVICE); + + rand_manager_->Theta(vec_v, vec_s); + + if (rand_method_ == fwht){ + // cublasDscal(cublas_handle, k_rand, &oneOverK, d_S, 1); + vector_handler_->scal(&one_over_k_, vec_s, memspace_); + } + mem_.deviceSynchronize(); + + rnorm = 0.0; + bnorm = vector_handler_->dot(rhs, rhs, memspace_); + rnorm = vector_handler_->dot(vec_s, vec_s, memspace_); + // double rnorm_true = vector_handler_->dot(vec_v, vec_v, memspace_); + //rnorm = ||V_1|| + rnorm = sqrt(rnorm); + bnorm = sqrt(bnorm); + io::Logger::misc() << "it 0: norm of residual " + << std::scientific << std::setprecision(16) + << rnorm << " Norm of rhs: " << bnorm << "\n"; + initial_residual_norm_ = rnorm; + while(outer_flag) { + // check if maybe residual is already small enough? + if(it == 0) { + tolrel = tol_ * rnorm; + if(fabs(tolrel) < 1e-16) { + tolrel = 1e-16; + } + } + int exit_cond = 0; + if (conv_cond_ == 0) { + exit_cond = ((fabs(rnorm - ZERO) <= EPSILON)); + } else if (conv_cond_ == 1) { + exit_cond = ((fabs(rnorm - ZERO) <= EPSILON) || (rnorm < tol_)); + } else if (conv_cond_ == 2) { + exit_cond = ((fabs(rnorm - ZERO) <= EPSILON) || (rnorm < (tol_*bnorm))); + } + if (exit_cond) { + outer_flag = 0; + final_residual_norm_ = rnorm; + initial_residual_norm_ = rnorm; + fgmres_iters_ = 0; + break; + } + + // normalize first vector + t = 1.0 / rnorm; + vector_handler_->scal(&t, d_V_, memspace_); + vector_handler_->scal(&t, d_S_, memspace_); + + mem_.deviceSynchronize(); + // initialize norm history + h_rs_[0] = rnorm; + i = -1; + notconv = 1; + + while((notconv) && (it < maxit_)) { + i++; + it++; + + // Z_i = (LU)^{-1}*V_i + vec_v->setData( d_V_->getVectorData(i, memory::DEVICE), memory::DEVICE); + if (flexible_) { + vec_z->setData( d_Z_->getVectorData(i, memory::DEVICE), memory::DEVICE); + } else { + vec_z->setData( d_Z_->getVectorData(0, memory::DEVICE), memory::DEVICE); + } + this->precV(vec_v, vec_z); + + mem_.deviceSynchronize(); + + // V_{i+1}=A*Z_i + + vec_v->setData( d_V_->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); + + matrix_handler_->matvec(A_, vec_z, vec_v, &ONE, &ZERO,"csr", memspace_); + + // orthogonalize V[i+1], form a column of h_H_ + // this is where it differs from normal solver GS + vec_s->setData( d_S_->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); + rand_manager_->Theta(vec_v, vec_s); + if (rand_method_ == fwht){ + // cublasDscal(cublas_handle, k_rand, &oneOverK, d_S, 1); + vector_handler_->scal(&one_over_k_, vec_s, memspace_); + } + mem_.deviceSynchronize(); + GS_->orthogonalize(k_rand_, d_S_, h_H_, i, memspace_); + // now post-process + //checkCudaErrors(cudaMemcpy(d_Hcolumn, &h_H[i * (restart + 1)], sizeof(double) * (i + 1), cudaMemcpyHostToDevice)); + mem_.copyArrayHostToDevice(d_aux_, &h_H_[i * (restart_ + 1)], i + 2); + vec_z->setData(d_aux_, memory::DEVICE); + vec_z->setCurrentSize(i + 1); + //V(:, i+1) =w-V(:, 1:i)*d_H_col = V(:, i+1)-d_H_col * V(:,1:i); + //checkCudaErrors( cublasDgemv(cublas_handle, CUBLAS_OP_N, n, i + 1, &minusone, d_V, n, d_Hcolumn, 1,&one , &d_V[n * (i + 1)], 1)); + + vector_handler_->gemv("N", n_, i + 1, &MINUSONE, &ONE, d_V_, vec_z, vec_v, memspace_ ); + + vec_z->setCurrentSize(n_); + t = 1.0 / h_H_[i * (restart_ + 1) + i + 1]; + vector_handler_->scal(&t, vec_v, memspace_); + mem_.deviceSynchronize(); + vec_s->setData( d_S_->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); + + if (i != 0) { + for (int k = 1; k <= i; k++) { + k1 = k - 1; + t = h_H_[i * (restart_ + 1) + k1]; + h_H_[i * (restart_ + 1) + k1] = h_c_[k1] * t + h_s_[k1] * h_H_[i * (restart_ + 1) + k]; + h_H_[i * (restart_ + 1) + k] = -h_s_[k1] * t + h_c_[k1] * h_H_[i * (restart_ + 1) + k]; + } + } // if i!=0 + double Hii = h_H_[i * (restart_ + 1) + i]; + double Hii1 = h_H_[(i) * (restart_ + 1) + i + 1]; + double gam = sqrt(Hii * Hii + Hii1 * Hii1); + + if(fabs(gam - ZERO) <= EPSILON) { + gam = EPSMAC; + } + + /* next Given's rotation */ + h_c_[i] = Hii / gam; + h_s_[i] = Hii1 / gam; + h_rs_[i + 1] = -h_s_[i] * h_rs_[i]; + h_rs_[i] = h_c_[i] * h_rs_[i]; + + h_H_[(i) * (restart_ + 1) + (i)] = h_c_[i] * Hii + h_s_[i] * Hii1; + h_H_[(i) * (restart_ + 1) + (i + 1)] = h_c_[i] * Hii1 - h_s_[i] * Hii; + + // residual norm estimate + rnorm = fabs(h_rs_[i + 1]); + + io::Logger::misc() << "it: "< norm of the residual " + << std::scientific << std::setprecision(16) + << rnorm << "\n"; + // check convergence + if (i + 1 >= restart_ || rnorm <= tolrel || it >= maxit_) { + notconv = 0; + } + } // inner while + + io::Logger::misc() << "End of cycle, ESTIMATED norm of residual " + << std::scientific << std::setprecision(16) + << rnorm << "\n"; + // solve tri system + h_rs_[i] = h_rs_[i] / h_H_[i * (restart_ + 1) + i]; + for (int ii = 2; ii <= i + 1; ii++) { + k = i - ii + 1; + k1 = k + 1; + t = h_rs_[k]; + for(j = k1; j <= i; j++) { + t -= h_H_[j * (restart_ + 1) + k] * h_rs_[j]; + } + h_rs_[k] = t / h_H_[k * (restart_ + 1) + k]; + } + + // get solution + if (flexible_) { + for (j = 0; j <= i; j++) { + vec_z->setData( d_Z_->getVectorData(j, memory::DEVICE), memory::DEVICE); + vector_handler_->axpy(&h_rs_[j], vec_z, x, memspace_); + } + } else { + mem_.setZeroArrayOnDevice(d_Z_->getData(memory::DEVICE), d_Z_->getSize()); + vec_z->setData( d_Z_->getVectorData(0, memory::DEVICE), memory::DEVICE); + for(j = 0; j <= i; j++) { + vec_v->setData( d_V_->getVectorData(j, memory::DEVICE), memory::DEVICE); + vector_handler_->axpy(&h_rs_[j], vec_v, vec_z, memspace_); + + } + // now multiply d_Z by precon + + vec_v->setData( d_V_->getData(memory::DEVICE), memory::DEVICE); + this->precV(vec_z, vec_v); + // and add to x + vector_handler_->axpy(&ONE, vec_v, x, memspace_); + } + + /* test solution */ + if(rnorm <= tolrel || it >= maxit_) { + // rnorm_aux = rnorm; + outer_flag = 0; + } + + rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE); + matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE,"csr", memspace_); + if (outer_flag) { + + rand_manager_->reset(); + + if (rand_method_ == cs) { + mem_.setZeroArrayOnDevice(d_S_->getData(memory::DEVICE), d_S_->getSize() * d_S_->getNumVectors()); + } + vec_v->setData( d_V_->getVectorData(0, memory::DEVICE), memory::DEVICE); + vec_s->setData( d_S_->getVectorData(0, memory::DEVICE), memory::DEVICE); + rand_manager_->Theta(vec_v, vec_s); + if (rand_method_ == fwht){ + // cublasDscal(cublas_handle, k_rand, &oneOverK, d_S, 1); + vector_handler_->scal(&one_over_k_, vec_s, memspace_); + } + mem_.deviceSynchronize(); + rnorm = vector_handler_->dot(d_S_, d_S_, memspace_); + // rnorm = ||S_0|| + rnorm = sqrt(rnorm); + } + + if (!outer_flag) { + rnorm = vector_handler_->dot(d_V_, d_V_, memspace_); + // rnorm = ||V_0|| + rnorm = sqrt(rnorm); + + io::Logger::misc() << "End of cycle, COMPUTED norm of residual " + << std::scientific << std::setprecision(16) + << rnorm << "\n"; + + final_residual_norm_ = rnorm; + fgmres_iters_ = it; + } + } // outer while + return 0; + } + + int LinSolverIterativeRandFGMRES::setupPreconditioner(std::string type, LinSolverDirect* LU_solver) + { + if (type != "LU") { + out::warning() << "Only cusolverRf tri solve can be used as a preconditioner at this time." << std::endl; + return 1; + } else { + LU_solver_ = LU_solver; + return 0; + } + + } + + real_type LinSolverIterativeRandFGMRES::getTol() + { + return tol_; + } + + index_type LinSolverIterativeRandFGMRES::getMaxit() + { + return maxit_; + } + + index_type LinSolverIterativeRandFGMRES::getRestart() + { + return restart_; + } + + index_type LinSolverIterativeRandFGMRES::getConvCond() + { + return conv_cond_; + } + + bool LinSolverIterativeRandFGMRES::getFlexible() + { + return flexible_; + } + + index_type LinSolverIterativeRandFGMRES::getKrand() + { + return k_rand_; + } + + void LinSolverIterativeRandFGMRES::setTol(real_type new_tol) + { + this->tol_ = new_tol; + } + + void LinSolverIterativeRandFGMRES::setMaxit(index_type new_maxit) + { + this->maxit_ = new_maxit; + } + + void LinSolverIterativeRandFGMRES::setRestart(index_type new_restart) + { + this->restart_ = new_restart; + } + + void LinSolverIterativeRandFGMRES::setConvCond(index_type new_conv_cond) + { + this->conv_cond_ = new_conv_cond; + } + + void LinSolverIterativeRandFGMRES::setFlexible(bool new_flex) + { + this->flexible_ = new_flex; + } + + int LinSolverIterativeRandFGMRES::resetMatrix(matrix::Sparse* new_matrix) + { + A_ = new_matrix; + matrix_handler_->setValuesChanged(true, memspace_); + return 0; + } + + + + void LinSolverIterativeRandFGMRES::precV(vector_type* rhs, vector_type* x) + { + LU_solver_->solve(rhs, x); + // x->update(rhs->getData(memory::DEVICE), memory::DEVICE, memory::DEVICE); + } + + real_type LinSolverIterativeRandFGMRES::getFinalResidualNorm() + { + return final_residual_norm_; + } + + real_type LinSolverIterativeRandFGMRES::getInitResidualNorm() + { + return initial_residual_norm_; + } + + index_type LinSolverIterativeRandFGMRES::getNumIter() + { + return fgmres_iters_; + } +} // namespace ReSolve diff --git a/resolve/LinSolverIterativeRandFGMRES.hpp b/resolve/LinSolverIterativeRandFGMRES.hpp new file mode 100644 index 000000000..f779f257b --- /dev/null +++ b/resolve/LinSolverIterativeRandFGMRES.hpp @@ -0,0 +1,102 @@ +#pragma once +#include "Common.hpp" +#include +#include +#include "LinSolver.hpp" +#include "GramSchmidt.hpp" +#include "RandSketchingManager.hpp" + +namespace ReSolve +{ + + class LinSolverIterativeRandFGMRES : public LinSolverIterative + { + using vector_type = vector::Vector; + + public: + + enum SketchingMethod { cs = 0, // count sketch + fwht = 1}; // fast Walsh-Hadamard transform + + LinSolverIterativeRandFGMRES(std::string memspace = "cuda"); + + LinSolverIterativeRandFGMRES( MatrixHandler* matrix_handler, + VectorHandler* vector_handler, + SketchingMethod rand_method, + GramSchmidt* gs, + std::string memspace = "cuda"); + + LinSolverIterativeRandFGMRES(index_type restart, + real_type tol, + index_type maxit, + index_type conv_cond, + MatrixHandler* matrix_handler, + VectorHandler* vector_handler, + SketchingMethod rand_method, + GramSchmidt* gs, + std::string memspace = "cuda"); + ~LinSolverIterativeRandFGMRES(); + + int solve(vector_type* rhs, vector_type* x); + int setup(matrix::Sparse* A); + int resetMatrix(matrix::Sparse* new_A); + int setupPreconditioner(std::string name, LinSolverDirect* LU_solver); + + real_type getTol(); + index_type getMaxit(); + index_type getRestart(); + index_type getConvCond(); + bool getFlexible(); + std::string getRandSketchingMethod(); + index_type getKrand(); + + void setTol(real_type new_tol); + void setMaxit(index_type new_maxit); + void setRestart(index_type new_restart); + void setConvCond(index_type new_conv_cond); + void setFlexible(bool new_flexible); + void getRandSketchingMethod(std::string rand_method); + + real_type getFinalResidualNorm(); + real_type getInitResidualNorm(); + index_type getNumIter(); + + private: + //remember matrix handler and vector handler are inherited. + + std::string memspace_; + + real_type tol_; + index_type maxit_; + index_type restart_; + std::string orth_option_; + index_type conv_cond_; + bool flexible_{1}; // if can be run as "normal" GMRES if needed, set flexible_ to 0. Default is 1 of course. + + vector_type* d_V_{nullptr}; + vector_type* d_Z_{nullptr}; + // for performing Gram-Schmidt + vector_type* d_S_{nullptr}; + + real_type* h_H_{nullptr}; + real_type* h_c_{nullptr}; + real_type* h_s_{nullptr}; + real_type* h_rs_{nullptr}; + real_type* d_aux_{nullptr}; + + + GramSchmidt* GS_; + void precV(vector_type* rhs, vector_type* x); //multiply the vector by preconditioner + LinSolverDirect* LU_solver_; + index_type n_;// for simplicity + real_type final_residual_norm_; + real_type initial_residual_norm_; + index_type fgmres_iters_; + real_type one_over_k_{1.0}; + + index_type k_rand_{0}; // size of sketch space, we need to know it so we can allocate S! + MemoryHandler mem_; ///< Device memory manager object + RandSketchingManager* rand_manager_; + SketchingMethod rand_method_; + }; +} diff --git a/resolve/RandSketchingCountSketch.cpp b/resolve/RandSketchingCountSketch.cpp new file mode 100644 index 000000000..7ce615505 --- /dev/null +++ b/resolve/RandSketchingCountSketch.cpp @@ -0,0 +1,104 @@ +#include "RandSketchingCountSketch.hpp" +#include +#include + +#ifdef RESOLVE_USE_HIP +#include +#endif +#ifdef RESOLVE_USE_CUDA +#include +#endif +#include + +namespace ReSolve +{ + RandSketchingCountSketch::RandSketchingCountSketch() + { + h_labels_ = nullptr; + h_flip_ = nullptr; + + d_labels_ = nullptr; + d_flip_ = nullptr; + } + + // destructor + RandSketchingCountSketch::~RandSketchingCountSketch() + { + delete h_labels_; + delete h_flip_; + mem_.deleteOnDevice(d_labels_); + mem_.deleteOnDevice(d_flip_); + } + + // Actual sketching process + int RandSketchingCountSketch::Theta(vector_type* input, vector_type* output) + { + mem_.deviceSynchronize(); + count_sketch_theta(n_, + k_rand_, + d_labels_, + d_flip_, + input->getData(ReSolve::memory::DEVICE), + output->getData(ReSolve::memory::DEVICE)); + mem_.deviceSynchronize(); + + + return 0; + } + + // Setup the parameters, sampling matrices, permuations, etc + int RandSketchingCountSketch::setup(index_type n, index_type k) + { + k_rand_ = k; + n_ = n; + srand(static_cast(time(nullptr))); + //allocate labeling scheme vector and move to GPU + + h_labels_ = new int[n_]; + //allocate sgn - a vector of flip signs + h_flip_ = new int[n_]; + + //populate labeling scheme (can be done on the gpu really) + //to be fixed, this can be done on the GPU + for (int i=0; i +#include + +namespace ReSolve { + + // Forward declaration of vector::Vector class + namespace vector + { + class Vector; + } + + class RandSketchingCountSketch : public RandSketchingManager + { + + using vector_type = vector::Vector; + public: + + // constructor + RandSketchingCountSketch(); + + // destructor + virtual ~RandSketchingCountSketch(); + + // Actual sketching process + virtual int Theta(vector_type* input, vector_type* output); + + // Setup the parameters, sampling matrices, permuations, etc + virtual int setup(index_type n, index_type k); + virtual int reset(); // if needed can be reset (like when Krylov method restarts) + + private: + index_type* h_labels_; + index_type* h_flip_; + + index_type* d_labels_; + index_type* d_flip_; + }; +} diff --git a/resolve/RandSketchingFWHT.cpp b/resolve/RandSketchingFWHT.cpp new file mode 100644 index 000000000..76e916740 --- /dev/null +++ b/resolve/RandSketchingFWHT.cpp @@ -0,0 +1,163 @@ +#include +#include + +#include "RandSketchingFWHT.hpp" +#include +#include +#include +#ifdef RESOLVE_USE_HIP +#include +#endif +#ifdef RESOLVE_USE_CUDA +#include +#endif +#include +namespace ReSolve +{ + using out = io::Logger; + + RandSketchingFWHT::RandSketchingFWHT() + { + h_seq_ = nullptr; + h_D_ = nullptr; + h_perm_ = nullptr; + + d_D_ = nullptr; + d_perm_ = nullptr; + d_aux_ = nullptr; + } + + // destructor + RandSketchingFWHT::~RandSketchingFWHT() + { + delete h_seq_; + delete h_D_; + delete h_perm_; + + mem_.deleteOnDevice(d_D_); + mem_.deleteOnDevice(d_perm_); + mem_.deleteOnDevice(d_aux_); + } + + // Actual sketching process + int RandSketchingFWHT::Theta(vector_type* input, vector_type* output) + { + mem_.setZeroArrayOnDevice(d_aux_, N_); + FWHT_scaleByD(n_, + d_D_, + input->getData(ReSolve::memory::DEVICE), + d_aux_); + + mem_.deviceSynchronize(); + FWHT(1, log2N_, d_aux_); + + mem_.deviceSynchronize(); + FWHT_select(k_rand_, + d_perm_, + d_aux_, + output->getData(ReSolve::memory::DEVICE)); + mem_.deviceSynchronize(); + // remember - scaling is the solver's problem + return 0; + } + + // Setup the parameters, sampling matrices, permuations, etc + int RandSketchingFWHT::setup(index_type n, index_type k) + { + k_rand_ = k; + n_ = n; + // pad to the nearest power of 2 + real_type N_real = std::pow(2.0, std::ceil(std::log(n_)/std::log(2.0))); + if (N_real > static_cast(std::numeric_limits::max())) { + out::error() << "Exceeded numerical limits of index_type ...\n"; + return 1; + } + N_ = static_cast(N_real); + log2N_ = static_cast(std::log2(N_real)); + one_over_k_ = 1.0/std::sqrt(static_cast(k_rand_)); + + srand(static_cast(time(nullptr))); + + h_seq_ = new int[N_]; + h_perm_ = new int[k_rand_]; + h_D_ = new int[n_]; + + int r; + int temp; + + for (int i = 0; i < N_; ++i) { + h_seq_[i] = i; + } + //Fisher-Yates + for (int i = N_ - 1; i > 0; i--) { + r = rand() % i; + temp = h_seq_[i]; + h_seq_[i] = h_seq_[r]; + h_seq_[r] = temp; + } + for (int i = 0; i < k_rand_; ++i) { + h_perm_[i] = h_seq_[i]; + } + + // and D + for (int i = 0; i < n_; ++i){ + r = rand() % 100; + if (r < 50){ + h_D_[i] = -1; + } else { + h_D_[i] = 1; + } + } + + mem_.allocateArrayOnDevice(&d_perm_, k_rand_); + mem_.allocateArrayOnDevice(&d_D_, n_); + mem_.allocateArrayOnDevice(&d_aux_, N_); + + //then copy + + mem_.copyArrayHostToDevice(d_perm_, h_perm_, k_rand_); + mem_.copyArrayHostToDevice(d_D_, h_D_, n_); + + return 0; + } + + //to be fixed, this can be done on the GPU + int RandSketchingFWHT::reset() // if needed can be reset (like when Krylov method restarts) + { + srand(static_cast(time(nullptr))); + + int r; + int temp; + + for (int i = 0; i < N_; ++i) { + h_seq_[i] = i; + } + //Fisher-Yates + for (int i = N_ - 1; i > 0; i--) { + r = rand() % i; + temp = h_seq_[i]; + h_seq_[i] = h_seq_[r]; + h_seq_[r] = temp; + } + for (int i = 0; i < k_rand_; ++i) { + h_perm_[i] = h_seq_[i]; + } + + // and D + for (int i = 0; i < n_; ++i){ + r = rand() % 100; + if (r < 50){ + h_D_[i] = -1; + } else { + h_D_[i] = 1; + } + } + + //and copy + + mem_.copyArrayHostToDevice(d_perm_, h_perm_, k_rand_); + mem_.copyArrayHostToDevice(d_D_, h_D_, n_); + + return 0; + } +} diff --git a/resolve/RandSketchingFWHT.hpp b/resolve/RandSketchingFWHT.hpp new file mode 100644 index 000000000..4cab9e510 --- /dev/null +++ b/resolve/RandSketchingFWHT.hpp @@ -0,0 +1,44 @@ +#pragma once +#include +#include + +namespace ReSolve { + // Forward declaration of vector::Vector class + namespace vector + { + class Vector; + } + + class RandSketchingFWHT : public RandSketchingManager + { + + using vector_type = vector::Vector; + public: + // constructor + + RandSketchingFWHT(); + + // destructor + virtual ~RandSketchingFWHT(); + + // Actual sketching process + virtual int Theta(vector_type* input, vector_type* output); + + // Setup the parameters, sampling matrices, permuations, etc + virtual int setup(index_type n, index_type k); + virtual int reset(); // if needed can be reset (like when Krylov method restarts) + + private: + index_type* h_seq_; + index_type* h_D_; + index_type* h_perm_; + + index_type* d_D_; + index_type* d_perm_; + real_type* d_aux_; + + index_type N_; + index_type log2N_; + real_type one_over_k_; + }; +} diff --git a/resolve/RandSketchingManager.cpp b/resolve/RandSketchingManager.cpp new file mode 100644 index 000000000..3cb426706 --- /dev/null +++ b/resolve/RandSketchingManager.cpp @@ -0,0 +1,33 @@ +// this is a virtual class +#include "RandSketchingManager.hpp" + +namespace ReSolve { + + // constructor + RandSketchingManager::RandSketchingManager() + { + k_rand_ = 0; + n_ = 0; + N_ = 0; + } + + // destructor + RandSketchingManager::~RandSketchingManager() + { + } + + index_type RandSketchingManager::getVectorSize() + { + return n_; + } + + index_type RandSketchingManager::getSketchSize() + { + return k_rand_; + } + + index_type RandSketchingManager::getPaddedSize() + { + return N_; + } +} diff --git a/resolve/RandSketchingManager.hpp b/resolve/RandSketchingManager.hpp new file mode 100644 index 000000000..baff7b0ed --- /dev/null +++ b/resolve/RandSketchingManager.hpp @@ -0,0 +1,36 @@ +// this is a virtual class +#pragma once +#include +#include +#include + +namespace ReSolve { + class RandSketchingManager { + using vector_type = vector::Vector; + + public: + // constructor + RandSketchingManager(); + + // destructor + virtual ~RandSketchingManager(); + + // Actual sketching process + virtual int Theta(vector_type* input, vector_type* output) = 0; + + // Setup the parameters, sampling matrices, permuations, etc + virtual int setup(index_type n, index_type k) = 0; + virtual int reset() = 0; + + virtual index_type getVectorSize(); + virtual index_type getSketchSize(); + virtual index_type getPaddedSize(); + + protected: + index_type n_;// size of base vector + index_type k_rand_; // size of sketched vector + index_type N_; // padded n -- generally N_ > n_ + + MemoryHandler mem_; ///< Device memory manager object + }; +} // namespace ReSolve diff --git a/resolve/cuda/cudaKernels.cu b/resolve/cuda/cudaKernels.cu index 023e114f3..4773464c9 100644 --- a/resolve/cuda/cudaKernels.cu +++ b/resolve/cuda/cudaKernels.cu @@ -1,4 +1,5 @@ #include "cudaKernels.h" +#include #define maxk 1024 #define Tv5 1024 //computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1 @@ -138,6 +139,173 @@ __global__ void matrixInfNormPart1(const int n, } } +// for count sketch sketching random method +__global__ void count_sketch_kernel(const int n, + const int k, + const int* labels, + const int* flip, + const double* input, + double* output){ + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < n){ + //printf("in place %d, I am putting input[perm[%d]] = input[%d] = %f \n", idx,idx, perm[idx], input[perm[idx]] ); + double val = input[idx]; + if (flip[idx] != 1){ + val *= -1.0; + } + atomicAdd(&output[labels[idx]], val); + idx += blockDim.x * gridDim.x; + } +} + +// for Walsh-Hadamard transform + +//kernel 1 +__global__ void select_kernel(const int k, + const int* perm, + const double* input, + double* output){ + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < k){ + //printf("in place %d, I am putting input[perm[%d]] = input[%d] = %f \n", idx,idx, perm[idx], input[perm[idx]] ); + output[idx] = input[perm[idx]]; + idx += blockDim.x * gridDim.x; + } +} + +//kernel 2 +__global__ void scaleByD_kernel(const int n, + const int* D, + const double* x, + double* y){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + while (idx < n){ + + if (D[idx] == 1) y[idx]=x[idx]; + else y[idx]= (-1.0)*x[idx]; + + idx += blockDim.x * gridDim.x; + } +} + +//kernels 3 and 4 + + +#define ELEMENTARY_LOG2SIZE 11 +namespace cg = cooperative_groups; +//////////////////////////////////////////////////////////////////////////////// +// Single in-global memory radix-4 Fast Walsh Transform pass +// (for strides exceeding elementary vector size) +//////////////////////////////////////////////////////////////////////////////// + +__global__ void fwtBatch2Kernel(double* d_Output, double* d_Input, int stride) +{ + const int pos = blockIdx.x * blockDim.x + threadIdx.x; + const int N = blockDim.x * gridDim.x * 4; + + double* d_Src = d_Input + blockIdx.y * N; + double* d_Dst = d_Output + blockIdx.y * N; + + int lo = pos & (stride - 1); + int i0 = ((pos - lo) << 2) + lo; + int i1 = i0 + stride; + int i2 = i1 + stride; + int i3 = i2 + stride; + + double D0 = d_Src[i0]; + double D1 = d_Src[i1]; + double D2 = d_Src[i2]; + double D3 = d_Src[i3]; + + double T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + d_Dst[i0] = D0 + D1; + d_Dst[i1] = T - D1; + T = D2; + d_Dst[i2] = D2 + D3; + d_Dst[i3] = T - D3; +} + + +__global__ void fwtBatch1Kernel(double* d_Output, double* d_Input, int log2N) +{ + // Handle to thread block group + + cg::thread_block cta = cg::this_thread_block(); + const int N = 1 << log2N; + const int base = blockIdx.x << log2N; + + //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80 + extern __shared__ double s_data[]; + double* d_Src = d_Input + base; + double* d_Dst = d_Output + base; + + for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { + s_data[pos] = d_Src[pos]; + } + + // Main radix-4 stages + const int pos = threadIdx.x; + + for (int stride = N >> 2; stride > 0; stride >>= 2) { + int lo = pos & (stride - 1); + int i0 = ((pos - lo) << 2) + lo; + int i1 = i0 + stride; + int i2 = i1 + stride; + int i3 = i2 + stride; + + cg::sync(cta); + double D0 = s_data[i0]; + double D1 = s_data[i1]; + double D2 = s_data[i2]; + double D3 = s_data[i3]; + + double T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + s_data[i0] = D0 + D1; + s_data[i1] = T - D1; + T = D2; + s_data[i2] = D2 + D3; + s_data[i3] = T - D3; + } + + // Do single radix-2 stage for odd power of two + if (log2N & 1) { + + cg::sync(cta); + + for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { + int i0 = pos << 1; + int i1 = i0 + 1; + + double D0 = s_data[i0]; + double D1 = s_data[i1]; + s_data[i0] = D0 + D1; + s_data[i1] = D0 - D1; + } + } + + cg::sync(cta); + + for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { + d_Dst[pos] = s_data[pos]; + } +} void mass_inner_product_two_vectors(int n, int i, @@ -148,6 +316,7 @@ void mass_inner_product_two_vectors(int n, { MassIPTwoVec_kernel<<>>(vec1, vec2, mvec, result, i + 1, n); } + void mass_axpy(int n, int i, double* x, double* y, double* alpha) { massAxpy3_kernel<<<(n + 384 - 1) / 384, 384>>>(n, i + 1, x, y, alpha); @@ -160,4 +329,44 @@ void matrix_row_sums(int n, double* result) { matrixInfNormPart1<<<1000,1024>>>(n, nnz, a_ia, a_val, result); -} \ No newline at end of file +} + +void count_sketch_theta(int n, + int k, + int* labels, + int* flip, + double* input, + double* output) +{ + + count_sketch_kernel<<<10000, 1024>>>(n, k, labels, flip, input, output); +} + +void FWHT_select(int k, + int* perm, + double* input, + double* output) +{ + select_kernel<<<1000,1024>>>(k, perm, input, output); +} + +void FWHT_scaleByD(int n, + int* D, + double* x, + double* y) +{ + scaleByD_kernel<<<1000,1024>>>(n, D, x, y); +} + +void FWHT(int M, int log2N, double* d_Data) { + + const int THREAD_N = 1024; + int N = 1 << log2N; + dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); + + for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { + fwtBatch2Kernel<<>>(d_Data, d_Data, N / 4); + } + + fwtBatch1Kernel<<>>(d_Data, d_Data, log2N); +} diff --git a/resolve/cuda/cudaKernels.h b/resolve/cuda/cudaKernels.h index 9c48783a8..460118f9d 100644 --- a/resolve/cuda/cudaKernels.h +++ b/resolve/cuda/cudaKernels.h @@ -12,3 +12,23 @@ void matrix_row_sums(int n, int* a_ia, double* a_val, double* result); + +// needed for rand solver +void count_sketch_theta(int n, + int k, + int* labels, + int* flip, + double* input, + double* output); + +void FWHT_select(int k, + int* perm, + double* input, + double* output); + +void FWHT_scaleByD(int n, + int* D, + double* x, + double* y); + +void FWHT(int M, int log2N, double* d_Data); diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h index 986efc841..43872be67 100644 --- a/resolve/hip/hipKernels.h +++ b/resolve/hip/hipKernels.h @@ -19,7 +19,28 @@ void permuteVectorP(int n, int* perm_vector, double* vec_in, double* vec_out); + void permuteVectorQ(int n, int* perm_vector, double* vec_in, double* vec_out); + +// needed for rand solver +void count_sketch_theta(int n, + int k, + int* labels, + int* flip, + double* input, + double* output); + +void FWHT_select(int k, + int* perm, + double* input, + double* output); + +void FWHT_scaleByD(int n, + int* D, + double* x, + double* y); + +void FWHT(int M, int log2N, double* d_Data); diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip index 3fd2a8a0d..2b4d65293 100644 --- a/resolve/hip/hipKernels.hip +++ b/resolve/hip/hipKernels.hip @@ -1,6 +1,7 @@ #include "hipKernels.h" #include +#include //computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1 template @@ -152,7 +153,7 @@ __global__ void permuteVectorP_kernel(const int n, int idx = blockIdx.x*blockDim.x + threadIdx.x; while (idx> 2; stride > 0; stride >>= 2) { + int lo = pos & (stride - 1); + int i0 = ((pos - lo) << 2) + lo; + int i1 = i0 + stride; + int i2 = i1 + stride; + int i3 = i2 + stride; + + cg::sync(cta); + double D0 = s_data[i0]; + double D1 = s_data[i1]; + double D2 = s_data[i2]; + double D3 = s_data[i3]; + + double T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + s_data[i0] = D0 + D1; + s_data[i1] = T - D1; + T = D2; + s_data[i2] = D2 + D3; + s_data[i3] = T - D3; + } + + // Do single radix-2 stage for odd power of two + if (log2N & 1) { + + cg::sync(cta); + + for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { + int i0 = pos << 1; + int i1 = i0 + 1; + + double D0 = s_data[i0]; + double D1 = s_data[i1]; + s_data[i0] = D0 + D1; + s_data[i1] = D0 - D1; + } + } + + cg::sync(cta); + + for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { + d_Dst[pos] = s_data[pos]; + } +} + + + void mass_inner_product_two_vectors(int n, int i, double* vec1, @@ -208,3 +379,43 @@ void permuteVectorQ(int n, { hipLaunchKernelGGL(permuteVectorQ_kernel, dim3(1000), dim3(1024), 0, 0, n, perm_vector, vec_in, vec_out); } + +void count_sketch_theta(int n, + int k, + int* labels, + int* flip, + double* input, + double* output) +{ + + hipLaunchKernelGGL(count_sketch_kernel, dim3(10000), dim3(1024), 0, 0, n, k, labels, flip, input, output); +} + +void FWHT_select(int k, + int* perm, + double* input, + double* output) +{ + hipLaunchKernelGGL(select_kernel, dim3(1000), dim3(1024), 0, 0,k, perm, input, output); +} + +void FWHT_scaleByD(int n, + int* D, + double* x, + double* y) +{ + hipLaunchKernelGGL(scaleByD_kernel, dim3(1000), dim3(1024), 0, 0, n, D, x, y); +} + +void FWHT(int M, int log2N, double* d_Data) { + + const int THREAD_N = 1024; + int N = 1 << log2N; + dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); + + for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { + hipLaunchKernelGGL(fwtBatch2Kernel, grid, dim3(THREAD_N), 0, 0, d_Data, d_Data, N / 4); + } + + hipLaunchKernelGGL(fwtBatch1Kernel, dim3(M), dim3(N / 4), N * sizeof(double), 0, d_Data, d_Data, log2N); +} diff --git a/resolve/vector/VectorHandlerHip.cpp b/resolve/vector/VectorHandlerHip.cpp index 1e1195fc8..9689a2bb3 100644 --- a/resolve/vector/VectorHandlerHip.cpp +++ b/resolve/vector/VectorHandlerHip.cpp @@ -51,6 +51,7 @@ namespace ReSolve { rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); double nrm = 0.0; rocblas_status st= rocblas_ddot (handle_rocblas, x->getSize(), x->getData(memory::DEVICE), 1, y->getData(memory::DEVICE), 1, &nrm); + if (st!=0) {printf("dot product crashed with code %d \n", st);} return nrm; } @@ -68,6 +69,7 @@ namespace ReSolve { LinAlgWorkspaceHIP* workspaceHIP = workspace_; rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); rocblas_status st = rocblas_dscal(handle_rocblas, x->getSize(), alpha, x->getData(memory::DEVICE), 1); + if (st!=0) { ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; } @@ -94,6 +96,7 @@ namespace ReSolve { 1, y->getData(memory::DEVICE), 1); + } /** @@ -153,6 +156,7 @@ namespace ReSolve { x->getData(memory::DEVICE), 1); } + } /** @@ -172,6 +176,7 @@ namespace ReSolve { using namespace constants; if (k < 200) { mass_axpy(size, k, x->getData(memory::DEVICE), y->getData(memory::DEVICE),alpha->getData(memory::DEVICE)); + } else { LinAlgWorkspaceHIP* workspaceHIP = workspace_; rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); @@ -189,6 +194,7 @@ namespace ReSolve { &ONE, y->getData(memory::DEVICE), // c size); // ldc + } } @@ -213,6 +219,7 @@ namespace ReSolve { if (k < 200) { mass_inner_product_two_vectors(size, k, x->getData(memory::DEVICE) , x->getData(1, memory::DEVICE), V->getData(memory::DEVICE), res->getData(memory::DEVICE)); + } else { LinAlgWorkspaceHIP* workspaceHIP = workspace_; rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); @@ -230,6 +237,7 @@ namespace ReSolve { &ZERO, res->getData(memory::DEVICE), //c k + 1); //ldc + } } diff --git a/tests/functionality/CMakeLists.txt b/tests/functionality/CMakeLists.txt index 85b47fd72..7d2aaa1b7 100644 --- a/tests/functionality/CMakeLists.txt +++ b/tests/functionality/CMakeLists.txt @@ -23,7 +23,10 @@ if(RESOLVE_USE_CUDA) # Build KLU+GLU test add_executable(klu_glu_test.exe testKLU_GLU.cpp) target_link_libraries(klu_glu_test.exe PRIVATE ReSolve) - + + # Build randSolver test + add_executable(rand_gmres_cuda_test.exe testRandGMRES_Cuda.cpp) + target_link_libraries(rand_gmres_cuda_test.exe PRIVATE ReSolve) endif(RESOLVE_USE_CUDA) @@ -37,6 +40,9 @@ if(RESOLVE_USE_HIP) add_executable(rocsolver_rf_fgmres_test.exe testKLU_RocSolver_FGMRES.cpp) target_link_libraries(rocsolver_rf_fgmres_test.exe PRIVATE ReSolve) + # Build randSolver test + add_executable(rand_gmres_hip_test.exe testRandGMRES_Rocm.cpp) + target_link_libraries(rand_gmres_hip_test.exe PRIVATE ReSolve) endif(RESOLVE_USE_HIP) # Install tests @@ -46,13 +52,15 @@ if(RESOLVE_USE_CUDA) set(installable_tests ${installable_tests} klu_rf_test.exe klu_rf_fgmres_test.exe - klu_glu_test.exe) + klu_glu_test.exe + rand_gmres_cuda_test.exe) endif(RESOLVE_USE_CUDA) if(RESOLVE_USE_HIP) set(installable_tests ${installable_tests} rocsolver_rf_test.exe - rocsolver_rf_fgmres_test.exe) + rocsolver_rf_fgmres_test.exe + rand_gmres_hip_test.exe) endif(RESOLVE_USE_HIP) install(TARGETS ${installable_tests} @@ -68,9 +76,11 @@ if(RESOLVE_USE_CUDA) add_test(NAME klu_rf_test COMMAND $ "${test_data_dir}") add_test(NAME klu_rf_fgmres_test COMMAND $ "${test_data_dir}") add_test(NAME klu_glu_test COMMAND $ "${test_data_dir}") + add_test(NAME rand_gmres_cuda_test COMMAND $) endif(RESOLVE_USE_CUDA) if(RESOLVE_USE_HIP) add_test(NAME rocsolver_rf_test COMMAND $ "${test_data_dir}") add_test(NAME rocsolver_rf_fgmres_test COMMAND $ "${test_data_dir}") + add_test(NAME rand_gmres_hip_test COMMAND $) endif(RESOLVE_USE_HIP) diff --git a/tests/functionality/testRandGMRES_Cuda.cpp b/tests/functionality/testRandGMRES_Cuda.cpp new file mode 100644 index 000000000..b4cdffec0 --- /dev/null +++ b/tests/functionality/testRandGMRES_Cuda.cpp @@ -0,0 +1,231 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ReSolve::constants; +using real_type = ReSolve::real_type; +using index_type = ReSolve::index_type; +using vector_type = ReSolve::vector::Vector; + +ReSolve::matrix::Csr* generateMatrix(const index_type N); +ReSolve::vector::Vector* generateRhs(const index_type N); + +int main(int argc, char *argv[]) +{ + // Use the same data types as those you specified in ReSolve build. + + + //we want error sum to be 0 at the end + //that means PASS. + //otheriwse it is a FAIL. + int error_sum = 0; + int status; + const index_type N = (argc == 2) ? atoi(argv[1]) : 10000; + ReSolve::matrix::Csr* A = generateMatrix(N); + + vector_type* vec_rhs = generateRhs(N); + + ReSolve::LinAlgWorkspaceCUDA* workspace_CUDA = new ReSolve::LinAlgWorkspaceCUDA(); + workspace_CUDA->initializeHandles(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); + + vector_type* vec_x; + + ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2); + + ReSolve::LinSolverDirectCuSparseILU0* Rf = new ReSolve::LinSolverDirectCuSparseILU0(workspace_CUDA); + ReSolve::LinSolverIterativeRandFGMRES* FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::cs, GS, "cuda"); + + + vec_x = new vector_type(A->getNumRows()); + vec_x->allocate(ReSolve::memory::HOST); + + //iinit guess is 0 + vec_x->allocate(ReSolve::memory::DEVICE); + vec_x->setToZero(ReSolve::memory::DEVICE); + + real_type norm_b; + matrix_handler->setValuesChanged(true, "cuda"); + + status = Rf->setup(A); + error_sum += status; + + FGMRES->setRestart(200); + FGMRES->setMaxit(2500); + FGMRES->setTol(1e-12); + FGMRES->setup(A); + + status = GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); + error_sum += status; + + //matrix_handler->setValuesChanged(true, "cuda"); + status = FGMRES->resetMatrix(A); + error_sum += status; + + status = FGMRES->setupPreconditioner("LU", Rf); + error_sum += status; + + FGMRES->setFlexible(1); + + FGMRES->solve(vec_rhs, vec_x); + + norm_b = vector_handler->dot(vec_rhs, vec_rhs, "cuda"); + norm_b = std::sqrt(norm_b); + real_type final_norm_first = FGMRES->getFinalResidualNorm(); + std::cout << "Randomized FGMRES results (first run): \n" + << "\t Sketching method: : CountSketch\n" + << "\t Initial residual norm: ||b-Ax_0||_2 : " + << std::scientific << std::setprecision(16) + << FGMRES->getInitResidualNorm()<<" \n" + << "\t Initial relative residual norm: ||b-Ax_0||_2/||b||_2 : " + << FGMRES->getInitResidualNorm()/norm_b<<" \n" + << "\t Final residual norm: ||b-Ax||_2 : " + << FGMRES->getFinalResidualNorm() <<" \n" + << "\t Final relative residual norm: ||b-Ax||_2/||b||_2 : " + << FGMRES->getFinalResidualNorm()/norm_b <<" \n" + << "\t Number of iterations : " << FGMRES->getNumIter() << "\n"; + + delete FGMRES; + delete GS; + GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2); + FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::fwht, GS, "cuda"); + + + FGMRES->setRestart(150); + FGMRES->setMaxit(2500); + FGMRES->setTol(1e-12); + FGMRES->setup(A); + + status = GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); + error_sum += status; + + status = FGMRES->setupPreconditioner("LU", Rf); + error_sum += status; + + vec_x->setToZero(ReSolve::memory::DEVICE); + FGMRES->solve(vec_rhs, vec_x); + + + std::cout << "Randomized FGMRES results (second run): \n" + << "\t Sketching method: : FWHT\n" + << "\t Initial residual norm: ||b-Ax_0||_2 : " + << std::scientific << std::setprecision(16) + << FGMRES->getInitResidualNorm()<<" \n" + << "\t Initial relative residual norm: ||b-Ax_0||_2/||b||_2 : " + << FGMRES->getInitResidualNorm()/norm_b<<" \n" + << "\t Final residual norm: ||b-Ax||_2 : " + << FGMRES->getFinalResidualNorm() <<" \n" + << "\t Final relative residual norm: ||b-Ax||_2/||b||_2 : " + << FGMRES->getFinalResidualNorm()/norm_b <<" \n" + << "\t Number of iterations : " << FGMRES->getNumIter() << "\n"; + + if ((error_sum == 0) && (final_norm_first/norm_b < 1e-11) && (FGMRES->getFinalResidualNorm()/norm_b < 1e-11 )) { + std::cout<<"Test 5 (randomized GMRES) PASSED"<allocate(ReSolve::memory::HOST); + vec_rhs->allocate(ReSolve::memory::DEVICE); + + real_type* data = vec_rhs->getData(ReSolve::memory::HOST); + for (int i = 0; i < N; ++i) { + if (i % 2) { + data[i] = 1.0; + } else { + + data[i] = -111.0; + } + } + vec_rhs->copyData(ReSolve::memory::HOST, ReSolve::memory::DEVICE); + return vec_rhs; +} + +ReSolve::matrix::Csr* generateMatrix(const index_type N) +{ + std::vector r1 = {1., 5., 7., 8., 3., 2., 4.}; // sum 30 + std::vector r2 = {1., 3., 2., 2., 1., 6., 7., 3., 2., 3.}; // sum 30 + std::vector r3 = {11., 15., 4.}; // sum 30 + std::vector r4 = {1., 1., 5., 1., 9., 2., 1., 2., 3., 2., 3.}; // sum 30 + std::vector r5 = {6., 5., 7., 3., 2., 5., 2.}; // sum 30 + + + const std::vector > data = {r1, r2, r3, r4, r5}; + + // std::cout << N << "\n"; + + // First compute number of nonzeros + index_type NNZ = 0; + for (index_type i = 0; i < N; ++i) + { + size_t reminder = static_cast(i%5); + NNZ += static_cast(data[reminder].size()); + } + + // Allocate NxN CSR matrix with NNZ nonzeros + ReSolve::matrix::Csr* A = new ReSolve::matrix::Csr(N, N, NNZ); + A->allocateMatrixData(ReSolve::memory::HOST); + + index_type* rowptr = A->getRowData(ReSolve::memory::HOST); + index_type* colidx = A->getColData(ReSolve::memory::HOST); + real_type* val = A->getValues(ReSolve::memory::HOST); + + // Populate CSR matrix using same row pattern as for NNZ calculation + rowptr[0] = 0; + index_type where; + real_type what; + for (index_type i=0; i < N; ++i) + { + size_t reminder = static_cast(i%5); + const std::vector& row_sample = data[reminder]; + index_type nnz_per_row = static_cast(row_sample.size()); + + rowptr[i+1] = rowptr[i] + nnz_per_row; + bool c = false; + for (index_type j = rowptr[i]; j < rowptr[i+1]; ++j) + { + if (((!c) && (((j - rowptr[i]) * N/nnz_per_row + (N%(N/nnz_per_row))) >= i)) || ((!c) && (j == (rowptr[i+1] - 1)) )) { + c = true; + where = i; + what = 4.; + } else { + where = (j - rowptr[i]) * N/nnz_per_row + (N%(N/nnz_per_row)); + what = row_sample[static_cast(j - rowptr[i])]; + } + colidx[j] = where; + // evenly distribute nonzeros ^^^^ ^^^^^^^^ perturb offset + val[j] = what; + } + } + + + A->setUpdated(ReSolve::memory::HOST); + A->copyData(ReSolve::memory::DEVICE); + return A; +} diff --git a/tests/functionality/testRandGMRES_Rocm.cpp b/tests/functionality/testRandGMRES_Rocm.cpp new file mode 100644 index 000000000..a5eb2313b --- /dev/null +++ b/tests/functionality/testRandGMRES_Rocm.cpp @@ -0,0 +1,231 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ReSolve::constants; +using real_type = ReSolve::real_type; +using index_type = ReSolve::index_type; +using vector_type = ReSolve::vector::Vector; + +ReSolve::matrix::Csr* generateMatrix(const index_type N); +ReSolve::vector::Vector* generateRhs(const index_type N); + +int main(int argc, char *argv[]) +{ + // Use the same data types as those you specified in ReSolve build. + + + //we want error sum to be 0 at the end + //that means PASS. + //otheriwse it is a FAIL. + int error_sum = 0; + int status; + const index_type N = (argc == 2) ? atoi(argv[1]) : 10000; + ReSolve::matrix::Csr* A = generateMatrix(N); + + vector_type* vec_rhs = generateRhs(N); + + ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP(); + workspace_HIP->initializeHandles(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_HIP); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_HIP); + + vector_type* vec_x; + + ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2); + + ReSolve::LinSolverDirectRocSparseILU0* Rf = new ReSolve::LinSolverDirectRocSparseILU0(workspace_HIP); + ReSolve::LinSolverIterativeRandFGMRES* FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::cs, GS, "hip"); + + + vec_x = new vector_type(A->getNumRows()); + vec_x->allocate(ReSolve::memory::HOST); + + //iinit guess is 0 + vec_x->allocate(ReSolve::memory::DEVICE); + vec_x->setToZero(ReSolve::memory::DEVICE); + + real_type norm_b; + matrix_handler->setValuesChanged(true, "hip"); + + status = Rf->setup(A); + error_sum += status; + + FGMRES->setRestart(200); + FGMRES->setMaxit(2500); + FGMRES->setTol(1e-12); + FGMRES->setup(A); + + status = GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); + error_sum += status; + + //matrix_handler->setValuesChanged(true, "hip"); + status = FGMRES->resetMatrix(A); + error_sum += status; + + status = FGMRES->setupPreconditioner("LU", Rf); + error_sum += status; + + FGMRES->setFlexible(1); + + FGMRES->solve(vec_rhs, vec_x); + + norm_b = vector_handler->dot(vec_rhs, vec_rhs, "hip"); + norm_b = std::sqrt(norm_b); + real_type final_norm_first = FGMRES->getFinalResidualNorm(); + std::cout << "Randomized FGMRES results (first run): \n" + << "\t Sketching method: : CountSketch\n" + << "\t Initial residual norm: ||b-Ax_0||_2 : " + << std::scientific << std::setprecision(16) + << FGMRES->getInitResidualNorm()<<" \n" + << "\t Initial relative residual norm: ||b-Ax_0||_2/||b||_2 : " + << FGMRES->getInitResidualNorm()/norm_b<<" \n" + << "\t Final residual norm: ||b-Ax||_2 : " + << FGMRES->getFinalResidualNorm() <<" \n" + << "\t Final relative residual norm: ||b-Ax||_2/||b||_2 : " + << FGMRES->getFinalResidualNorm()/norm_b <<" \n" + << "\t Number of iterations : " << FGMRES->getNumIter() << "\n"; + + delete FGMRES; + delete GS; + GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2); + FGMRES = new ReSolve::LinSolverIterativeRandFGMRES(matrix_handler, vector_handler,ReSolve::LinSolverIterativeRandFGMRES::fwht, GS, "hip"); + + + FGMRES->setRestart(150); + FGMRES->setMaxit(2500); + FGMRES->setTol(1e-12); + FGMRES->setup(A); + + status = GS->setup(FGMRES->getKrand(), FGMRES->getRestart()); + error_sum += status; + + status = FGMRES->setupPreconditioner("LU", Rf); + error_sum += status; + + vec_x->setToZero(ReSolve::memory::DEVICE); + FGMRES->solve(vec_rhs, vec_x); + + + std::cout << "Randomized FGMRES results (second run): \n" + << "\t Sketching method: : FWHT\n" + << "\t Initial residual norm: ||b-Ax_0||_2 : " + << std::scientific << std::setprecision(16) + << FGMRES->getInitResidualNorm()<<" \n" + << "\t Initial relative residual norm: ||b-Ax_0||_2/||b||_2 : " + << FGMRES->getInitResidualNorm()/norm_b<<" \n" + << "\t Final residual norm: ||b-Ax||_2 : " + << FGMRES->getFinalResidualNorm() <<" \n" + << "\t Final relative residual norm: ||b-Ax||_2/||b||_2 : " + << FGMRES->getFinalResidualNorm()/norm_b <<" \n" + << "\t Number of iterations : " << FGMRES->getNumIter() << "\n"; + + if ((error_sum == 0) && (final_norm_first/norm_b < 1e-11) && (FGMRES->getFinalResidualNorm()/norm_b < 1e-11 )) { + std::cout<<"Test 5 (randomized GMRES) PASSED"<allocate(ReSolve::memory::HOST); + vec_rhs->allocate(ReSolve::memory::DEVICE); + + real_type* data = vec_rhs->getData(ReSolve::memory::HOST); + for (int i = 0; i < N; ++i) { + if (i % 2) { + data[i] = 1.0; + } else { + + data[i] = -111.0; + } + } + vec_rhs->copyData(ReSolve::memory::HOST, ReSolve::memory::DEVICE); + return vec_rhs; +} + +ReSolve::matrix::Csr* generateMatrix(const index_type N) +{ + std::vector r1 = {1., 5., 7., 8., 3., 2., 4.}; // sum 30 + std::vector r2 = {1., 3., 2., 2., 1., 6., 7., 3., 2., 3.}; // sum 30 + std::vector r3 = {11., 15., 4.}; // sum 30 + std::vector r4 = {1., 1., 5., 1., 9., 2., 1., 2., 3., 2., 3.}; // sum 30 + std::vector r5 = {6., 5., 7., 3., 2., 5., 2.}; // sum 30 + + + const std::vector > data = {r1, r2, r3, r4, r5}; + + // std::cout << N << "\n"; + + // First compute number of nonzeros + index_type NNZ = 0; + for (index_type i = 0; i < N; ++i) + { + size_t reminder = static_cast(i%5); + NNZ += static_cast(data[reminder].size()); + } + + // Allocate NxN CSR matrix with NNZ nonzeros + ReSolve::matrix::Csr* A = new ReSolve::matrix::Csr(N, N, NNZ); + A->allocateMatrixData(ReSolve::memory::HOST); + + index_type* rowptr = A->getRowData(ReSolve::memory::HOST); + index_type* colidx = A->getColData(ReSolve::memory::HOST); + real_type* val = A->getValues(ReSolve::memory::HOST); + + // Populate CSR matrix using same row pattern as for NNZ calculation + rowptr[0] = 0; + index_type where; + real_type what; + for (index_type i=0; i < N; ++i) + { + size_t reminder = static_cast(i%5); + const std::vector& row_sample = data[reminder]; + index_type nnz_per_row = static_cast(row_sample.size()); + + rowptr[i+1] = rowptr[i] + nnz_per_row; + bool c = false; + for (index_type j = rowptr[i]; j < rowptr[i+1]; ++j) + { + if (((!c) && (((j - rowptr[i]) * N/nnz_per_row + (N%(N/nnz_per_row))) >= i)) || ((!c) && (j == (rowptr[i+1] - 1)) )) { + c = true; + where = i; + what = 4.; + } else { + where = (j - rowptr[i]) * N/nnz_per_row + (N%(N/nnz_per_row)); + what = row_sample[static_cast(j - rowptr[i])]; + } + colidx[j] = where; + // evenly distribute nonzeros ^^^^ ^^^^^^^^ perturb offset + val[j] = what; + } + } + + + A->setUpdated(ReSolve::memory::HOST); + A->copyData(ReSolve::memory::DEVICE); + return A; +}