From 0cb0b5c1628b56bff69577a9273b86c865aa0107 Mon Sep 17 00:00:00 2001 From: kswirydo Date: Mon, 20 Apr 2026 17:40:04 -0400 Subject: [PATCH 1/3] Port equality constraint Jacobian from PETSc to RAJA GPU kernels Replace the PETSc-based equality constraint Jacobian computation in the PBPOLRAJAHIOPSPARSE model with direct GPU kernels using RAJA, eliminating the D2H-compute-H2D round trip. The sparsity pattern is now computed on the host during setup and the values are computed entirely on device. Key changes: - Add ComputeEqJacValuesGPU_PBPOLRAJAHIOPSPARSE in new gpu.cpp/hpp files - Add device arrays for flat-array indices (bus eqjacsp_selfidx, line eqjacsp_idx/eqjacsp_diag_idx/isdcline, gen xpdevidx/xpsetidx) - Fix nnz counting bugs (missing gen/load entries, off-by-one in line loop) and populate flat-array indices during model setup - Replace PETSc MatGetRow extraction in sparsity and values phases - Handle parallel lines by sharing off-diagonal positions with atomicAdd - Use pre-computed nnz in get_sparse_blocks_info instead of PETSc query - Add correctness test (test_eqjac_compare) and performance benchmark (test_eqjac_perf) Made-with: Cursor --- src/opflow/CMakeLists.txt | 3 +- .../model/power_bal_hiop/paramsrajahiop.cpp | 43 +++ .../model/power_bal_hiop/paramsrajahiop.h | 13 + .../power_bal_hiop/pbpolrajahiopsparse.cpp | 250 +++++++++++++- .../pbpolrajahiopsparse_gpu.cpp | 211 ++++++++++++ .../pbpolrajahiopsparse_gpu.hpp | 16 + .../pbpolrajahiopsparsekernels.cpp | 234 ++++++++++---- .../solver/hiop/opflow_hiopsparsegpu.cpp | 16 +- .../equality/CMakeLists.txt | 34 ++ .../equality/test_eqjac_compare.cpp | 306 ++++++++++++++++++ .../equality/test_eqjac_perf.cpp | 241 ++++++++++++++ 11 files changed, 1289 insertions(+), 78 deletions(-) create mode 100644 tests/unit/opflow/constraint_jacobian/equality/test_eqjac_compare.cpp create mode 100644 tests/unit/opflow/constraint_jacobian/equality/test_eqjac_perf.cpp diff --git a/src/opflow/CMakeLists.txt 
b/src/opflow/CMakeLists.txt index 863f45f4..fd767aea 100644 --- a/src/opflow/CMakeLists.txt +++ b/src/opflow/CMakeLists.txt @@ -41,7 +41,8 @@ if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_CUDA) set_source_files_properties( model/power_bal_hiop/pbpolrajahiopkernels.cpp model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp - model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp PROPERTIES LANGUAGE CUDA + model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp PROPERTIES LANGUAGE + CUDA ) endif() diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp b/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp index fbe44e4e..29d4e582 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp @@ -21,6 +21,7 @@ int BUSParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(bl); h_allocator_.deallocate(xidx); h_allocator_.deallocate(gidx); + h_allocator_.deallocate(eqjacsp_selfidx); if (opflow->include_powerimbalance_variables) { h_allocator_.deallocate(xidxpimb); h_allocator_.deallocate(powerimbalance_penalty); @@ -45,6 +46,7 @@ int BUSParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(bl_dev_); d_allocator_.deallocate(xidx_dev_); d_allocator_.deallocate(gidx_dev_); + d_allocator_.deallocate(eqjacsp_selfidx_dev_); if (opflow->include_powerimbalance_variables) { d_allocator_.deallocate(xidxpimb_dev_); d_allocator_.deallocate(powerimbalance_penalty_dev_); @@ -84,6 +86,7 @@ int BUSParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(xidx_dev_, xidx); resmgr.copy(gidx_dev_, gidx); + resmgr.copy(eqjacsp_selfidx_dev_, eqjacsp_selfidx); if (opflow->include_powerimbalance_variables) { resmgr.copy(xidxpimb_dev_, xidxpimb); resmgr.copy(jacsp_idx_dev_, jacsp_idx); @@ -108,6 +111,7 @@ int BUSParamsRajaHiop::copy(OPFLOW opflow) { xidx_dev_ = xidx; xidxpimb_dev_ = xidxpimb; gidx_dev_ = gidx; + eqjacsp_selfidx_dev_ = eqjacsp_selfidx; jacsp_idx_dev_ = jacsp_idx; jacsq_idx_dev_ = jacsq_idx; powerimbalance_penalty_dev_ = 
powerimbalance_penalty; @@ -145,6 +149,7 @@ int BUSParamsRajaHiop::allocate(OPFLOW opflow) { xidx = paramAlloc(h_allocator_, nbus); gidx = paramAlloc(h_allocator_, nbus); + eqjacsp_selfidx = paramAlloc(h_allocator_, 2 * nbus); if (opflow->include_powerimbalance_variables) { xidxpimb = paramAlloc(h_allocator_, nbus); @@ -232,6 +237,7 @@ int BUSParamsRajaHiop::allocate(OPFLOW opflow) { xidx_dev_ = paramAlloc(d_allocator_, nbus); gidx_dev_ = paramAlloc(d_allocator_, nbus); + eqjacsp_selfidx_dev_ = paramAlloc(d_allocator_, 2 * nbus); if (opflow->include_powerimbalance_variables) { xidxpimb_dev_ = paramAlloc(d_allocator_, nbus); @@ -278,6 +284,9 @@ int LINEParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(ineqjacsp_idx_dev_, ineqjacsp_idx); resmgr.copy(xslackidx_dev_, xslackidx); } + resmgr.copy(eqjacsp_idx_dev_, eqjacsp_idx); + resmgr.copy(eqjacsp_diag_idx_dev_, eqjacsp_diag_idx); + resmgr.copy(isdcline_dev_, isdcline); #else Gff_dev_ = Gff; Bff_dev_ = Bff; @@ -299,6 +308,9 @@ int LINEParamsRajaHiop::copy(OPFLOW opflow) { ineqjacsp_idx_dev_ = ineqjacsp_idx; xslackidx_dev_ = xslackidx; } + eqjacsp_idx_dev_ = eqjacsp_idx; + eqjacsp_diag_idx_dev_ = eqjacsp_diag_idx; + isdcline_dev_ = isdcline; #endif return 0; } @@ -328,6 +340,9 @@ int LINEParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(ineqjacsp_idx); h_allocator_.deallocate(xslackidx); } + h_allocator_.deallocate(eqjacsp_idx); + h_allocator_.deallocate(eqjacsp_diag_idx); + h_allocator_.deallocate(isdcline); #ifdef EXAGO_ENABLE_GPU // Destroy parameter arrays on the device @@ -354,6 +369,9 @@ int LINEParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(ineqjacsp_idx_dev_); d_allocator_.deallocate(xslackidx_dev_); } + d_allocator_.deallocate(eqjacsp_idx_dev_); + d_allocator_.deallocate(eqjacsp_diag_idx_dev_); + d_allocator_.deallocate(isdcline_dev_); #endif return 0; @@ -404,6 +422,10 @@ int LINEParamsRajaHiop::allocate(OPFLOW opflow) { xslackidx = paramAlloc(h_allocator_, nlinelim); } + 
eqjacsp_idx = paramAlloc(h_allocator_, nlineON); + eqjacsp_diag_idx = paramAlloc(h_allocator_, 4 * nlineON); + isdcline = paramAlloc(h_allocator_, nlineON); + PetscInt j = 0; /* Populate arrays */ for (i = 0; i < ps->nline; i++) { @@ -451,6 +473,7 @@ int LINEParamsRajaHiop::allocate(OPFLOW opflow) { j++; } + isdcline[linei] = (int)line->isdcline; linei++; } @@ -480,6 +503,10 @@ int LINEParamsRajaHiop::allocate(OPFLOW opflow) { ineqjacsp_idx_dev_ = paramAlloc(d_allocator_, nlinelim); xslackidx_dev_ = paramAlloc(d_allocator_, nlinelim); } + + eqjacsp_idx_dev_ = paramAlloc(d_allocator_, nlineON); + eqjacsp_diag_idx_dev_ = paramAlloc(d_allocator_, 4 * nlineON); + isdcline_dev_ = paramAlloc(d_allocator_, nlineON); #endif return 0; } @@ -630,6 +657,8 @@ int GENParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(gineqidxgen); h_allocator_.deallocate(gbineqidxgen); h_allocator_.deallocate(pgs); + h_allocator_.deallocate(xpdevidx); + h_allocator_.deallocate(xpsetidx); h_allocator_.deallocate(eqjacspgen_idx); h_allocator_.deallocate(ineqjacspgen_idx); } @@ -656,6 +685,8 @@ int GENParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(gineqidxgen_dev_); d_allocator_.deallocate(gbineqidxgen_dev_); d_allocator_.deallocate(pgs_dev_); + d_allocator_.deallocate(xpdevidx_dev_); + d_allocator_.deallocate(xpsetidx_dev_); d_allocator_.deallocate(eqjacspgen_idx_dev_); d_allocator_.deallocate(ineqjacspgen_idx_dev_); } @@ -697,6 +728,8 @@ int GENParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(eqjacspgen_idx_dev_, eqjacspgen_idx); resmgr.copy(ineqjacspgen_idx_dev_, ineqjacspgen_idx); resmgr.copy(pgs_dev_, pgs); + resmgr.copy(xpdevidx_dev_, xpdevidx); + resmgr.copy(xpsetidx_dev_, xpsetidx); } #else cost_alpha_dev_ = cost_alpha; @@ -721,6 +754,8 @@ int GENParamsRajaHiop::copy(OPFLOW opflow) { eqjacspgen_idx_dev_ = eqjacspgen_idx; ineqjacspgen_idx_dev_ = ineqjacspgen_idx; pgs_dev_ = pgs; + xpdevidx_dev_ = xpdevidx; + xpsetidx_dev_ = xpsetidx; #endif return 0; } @@ 
-771,6 +806,8 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { eqjacspgen_idx = paramAlloc(h_allocator_, ngenON); ineqjacspgen_idx = paramAlloc(h_allocator_, ngenON); pgs = paramAlloc(h_allocator_, ngenON); + xpdevidx = paramAlloc(h_allocator_, ngenON); + xpsetidx = paramAlloc(h_allocator_, ngenON); } /* Insert data in genparams */ @@ -798,6 +835,10 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { vs[geni] = gen->vs; if (opflow->has_gensetpoint) { pgs[geni] = gen->pgs; + if (!gen->isrenewable) { + xpdevidx[geni] = opflow->idxn2sd_map[gen->startxpdevloc]; + xpsetidx[geni] = opflow->idxn2sd_map[gen->startxpsetloc]; + } } xidx[geni] = opflow->idxn2sd_map[loc]; @@ -844,6 +885,8 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { eqjacspgen_idx_dev_ = paramAlloc(d_allocator_, ngenON); ineqjacspgen_idx_dev_ = paramAlloc(d_allocator_, ngenON); pgs_dev_ = paramAlloc(d_allocator_, ngenON); + xpdevidx_dev_ = paramAlloc(d_allocator_, ngenON); + xpsetidx_dev_ = paramAlloc(d_allocator_, ngenON); } #endif return 0; diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.h b/src/opflow/model/power_bal_hiop/paramsrajahiop.h index d274f990..13692c65 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.h +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.h @@ -34,6 +34,8 @@ struct BUSParamsRajaHiop { int *ineqjacsp_idx; /* KS: index in flat sparse ineq Jacobian array */ int *genoffset; /* KS: Offset into flattened gen array for this bus */ int *ngenONbus; /* KS: Number of ON generators on this bus */ + int *eqjacsp_selfidx; /* Flat-array position for bus self-admittance in eq + Jacobian. 
[2*i] = P-row base, [2*i+1] = Q-row base */ // Device data int *isref_dev_; /* isref[i] = 1 if bus is reference bus */ @@ -59,6 +61,7 @@ struct BUSParamsRajaHiop { int *ineqjacsp_idx_dev_; /* KS: device counterpart of ineqjacsp_idx_ */ int *genoffset_dev_; /* KS: dev counterpart of genoffset */ int *ngenONbus_dev_; /* KS: dev counterpart of ngenONbus */ + int *eqjacsp_selfidx_dev_; int allocate(OPFLOW); int destroy(OPFLOW); @@ -84,6 +87,7 @@ struct GENParamsRajaHiop { double *pgs; /* real power output setpoint */ double *apf; /* generator AGC participation factor */ double *vs; /* voltage setpoint */ + int *xpsetidx; /* Starting locations in X vector for set-point variables */ int *isrenewable; /* Is renewable generator? */ int *xidx; /* starting locations in X vector */ @@ -119,6 +123,7 @@ struct GENParamsRajaHiop { double *pgs_dev_; /* real power output setpoint */ double *apf_dev_; /* KS: device counterpart of apf */ double *vs_dev_; /* KS: device counterpart of vs */ + int *xpsetidx_dev_; int *isrenewable_dev_; /* Is renewable generator? 
 */ int *xidx_dev_; /* starting locations in X vector */ @@ -213,6 +218,11 @@ struct LINEParamsRajaHiop { int *linelimidx; /* Indices for subset of lines that have finite limits */ int *ineqjacsp_idx; /* KS: Position in flat sparse ineq Jacobian array */ int *xslackidx; /* Starting location of slack variables in X vector */ + int *eqjacsp_idx; /* Flat-array offset for off-diagonal eq Jacobian entries */ + int *eqjacsp_diag_idx; /* Flat-array positions for diagonal entries per line + [4*l+0]=from P-row, [4*l+1]=from Q-row, + [4*l+2]=to P-row, [4*l+3]=to Q-row */ + int *isdcline; /* isdcline[i] = 1 if line is a DC line */ // Device data double *Gff_dev_; /* From side self conductance */ @@ -238,6 +248,9 @@ struct LINEParamsRajaHiop { linelimidx_dev_; /* Indices for subset of lines that have finite limits */ int *ineqjacsp_idx_dev_; /* KS: Position in flat sparse ineq Jacobian array */ int *xslackidx_dev_; /* Starting location of slack variables in X vector */ + int *eqjacsp_idx_dev_; + int *eqjacsp_diag_idx_dev_; + int *isdcline_dev_; int allocate(OPFLOW); int destroy(OPFLOW); diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp index cac87986..4a3b3806 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp @@ -3,6 +3,8 @@ #if defined(EXAGO_ENABLE_RAJA) #if defined(EXAGO_ENABLE_HIOP_SPARSE) +#include + #include #include "pbpolrajahiopsparsekernels.hpp" @@ -272,6 +274,7 @@ PetscErrorCode OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE(OPFLOW opflow) { * from PETSc via get_sparse_blocks_info; inequality count is computed * explicitly so we can skip PETSc */ int nnz_eqjacsp = 0, nnz_ineqjacsp = 0, nnz_hesssp = 0; + int nnz_ineqjac = 0, nnz_hess = 0; /* * KS: Count inequality Jacobian non-zeros.
The traversal order must match @@ -333,6 +336,237 @@ PetscErrorCode OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE(OPFLOW opflow) { } } + /* Compute the number of nonzeros in equality constraint Jacobian + * and populate flat-array indices for the GPU kernel. */ + geni = 0; + int loadi = 0; + + for (int ibus = 0; ibus < ps->nbus; ++ibus) { + PSBUS bus = &(ps->bus[ibus]); + + /* P-row: bus self-admittance (theta, Vm) */ + busparams->eqjacsp_selfidx[2 * ibus] = nnz_eqjacsp; + nnz_eqjacsp += 2; + + if (bus->ide == ISOLATED_BUS) { + /* Q-row: bus self-admittance for isolated bus */ + busparams->eqjacsp_selfidx[2 * ibus + 1] = nnz_eqjacsp; + nnz_eqjacsp += 2; + continue; + } + + /* P-row: power imbalance variables */ + if (opflow->include_powerimbalance_variables) { + busparams->jacsp_idx[ibus] = nnz_eqjacsp; + nnz_eqjacsp += 2; + } + + /* P-row: generator Pg entries (-1 per active gen) */ + int gi = 0; + for (int k = 0; k < bus->ngen; k++) { + PSGEN gen; + ierr = PSBUSGetGen(bus, k, &gen); + CHKERRQ(ierr); + if (!gen->status) + continue; + genparams->eqjacspbus_idx[geni + gi] = nnz_eqjacsp; + nnz_eqjacsp += 1; + gi++; + } + + /* P-row: load loss entries (-1 per load) */ + if (opflow->include_loadloss_variables) { + for (int k = 0; k < bus->nload; k++) { + PSLOAD load; + ierr = PSBUSGetLoad(bus, k, &load); + CHKERRQ(ierr); + loadparams->jacsp_idx[loadi + k] = nnz_eqjacsp; + nnz_eqjacsp += 1; + } + } + + /* Q-row: bus self-admittance (theta, Vm) */ + busparams->eqjacsp_selfidx[2 * ibus + 1] = nnz_eqjacsp; + nnz_eqjacsp += 2; + + /* Q-row: power imbalance variables */ + if (opflow->include_powerimbalance_variables) { + busparams->jacsq_idx[ibus] = nnz_eqjacsp; + nnz_eqjacsp += 2; + } + + /* Q-row: generator Qg entries (-1 per active gen) */ + gi = 0; + for (int k = 0; k < bus->ngen; k++) { + PSGEN gen; + ierr = PSBUSGetGen(bus, k, &gen); + CHKERRQ(ierr); + if (!gen->status) + continue; + genparams->eqjacsqbus_idx[geni + gi] = nnz_eqjacsp; + nnz_eqjacsp += 1; + gi++; + } + + /* 
Q-row: load loss entries (-1 per load) */ + if (opflow->include_loadloss_variables) { + for (int k = 0; k < bus->nload; k++) { + PSLOAD load; + ierr = PSBUSGetLoad(bus, k, &load); + CHKERRQ(ierr); + loadparams->jacsq_idx[loadi + k] = nnz_eqjacsp; + nnz_eqjacsp += 1; + } + } + + geni += bus->ngenON; + loadi += bus->nload; + } + + /* Line off-diagonal entries: 8 per unique (from_bus, to_bus) pair. + Parallel lines (same bus pair) share the same off-diagonal positions + and accumulate via atomicAdd in the GPU kernel. */ + int linei = 0; + std::map, int> buspair_to_offdiag; + for (int iline = 0; iline < ps->nline; ++iline) { + PSLINE line = &(ps->line[iline]); + if (!line->status) + continue; + + if (!line->isdcline) { + const PSBUS *connbuses; + ierr = PSLINEGetConnectedBuses(line, &connbuses); + CHKERRQ(ierr); + PSBUS busf = connbuses[0]; + PSBUS bust = connbuses[1]; + int busidxf = (int)(busf - ps->bus); + int busidxt = (int)(bust - ps->bus); + + lineparams->eqjacsp_diag_idx[4 * linei + 0] = + busparams->eqjacsp_selfidx[2 * busidxf]; + lineparams->eqjacsp_diag_idx[4 * linei + 1] = + busparams->eqjacsp_selfidx[2 * busidxf + 1]; + lineparams->eqjacsp_diag_idx[4 * linei + 2] = + busparams->eqjacsp_selfidx[2 * busidxt]; + lineparams->eqjacsp_diag_idx[4 * linei + 3] = + busparams->eqjacsp_selfidx[2 * busidxt + 1]; + + auto key = std::make_pair(std::min(busidxf, busidxt), + std::max(busidxf, busidxt)); + auto it = buspair_to_offdiag.find(key); + if (it != buspair_to_offdiag.end()) { + lineparams->eqjacsp_idx[linei] = it->second; + } else { + lineparams->eqjacsp_idx[linei] = nnz_eqjacsp; + buspair_to_offdiag[key] = nnz_eqjacsp; + nnz_eqjacsp += 8; + } + } + + linei++; + } + + /* Generator set-point equality constraint entries */ + if (opflow->has_gensetpoint) { + geni = 0; + for (int ibus = 0; ibus < ps->nbus; ++ibus) { + PSBUS bus = &(ps->bus[ibus]); + int gi = 0; + for (int k = 0; k < bus->ngen; k++) { + PSGEN gen; + ierr = PSBUSGetGen(bus, k, &gen); + CHKERRQ(ierr); + 
if (!gen->status) + continue; + if (!gen->isrenewable) { + genparams->eqjacspgen_idx[geni + gi] = nnz_eqjacsp; + nnz_eqjacsp += 4; + } + gi++; + } + geni += bus->ngenON; + } + } + + if (opflow->has_gensetpoint) { + for (int ibus = 0; ibus < ps->nbus; ++ibus) { + PSBUS bus = &(ps->bus[ibus]); + for (int bgen = 0; bgen < bus->ngen; ++bgen) { + PSGEN gen; + ierr = PSBUSGetGen(bus, bgen, &gen); + CHKERRQ(ierr); + + if (!gen->status) + continue; + + nnz_ineqjac += 6; + } + } + } + + if (opflow->genbusvoltagetype == FIXED_WITHIN_QBOUNDS) { + for (int ibus = 0; ibus < ps->nbus; ++ibus) { + PSBUS bus = &(ps->bus[ibus]); + if (bus->ide == PV_BUS || bus->ide == REF_BUS) { + nnz_ineqjac += 2; + } + } + } + + for (int iline = 0; iline < opflow->nlinesmon; ++iline) { + nnz_ineqjac += 8; + } + + for (int ibus = 0; ibus < ps->nbus; ++ibus) { + // reserve 2 real and 2 reactive entries for each bus + // 3 upper triangular + nnz_hess += 3; + + if (opflow->include_powerimbalance_variables) { + nnz_hess += 2; + } + } + + for (int i = 0; i < ps->ngen; ++i) { + PSGEN gen = &(ps->gen[i]); + + if (!gen->status) + continue; + + nnz_hess += 2; + + if (opflow->has_gensetpoint) { + if (gen->isrenewable) + continue; + + // later ... 
+ // if (opflow->use_agc) { + // nnz_hess += 5; + // } + } + if (opflow->genbusvoltagetype == FIXED_WITHIN_QBOUNDS) { + nnz_hess += 2; + } + } + + for (int iline = 0; iline < ps->nline; ++iline) { + PSLINE line = &(ps->line[iline]); + + if (!line->status) + continue; + + // 3 diagonal entries on the from-bus rows (already defined) + // 3 diagonal entries on the to-bus rows (already defined) + // 4 off-diagonal entries in upper part + nnz_hess += 4; + } + + if (opflow->include_loadloss_variables) { + for (int iload = 0; iload < ps->nload; ++iload) { + nnz_hess += 2; + } + } + opflow->nnz_eqjacsp = nnz_eqjacsp; opflow->nnz_ineqjacsp = nnz_ineqjacsp; opflow->nnz_hesssp = nnz_hesssp; @@ -377,6 +611,20 @@ extern PetscErrorCode OPFLOWSolutionCallback_PBPOLRAJAHIOPSPARSE( OPFLOW, const double *, const double *, const double *, const double *, const double *, double); +/** + * Empty stub for the equality constraint Jacobian in the RAJA sparse model. + * Will be replaced with a full RAJA kernel implementation. + */ +PetscErrorCode +OPFLOWComputeEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, + Vec X, Mat Je) { + (void)opflow; + (void)X; + (void)Je; + PetscFunctionBegin; + PetscFunctionReturn(0); +} + /** + * @brief Constructor for the PBPOLRAJAHIOPSPARSE model.
* @@ -430,7 +678,7 @@ PetscErrorCode OPFLOWModelCreate_PBPOLRAJAHIOPSPARSE(OPFLOW opflow) { opflow->modelops.solutiontops = OPFLOWSolutionToPS_PBPOLRAJAHIOPSPARSE; opflow->modelops.setup = OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE; opflow->modelops.computeequalityconstraintjacobian = - OPFLOWComputeEqualityConstraintJacobian_PBPOL; + OPFLOWComputeEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE; opflow->modelops.computesparseequalityconstraintjacobianhiop = OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE; opflow->modelops.computeinequalityconstraintjacobian = diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp index 05705a35..41480a0f 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp @@ -240,5 +240,216 @@ void ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, } } +void ComputeEqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, + const double *x_dev, + double *jace_dev) { + PbpolModelRajaHiop *pbpolrajahiopsparse = + reinterpret_cast(opflow->model); + BUSParamsRajaHiop *busparams = &pbpolrajahiopsparse->busparams; + GENParamsRajaHiop *genparams = &pbpolrajahiopsparse->genparams; + LOADParamsRajaHiop *loadparams = &pbpolrajahiopsparse->loadparams; + LINEParamsRajaHiop *lineparams = &pbpolrajahiopsparse->lineparams; + + /* Zero the equality Jacobian values before accumulating */ + RAJA::forall( + RAJA::RangeSegment(0, opflow->nnz_eqjacsp), + RAJA_LAMBDA(RAJA::Index_type i) { jace_dev[i] = 0.0; }); + + /* + * Kernel 1: Bus contributions (shunt self-admittance, power imbalance, + * generator Pg/Qg, load loss, gen setpoint). + * + * For bus self-admittance entries, the shunt values are written directly. + * Line diagonal contributions are accumulated via atomicAdd by Kernel 2. 
+ */ + { + int *b_isisolated = busparams->isisolated_dev_; + int *b_xidx = busparams->xidx_dev_; + double *b_gl = busparams->gl_dev_; + double *b_bl = busparams->bl_dev_; + int *b_selfidx = busparams->eqjacsp_selfidx_dev_; + /*KS: can probably parallelize this differently but it is correct -- + * optimizing existing code is easier than writing it from scratch .... */ + RAJA::forall( + RAJA::RangeSegment(0, busparams->nbus), + RAJA_LAMBDA(RAJA::Index_type i) { + int pbase = b_selfidx[2 * i]; + int qbase = b_selfidx[2 * i + 1]; + + if (b_isisolated[i]) { + jace_dev[pbase + 0] = 1.0; + jace_dev[pbase + 1] = 0.0; + jace_dev[qbase + 0] = 0.0; + jace_dev[qbase + 1] = 1.0; + return; + } + + double Vm = x_dev[b_xidx[i] + 1]; + + jace_dev[pbase + 0] = 0.0; + jace_dev[pbase + 1] = 2.0 * Vm * b_gl[i]; + jace_dev[qbase + 0] = 0.0; + jace_dev[qbase + 1] = -2.0 * Vm * b_bl[i]; + }); + + /* Power imbalance: constant entries */ + if (opflow->include_powerimbalance_variables) { + int *b_jacsp = busparams->jacsp_idx_dev_; + int *b_jacsq = busparams->jacsq_idx_dev_; + RAJA::forall( + RAJA::RangeSegment(0, busparams->nbus), + RAJA_LAMBDA(RAJA::Index_type i) { + jace_dev[b_jacsp[i]] = 1.0; + jace_dev[b_jacsp[i] + 1] = -1.0; + jace_dev[b_jacsq[i]] = 1.0; + jace_dev[b_jacsq[i] + 1] = -1.0; + }); + } + + /* Generator Pg/Qg contributions: -1 per active generator */ + int *g_eqjacspbus = genparams->eqjacspbus_idx_dev_; + int *g_eqjacsqbus = genparams->eqjacsqbus_idx_dev_; + RAJA::forall( + RAJA::RangeSegment(0, genparams->ngenON), + RAJA_LAMBDA(RAJA::Index_type i) { + jace_dev[g_eqjacspbus[i]] = -1.0; + jace_dev[g_eqjacsqbus[i]] = -1.0; + }); + + /* Load loss contributions: -1 per load */ + if (opflow->include_loadloss_variables) { + int *l_jacsp = loadparams->jacsp_idx_dev_; + int *l_jacsq = loadparams->jacsq_idx_dev_; + RAJA::forall( + RAJA::RangeSegment(0, loadparams->nload), + RAJA_LAMBDA(RAJA::Index_type i) { + jace_dev[l_jacsp[i]] = -1.0; + jace_dev[l_jacsq[i]] = -1.0; + }); + } + + 
/* Generator set-point equality constraints: constant entries */ + if (opflow->has_gensetpoint) { + int *g_eqjacspgen = genparams->eqjacspgen_idx_dev_; + int *g_isrenewable = genparams->isrenewable_dev_; + RAJA::forall( + RAJA::RangeSegment(0, genparams->ngenON), + RAJA_LAMBDA(RAJA::Index_type i) { + if (g_isrenewable[i]) + return; + int base = g_eqjacspgen[i]; + jace_dev[base + 0] = -1.0; + jace_dev[base + 1] = 1.0; + jace_dev[base + 2] = 1.0; + jace_dev[base + 3] = 1.0; + }); + } + } + + /* + * Kernel 2: Line contributions. + * + * Off-diagonal entries (remote-bus columns) are written directly. + * Diagonal entries (self-bus columns) are accumulated with atomicAdd + * into the bus self-admittance positions set by Kernel 1. + */ + { + double *l_Gff = lineparams->Gff_dev_; + double *l_Bff = lineparams->Bff_dev_; + double *l_Gft = lineparams->Gft_dev_; + double *l_Bft = lineparams->Bft_dev_; + double *l_Gtf = lineparams->Gtf_dev_; + double *l_Btf = lineparams->Btf_dev_; + double *l_Gtt = lineparams->Gtt_dev_; + double *l_Btt = lineparams->Btt_dev_; + int *l_xidxf = lineparams->xidxf_dev_; + int *l_xidxt = lineparams->xidxt_dev_; + int *l_eqjacsp_idx = lineparams->eqjacsp_idx_dev_; + int *l_eqjacsp_diag = lineparams->eqjacsp_diag_idx_dev_; + int *l_isdcline = lineparams->isdcline_dev_; + + RAJA::forall( + RAJA::RangeSegment(0, lineparams->nlineON), + RAJA_LAMBDA(RAJA::Index_type l) { + if (l_isdcline[l]) + return; + + double thetaf = x_dev[l_xidxf[l]]; + double Vmf = x_dev[l_xidxf[l] + 1]; + double thetat = x_dev[l_xidxt[l]]; + double Vmt = x_dev[l_xidxt[l] + 1]; + double thetaft = thetaf - thetat; + double thetatf = thetat - thetaf; + + double Gff = l_Gff[l], Bff = l_Bff[l]; + double Gft = l_Gft[l], Bft = l_Bft[l]; + double Gtf = l_Gtf[l], Btf = l_Btf[l]; + double Gtt = l_Gtt[l], Btt = l_Btt[l]; + + double sin_ft = sin(thetaft), cos_ft = cos(thetaft); + double sin_tf = sin(thetatf), cos_tf = cos(thetatf); + + /* From-bus diagonal derivatives (accumulate into bus 
self entries) */ + double dPf_dthetaf = Vmf * Vmt * (-Gft * sin_ft + Bft * cos_ft); + double dPf_dVmf = + 2.0 * Gff * Vmf + Vmt * (Gft * cos_ft + Bft * sin_ft); + double dQf_dthetaf = Vmf * Vmt * (Bft * sin_ft + Gft * cos_ft); + double dQf_dVmf = + -2.0 * Bff * Vmf + Vmt * (-Bft * cos_ft + Gft * sin_ft); + + int pfbase = l_eqjacsp_diag[4 * l + 0]; + int qfbase = l_eqjacsp_diag[4 * l + 1]; + + RAJA::atomicAdd(&jace_dev[pfbase + 0], + dPf_dthetaf); + RAJA::atomicAdd(&jace_dev[pfbase + 1], dPf_dVmf); + RAJA::atomicAdd(&jace_dev[qfbase + 0], + dQf_dthetaf); + RAJA::atomicAdd(&jace_dev[qfbase + 1], dQf_dVmf); + + /* From-bus off-diagonal derivatives (atomicAdd for parallel lines) */ + double dPf_dthetat = Vmf * Vmt * (Gft * sin_ft - Bft * cos_ft); + double dPf_dVmt = Vmf * (Gft * cos_ft + Bft * sin_ft); + double dQf_dthetat = Vmf * Vmt * (-Bft * sin_ft - Gft * cos_ft); + double dQf_dVmt = Vmf * (-Bft * cos_ft + Gft * sin_ft); + + int obase = l_eqjacsp_idx[l]; + RAJA::atomicAdd(&jace_dev[obase + 0], dPf_dthetat); + RAJA::atomicAdd(&jace_dev[obase + 1], dPf_dVmt); + RAJA::atomicAdd(&jace_dev[obase + 2], dQf_dthetat); + RAJA::atomicAdd(&jace_dev[obase + 3], dQf_dVmt); + + /* To-bus diagonal derivatives */ + double dPt_dthetat = Vmt * Vmf * (-Gtf * sin_tf + Btf * cos_tf); + double dPt_dVmt = + 2.0 * Gtt * Vmt + Vmf * (Gtf * cos_tf + Btf * sin_tf); + double dQt_dthetat = Vmt * Vmf * (Btf * sin_tf + Gtf * cos_tf); + double dQt_dVmt = + -2.0 * Btt * Vmt + Vmf * (-Btf * cos_tf + Gtf * sin_tf); + + int ptbase = l_eqjacsp_diag[4 * l + 2]; + int qtbase = l_eqjacsp_diag[4 * l + 3]; + + RAJA::atomicAdd(&jace_dev[ptbase + 0], + dPt_dthetat); + RAJA::atomicAdd(&jace_dev[ptbase + 1], dPt_dVmt); + RAJA::atomicAdd(&jace_dev[qtbase + 0], + dQt_dthetat); + RAJA::atomicAdd(&jace_dev[qtbase + 1], dQt_dVmt); + + /* To-bus off-diagonal derivatives (atomicAdd for parallel lines) */ + double dPt_dthetaf = Vmt * Vmf * (Gtf * sin_tf - Btf * cos_tf); + double dPt_dVmf = Vmt * (Gtf * cos_tf + Btf * 
sin_tf); + double dQt_dthetaf = Vmt * Vmf * (-Btf * sin_tf - Gtf * cos_tf); + double dQt_dVmf = Vmt * (-Btf * cos_tf + Gtf * sin_tf); + + RAJA::atomicAdd(&jace_dev[obase + 4], dPt_dthetaf); + RAJA::atomicAdd(&jace_dev[obase + 5], dPt_dVmf); + RAJA::atomicAdd(&jace_dev[obase + 6], dQt_dthetaf); + RAJA::atomicAdd(&jace_dev[obase + 7], dQt_dVmf); + }); + } +} + #endif // EXAGO_ENABLE_HIOP_SPARSE #endif // EXAGO_ENABLE_RAJA diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp index 938686a7..2d59b1c1 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp @@ -26,5 +26,21 @@ void ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, const double *x_dev, double *jacd_dev); +/** + * GPU-only (PETSc-free) computation of equality constraint Jacobian values. + * + * Replaces the PETSc-based path that calls + * opflow->modelops.computeequalityconstraintjacobian followed by + * MatGetRow extraction. Writes directly into device memory using RAJA + * kernels, without PETSc Mat/Vec operations; no H2D, D2H copies back and forth. 
+ * + * @param opflow The OPFLOW problem context + * @param x_dev Device array of variable values + * @param jace_dev Device output array for equality Jacobian values + */ +void ComputeEqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, + const double *x_dev, + double *jace_dev); + #endif // EXAGO_ENABLE_HIOP_SPARSE #endif // EXAGO_ENABLE_RAJA diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp index efddb6f6..39512265 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp @@ -702,12 +702,12 @@ OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE( // This only needs to be done once since the sparsity pattern of the Jacobian // does not change during the optimization. if (iJacS_dev != NULL && jJacS_dev != NULL) { - /* Set locations only */ + /* Compute sparsity pattern on host, matching the flat-array layout + defined during setup in OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE. 
*/ roffset = 0; coffset = 0; - // Create arrays on the host to store i, j, and val arrays umpire::Allocator h_allocator_ = resmgr.getAllocator("HOST"); pbpolrajahiopsparse->i_jaceq = @@ -717,35 +717,176 @@ OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE( pbpolrajahiopsparse->val_jaceq = (double *)(h_allocator_.allocate(opflow->nnz_eqjacsp * sizeof(double))); - iRowstart = pbpolrajahiopsparse->i_jaceq; - jColstart = pbpolrajahiopsparse->j_jaceq; + int *iRow = pbpolrajahiopsparse->i_jaceq; + int *jCol = pbpolrajahiopsparse->j_jaceq; + + PS ps = opflow->ps; + BUSParamsRajaHiop *busparams = &pbpolrajahiopsparse->busparams; + GENParamsRajaHiop *genparams = &pbpolrajahiopsparse->genparams; + LOADParamsRajaHiop *loadparams = &pbpolrajahiopsparse->loadparams; + LINEParamsRajaHiop *lineparams = &pbpolrajahiopsparse->lineparams; + + int geni = 0, loadi = 0; + for (int ibus = 0; ibus < ps->nbus; ibus++) { + PSBUS bus = &ps->bus[ibus]; + int P_row = roffset + busparams->gidx[ibus]; + int Q_row = P_row + 1; + int theta_col = coffset + busparams->xidx[ibus]; + int Vm_col = theta_col + 1; + int base; + + /* P-row self-admittance */ + base = busparams->eqjacsp_selfidx[2 * ibus]; + iRow[base] = P_row; + jCol[base] = theta_col; + iRow[base + 1] = P_row; + jCol[base + 1] = Vm_col; + + if (bus->ide == ISOLATED_BUS) { + /* Q-row self-admittance for isolated bus */ + base = busparams->eqjacsp_selfidx[2 * ibus + 1]; + iRow[base] = Q_row; + jCol[base] = theta_col; + iRow[base + 1] = Q_row; + jCol[base + 1] = Vm_col; + continue; + } - // Function pointer computeequalityconstraintjacobian points to - // OPFLOWComputeEqualityConstraintJacobian_PBPOL function. - // Use function from PBPOL model to create the sparsity pattern. 
- ierr = (*opflow->modelops.computeequalityconstraintjacobian)( - opflow, opflow->X, opflow->Jac_Ge); - CHKERRQ(ierr); + /* P-row power imbalance */ + if (opflow->include_powerimbalance_variables) { + base = busparams->jacsp_idx[ibus]; + int pimb_col = coffset + busparams->xidxpimb[ibus]; + iRow[base] = P_row; + jCol[base] = pimb_col; + iRow[base + 1] = P_row; + jCol[base + 1] = pimb_col + 1; + } - ierr = MatGetSize(opflow->Jac_Ge, &nrow, &ncol); - CHKERRQ(ierr); + /* P-row gen Pg */ + int gi = 0; + for (int k = 0; k < bus->ngen; k++) { + PSGEN gen; + PSBUSGetGen(bus, k, &gen); + if (!gen->status) + continue; + base = genparams->eqjacspbus_idx[geni + gi]; + iRow[base] = P_row; + jCol[base] = coffset + genparams->xidx[geni + gi]; + gi++; + } - /* Copy over locations to triplet format */ - for (i = 0; i < nrow; i++) { - ierr = MatGetRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); - CHKERRQ(ierr); - for (j = 0; j < nvals; j++) { - iRowstart[j] = roffset + i; - jColstart[j] = coffset + cols[j]; + /* P-row load loss */ + if (opflow->include_loadloss_variables) { + for (int k = 0; k < bus->nload; k++) { + base = loadparams->jacsp_idx[loadi + k]; + iRow[base] = P_row; + jCol[base] = coffset + loadparams->xidx[loadi + k]; + } + } + + /* Q-row self-admittance */ + base = busparams->eqjacsp_selfidx[2 * ibus + 1]; + iRow[base] = Q_row; + jCol[base] = theta_col; + iRow[base + 1] = Q_row; + jCol[base + 1] = Vm_col; + + /* Q-row power imbalance */ + if (opflow->include_powerimbalance_variables) { + base = busparams->jacsq_idx[ibus]; + int pimb_col = coffset + busparams->xidxpimb[ibus]; + iRow[base] = Q_row; + jCol[base] = pimb_col + 2; + iRow[base + 1] = Q_row; + jCol[base + 1] = pimb_col + 3; + } + + /* Q-row gen Qg */ + gi = 0; + for (int k = 0; k < bus->ngen; k++) { + PSGEN gen; + PSBUSGetGen(bus, k, &gen); + if (!gen->status) + continue; + base = genparams->eqjacsqbus_idx[geni + gi]; + iRow[base] = Q_row; + jCol[base] = coffset + genparams->xidx[geni + gi] + 1; + gi++; + } + + 
/* Q-row load loss */ + if (opflow->include_loadloss_variables) { + for (int k = 0; k < bus->nload; k++) { + base = loadparams->jacsq_idx[loadi + k]; + iRow[base] = Q_row; + jCol[base] = coffset + loadparams->xidx[loadi + k] + 1; + } + } + + geni += bus->ngenON; + loadi += bus->nload; + } + + /* Line off-diagonal entries */ + for (int l = 0; l < lineparams->nlineON; l++) { + if (lineparams->isdcline[l]) + continue; + + int base = lineparams->eqjacsp_idx[l]; + int Pf_row = roffset + lineparams->geqidxf[l]; + int Qf_row = Pf_row + 1; + int Pt_row = roffset + lineparams->geqidxt[l]; + int Qt_row = Pt_row + 1; + int thetat_col = coffset + lineparams->xidxt[l]; + int Vmt_col = thetat_col + 1; + int thetaf_col = coffset + lineparams->xidxf[l]; + int Vmf_col = thetaf_col + 1; + + /* From-bus off-diagonal: Pf w.r.t. thetat, Vmt */ + iRow[base + 0] = Pf_row; + jCol[base + 0] = thetat_col; + iRow[base + 1] = Pf_row; + jCol[base + 1] = Vmt_col; + /* Qf w.r.t. thetat, Vmt */ + iRow[base + 2] = Qf_row; + jCol[base + 2] = thetat_col; + iRow[base + 3] = Qf_row; + jCol[base + 3] = Vmt_col; + /* To-bus off-diagonal: Pt w.r.t. thetaf, Vmf */ + iRow[base + 4] = Pt_row; + jCol[base + 4] = thetaf_col; + iRow[base + 5] = Pt_row; + jCol[base + 5] = Vmf_col; + /* Qt w.r.t. 
thetaf, Vmf */ + iRow[base + 6] = Qt_row; + jCol[base + 6] = thetaf_col; + iRow[base + 7] = Qt_row; + jCol[base + 7] = Vmf_col; + } + + /* Generator set-point equality constraint entries */ + if (opflow->has_gensetpoint) { + for (int g = 0; g < genparams->ngenON; g++) { + if (genparams->isrenewable[g]) + continue; + int base = genparams->eqjacspgen_idx[g]; + int row0 = roffset + genparams->geqidxgen[g]; + int row1 = row0 + 1; + int Pg_col = coffset + genparams->xidx[g]; + int delPg_col = coffset + genparams->xpdevidx[g]; + int Pset_col = coffset + genparams->xpsetidx[g]; + + iRow[base + 0] = row0; + jCol[base + 0] = Pg_col; + iRow[base + 1] = row0; + jCol[base + 1] = delPg_col; + iRow[base + 2] = row0; + jCol[base + 2] = Pset_col; + iRow[base + 3] = row1; + jCol[base + 3] = Pset_col; } - /* Increment iRow,jCol pointers */ - iRowstart += nvals; - jColstart += nvals; - ierr = MatRestoreRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); - CHKERRQ(ierr); } - // Copy over i_jaceq and j_jaceq arrays to device resmgr.copy(iJacS_dev, pbpolrajahiopsparse->i_jaceq); resmgr.copy(jJacS_dev, pbpolrajahiopsparse->j_jaceq); } @@ -754,45 +895,10 @@ OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE( ierr = PetscLogEventBegin(opflow->eqconsjaclogger, 0, 0, 0, 0); CHKERRQ(ierr); - ierr = VecGetArray(opflow->X, &x); - CHKERRQ(ierr); - - // Copy from device to host - umpire::Allocator h_allocator_ = resmgr.getAllocator("HOST"); - registerWith(x, opflow->nx, resmgr, h_allocator_); - resmgr.copy((double *)x, (double *)x_dev); - - ierr = VecRestoreArray(opflow->X, &x); - CHKERRQ(ierr); - - // Compute equality constraint jacobian on the host - // Function pointer computeequalityconstraintjacobian points to the - // implementation OPFLOWComputeEqualityConstraintJacobian_PBPOL in the - // PBPOL model. 
- ierr = (*opflow->modelops.computeequalityconstraintjacobian)( - opflow, opflow->X, opflow->Jac_Ge); - CHKERRQ(ierr); - - ierr = MatGetSize(opflow->Jac_Ge, &nrow, &ncol); - CHKERRQ(ierr); - - values = pbpolrajahiopsparse->val_jaceq; - - // Unpack PETSc matrix and copy values to the host array in the - // PbpolModelRajaHiop struct. - for (i = 0; i < nrow; i++) { - ierr = MatGetRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); - CHKERRQ(ierr); - for (j = 0; j < nvals; j++) { - values[j] = vals[j]; - } - values += nvals; - ierr = MatRestoreRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); - CHKERRQ(ierr); - } - - // Copy over val_jaceq to device - resmgr.copy(MJacS_dev, pbpolrajahiopsparse->val_jaceq); + /* KS: Compute equality constraint Jacobian directly on device. + No H2D, D2H copies: x_dev is already on device, output goes + straight into MJacS_dev. */ + ComputeEqJacValuesGPU_PBPOLRAJAHIOPSPARSE(opflow, x_dev, MJacS_dev); ierr = PetscLogEventEnd(opflow->eqconsjaclogger, 0, 0, 0, 0); CHKERRQ(ierr); diff --git a/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp b/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp index d768e738..af5e165c 100644 --- a/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp +++ b/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp @@ -56,18 +56,10 @@ bool OPFLOWHIOPSPARSEGPUInterface::get_sparse_blocks_info( nx = opflow->nx; - /* Compute nonzeros for the Jacobian */ - /* Equality constraint Jacobian */ - ierr = (*opflow->modelops.computeequalityconstraintjacobian)( - opflow, opflow->X, opflow->Jac_Ge); - CHKERRQ(ierr); - ierr = MatSetOption(opflow->Jac_Ge, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE); - CHKERRQ(ierr); - - ierr = MatGetInfo(opflow->Jac_Ge, MAT_LOCAL, &info_eq); - CHKERRQ(ierr); - - nnz_sparse_Jaceq = opflow->nnz_eqjacsp = info_eq.nz_used; + /* Use the nnz count pre-computed during model setup + (OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE). This avoids calling the + PETSc-based Jacobian function, which is no longer wired up. 
*/ + nnz_sparse_Jaceq = opflow->nnz_eqjacsp; /* KS: Use pre-computed nnz_ineqjacsp from model setup (avoids PETSc Mat assembly just for counting non-zeros -- not sure if faster or if it scales diff --git a/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt b/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt index cd2f33b2..35f1a74e 100644 --- a/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt +++ b/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt @@ -1,6 +1,28 @@ if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_CUDA) set_source_files_properties(jac_eq_acopf.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(test_eqjac_compare.cpp PROPERTIES LANGUAGE CUDA) endif() +if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIP) + set_source_files_properties(test_eqjac_compare.cpp PROPERTIES LANGUAGE HIP) +endif() + +add_executable(test_eqjac_compare test_eqjac_compare.cpp) +target_link_libraries(test_eqjac_compare ExaGO::OPFLOW) +target_include_directories( + test_eqjac_compare PRIVATE ${CMAKE_SOURCE_DIR}/tests/unit/opflow +) + +if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIP) + set_source_files_properties(test_eqjac_perf.cpp PROPERTIES LANGUAGE HIP) +endif() +if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_CUDA) + set_source_files_properties(test_eqjac_perf.cpp PROPERTIES LANGUAGE CUDA) +endif() +add_executable(test_eqjac_perf test_eqjac_perf.cpp) +target_link_libraries(test_eqjac_perf ExaGO::OPFLOW) +target_include_directories( + test_eqjac_perf PRIVATE ${CMAKE_SOURCE_DIR}/tests/unit/opflow +) add_executable( jac_eq_acopf jac_eq_acopf.cpp @@ -54,6 +76,18 @@ if(EXAGO_ENABLE_RAJA) list(APPEND dependencies HIOP) endif() +if(EXAGO_RUN_TESTS) + if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIOP) + set(prefix_net ${CMAKE_SOURCE_DIR}/datafiles/unit/opflow/objective/) + exago_add_test( + NAME UNIT_TESTS_EQJAC_GPU_VS_PETSC_testx3 + DEPENDS HIOP_SPARSE + COMMAND ${RUNCMD} $ + -netfile ${prefix_net}testx3.m + ) + endif() +endif() + if(EXAGO_RUN_TESTS) foreach( model diff 
--git a/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_compare.cpp b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_compare.cpp new file mode 100644 index 00000000..84ae4f5f --- /dev/null +++ b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_compare.cpp @@ -0,0 +1,306 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#if defined(EXAGO_ENABLE_RAJA) +#include +#include +#include +#endif + +struct TripletEntry { + int row, col; + double val; +}; + +static void computeReferenceJacobian(OPFLOW opflow, Vec X, + std::vector &entries) { + PetscErrorCode ierr; + ierr = (*opflow->modelops.computeequalityconstraintjacobian)(opflow, X, + opflow->Jac_Ge); + + PetscInt nrow, ncol; + ierr = MatGetSize(opflow->Jac_Ge, &nrow, &ncol); + + for (PetscInt i = 0; i < nrow; i++) { + PetscInt nvals; + const PetscInt *cols; + const PetscScalar *vals; + ierr = MatGetRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); + for (PetscInt j = 0; j < nvals; j++) { + entries.push_back({(int)i, (int)cols[j], vals[j]}); + } + ierr = MatRestoreRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); + } +} + +int main(int argc, char **argv) { + PetscErrorCode ierr; + PetscBool flg; + char file_c_str[PETSC_MAX_PATH_LEN]; + std::string file; + char appname[] = "opflow"; + MPI_Comm comm = MPI_COMM_WORLD; + + char help[] = "Compare PETSc vs GPU equality constraint Jacobian\n"; + + ierr = ExaGOInitialize(comm, &argc, &argv, appname, help); + if (ierr) { + fprintf(stderr, "Could not initialize ExaGO.\n"); + return ierr; + } + + ierr = PetscOptionsGetString(NULL, NULL, "-netfile", file_c_str, + PETSC_MAX_PATH_LEN, &flg); + if (!flg) + file = "../datafiles/case9/case9mod.m"; + else + file.assign(file_c_str); + + /* ---------------------------------------------------------------- + * Step 1: Set up OPFLOW with PBPOL model (PETSc path) to get + * the reference Jacobian and the initial guess X. 
+ * ---------------------------------------------------------------- */ + OPFLOW opflow_ref; + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow_ref, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow_ref, OPFLOWMODEL_PBPOL); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow_ref, OPFLOWSOLVER_IPOPT); + CHKERRQ(ierr); + ierr = OPFLOWSetInitializationType(opflow_ref, OPFLOWINIT_FROMFILE); + CHKERRQ(ierr); + ierr = OPFLOWSetUp(opflow_ref); + CHKERRQ(ierr); + + Vec X_ref; + ierr = OPFLOWGetSolution(opflow_ref, &X_ref); + CHKERRQ(ierr); + + std::vector ref_entries; + computeReferenceJacobian(opflow_ref, X_ref, ref_entries); + + /* ---------------------------------------------------------------- + * Step 2: Set up OPFLOW with HIOPSPARSE to exercise the GPU path. + * ---------------------------------------------------------------- */ + OPFLOW opflow_gpu; + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow_gpu); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow_gpu, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow_gpu, OPFLOWMODEL_PBPOLRAJAHIOPSPARSE); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow_gpu, OPFLOWSOLVER_HIOPSPARSEGPU); + CHKERRQ(ierr); + ierr = OPFLOWSetInitializationType(opflow_gpu, OPFLOWINIT_FROMFILE); + CHKERRQ(ierr); +#ifdef EXAGO_ENABLE_GPU + ierr = OPFLOWSetHIOPComputeMode(opflow_gpu, "GPU"); + CHKERRQ(ierr); +#endif + ierr = OPFLOWSetUp(opflow_gpu); + CHKERRQ(ierr); + + /* Get the initial guess vector and map to sparse-dense ordering */ + Vec X_gpu; + ierr = OPFLOWGetSolution(opflow_gpu, &X_gpu); + CHKERRQ(ierr); + + int nx = opflow_gpu->nx; + int nnz_eq = opflow_gpu->nnz_eqjacsp; + + printf("\n"); + printf("============================================================\n"); + printf(" Equality Constraint Jacobian: PETSc vs GPU Comparison\n"); + printf(" Network: %s\n", file.c_str()); + printf(" nx = %d, nconeq = %d, nnz_eqjac(GPU) = %d, nnz_eqjac(PETSc) = %d\n", + nx, 
opflow_gpu->nconeq, nnz_eq, (int)ref_entries.size()); + printf("============================================================\n\n"); + +#if defined(EXAGO_ENABLE_RAJA) + auto &resmgr = umpire::ResourceManager::getInstance(); + umpire::Allocator h_allocator = resmgr.getAllocator("HOST"); + + double *x_host; + ierr = VecGetArray(X_gpu, &x_host); + CHKERRQ(ierr); + + int *iRow, *jCol; + double *values; + iRow = static_cast(h_allocator.allocate(nnz_eq * sizeof(int))); + jCol = static_cast(h_allocator.allocate(nnz_eq * sizeof(int))); + values = static_cast(h_allocator.allocate(nnz_eq * sizeof(double))); + +#ifdef EXAGO_ENABLE_GPU + umpire::Allocator d_allocator = resmgr.getAllocator("DEVICE"); + + double *x_dev = + static_cast(d_allocator.allocate(nx * sizeof(double))); + int *iRow_dev = + static_cast(d_allocator.allocate(nnz_eq * sizeof(int))); + int *jCol_dev = + static_cast(d_allocator.allocate(nnz_eq * sizeof(int))); + double *values_dev = + static_cast(d_allocator.allocate(nnz_eq * sizeof(double))); + + umpire::util::AllocationRecord record_x{ + x_host, sizeof(double) * nx, h_allocator.getAllocationStrategy()}; + resmgr.registerAllocation(x_host, record_x); + resmgr.copy(x_dev, x_host); +#else + double *x_dev = x_host; + int *iRow_dev = iRow; + int *jCol_dev = jCol; + double *values_dev = values; +#endif + + /* Call the sparse eq jac function: first for sparsity, then for values */ + ierr = (*opflow_gpu->modelops.computesparseequalityconstraintjacobianhiop)( + opflow_gpu, x_dev, iRow_dev, jCol_dev, NULL); + CHKERRQ(ierr); + + ierr = (*opflow_gpu->modelops.computesparseequalityconstraintjacobianhiop)( + opflow_gpu, x_dev, NULL, NULL, values_dev); + CHKERRQ(ierr); + +#ifdef EXAGO_ENABLE_GPU + resmgr.copy(iRow, iRow_dev); + resmgr.copy(jCol, jCol_dev); + resmgr.copy(values, values_dev); +#endif + + /* Build a map from (row, col) -> value for the GPU result */ + struct PairHash { + size_t operator()(const std::pair &p) const { + return std::hash()(((long long)p.first << 
32) | p.second); + } + }; + std::unordered_map, double, PairHash> gpu_map; + for (int i = 0; i < nnz_eq; i++) { + auto key = std::make_pair(iRow[i], jCol[i]); + gpu_map[key] += values[i]; + } + + /* ---------------------------------------------------------------- + * Step 3: Compare and print results + * ---------------------------------------------------------------- */ + int n_match = 0, n_mismatch = 0, n_missing_gpu = 0, n_extra_gpu = 0; + double max_abs_err = 0.0, max_rel_err = 0.0; + int worst_row = -1, worst_col = -1; + double worst_ref = 0, worst_gpu = 0; + const double tol = 1e-6; + + printf(" %-8s %-8s %16s %16s %12s %s\n", "Row", "Col", "PETSc (ref)", + "GPU", "AbsErr", "Status"); + printf(" %-8s %-8s %16s %16s %12s %s\n", "---", "---", "-----------", + "---", "------", "------"); + + for (const auto &e : ref_entries) { + auto key = std::make_pair(e.row, e.col); + auto it = gpu_map.find(key); + double gpu_val = 0.0; + bool found = (it != gpu_map.end()); + + if (found) { + gpu_val = it->second; + gpu_map.erase(it); + } + + double abs_err = fabs(gpu_val - e.val); + double rel_err = + (fabs(e.val) > 1e-12) ? 
abs_err / fabs(e.val) : abs_err; + const char *status; + + if (!found) { + status = "MISSING"; + n_missing_gpu++; + } else if (abs_err < tol) { + status = "OK"; + n_match++; + } else { + status = "MISMATCH"; + n_mismatch++; + } + + if (abs_err > max_abs_err) { + max_abs_err = abs_err; + max_rel_err = rel_err; + worst_row = e.row; + worst_col = e.col; + worst_ref = e.val; + worst_gpu = gpu_val; + } + + if (abs_err >= tol || !found) { + printf(" %-8d %-8d %16.8e %16.8e %12.2e %s\n", e.row, e.col, e.val, + gpu_val, abs_err, status); + } + } + + n_extra_gpu = (int)gpu_map.size(); + if (n_extra_gpu > 0) { + printf("\n Extra entries in GPU (not in PETSc reference):\n"); + for (const auto &kv : gpu_map) { + printf(" %-8d %-8d %16s %16.8e %12s EXTRA\n", kv.first.first, + kv.first.second, "n/a", kv.second, "n/a"); + } + } + + printf("\n"); + printf("============================================================\n"); + printf(" SUMMARY\n"); + printf("============================================================\n"); + printf(" PETSc nnz: %d\n", (int)ref_entries.size()); + printf(" GPU nnz: %d\n", nnz_eq); + printf(" Matching: %d\n", n_match); + printf(" Mismatched: %d\n", n_mismatch); + printf(" Missing in GPU: %d\n", n_missing_gpu); + printf(" Extra in GPU: %d\n", n_extra_gpu); + printf(" Max absolute err: %.2e at (%d, %d) ref=%.8e gpu=%.8e\n", + max_abs_err, worst_row, worst_col, worst_ref, worst_gpu); + printf(" Max relative err: %.2e\n", max_rel_err); + printf(" Tolerance: %.2e\n", tol); + printf(" RESULT: %s\n", + (n_mismatch == 0 && n_missing_gpu == 0 && n_extra_gpu == 0) + ? "PASS" + : "FAIL"); + printf("============================================================\n\n"); + + int result = + (n_mismatch == 0 && n_missing_gpu == 0 && n_extra_gpu == 0) ? 
0 : 1; + + h_allocator.deallocate(iRow); + h_allocator.deallocate(jCol); + h_allocator.deallocate(values); +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(x_dev); + d_allocator.deallocate(iRow_dev); + d_allocator.deallocate(jCol_dev); + d_allocator.deallocate(values_dev); +#endif + + ierr = VecRestoreArray(X_gpu, &x_host); + CHKERRQ(ierr); +#endif // EXAGO_ENABLE_RAJA + + ierr = OPFLOWDestroy(&opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWDestroy(&opflow_gpu); + CHKERRQ(ierr); + + ExaGOFinalize(); + +#if defined(EXAGO_ENABLE_RAJA) + return result; +#else + return 0; +#endif +} diff --git a/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_perf.cpp b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_perf.cpp new file mode 100644 index 00000000..d1615ab4 --- /dev/null +++ b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_perf.cpp @@ -0,0 +1,241 @@ +#include +#include +#include +#include + +#include +#include +#include + +#if defined(EXAGO_ENABLE_RAJA) +#include +#include +#include +#endif + +#if defined(EXAGO_ENABLE_HIP) +#include +#define GPU_SYNC() hipDeviceSynchronize() +#elif defined(EXAGO_ENABLE_CUDA) +#include +#define GPU_SYNC() cudaDeviceSynchronize() +#else +#define GPU_SYNC() ((void)0) +#endif + +using Clock = std::chrono::high_resolution_clock; +using Ms = std::chrono::duration; + +static double benchmarkPETSc(OPFLOW opflow, Vec X, int niters) { + PetscErrorCode ierr; + PetscScalar *x_arr; + + ierr = VecGetArray(X, &x_arr); + auto t0 = Clock::now(); + for (int iter = 0; iter < niters; iter++) { + ierr = (*opflow->modelops.computeequalityconstraintjacobian)( + opflow, X, opflow->Jac_Ge); + } + auto t1 = Clock::now(); + ierr = VecRestoreArray(X, &x_arr); + + return Ms(t1 - t0).count() / niters; +} + +int main(int argc, char **argv) { + PetscErrorCode ierr; + PetscBool flg; + char file_c_str[PETSC_MAX_PATH_LEN]; + std::string file; + char appname[] = "opflow"; + MPI_Comm comm = MPI_COMM_WORLD; + int niters = 100; + + char help[] = 
"Benchmark PETSc vs GPU equality constraint Jacobian\n"; + + ierr = ExaGOInitialize(comm, &argc, &argv, appname, help); + if (ierr) + return ierr; + + ierr = PetscOptionsGetString(NULL, NULL, "-netfile", file_c_str, + PETSC_MAX_PATH_LEN, &flg); + if (!flg) + file = "../datafiles/case9/case9mod.m"; + else + file.assign(file_c_str); + + ierr = PetscOptionsGetInt(NULL, NULL, "-niters", &niters, &flg); + + /* ---------------------------------------------------------------- + * PETSc path: IPOPT + PBPOL + * ---------------------------------------------------------------- */ + OPFLOW opflow_ref; + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow_ref, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow_ref, OPFLOWMODEL_PBPOL); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow_ref, OPFLOWSOLVER_IPOPT); + CHKERRQ(ierr); + ierr = OPFLOWSetInitializationType(opflow_ref, OPFLOWINIT_FROMFILE); + CHKERRQ(ierr); + ierr = OPFLOWSetUp(opflow_ref); + CHKERRQ(ierr); + + Vec X_ref; + ierr = OPFLOWGetSolution(opflow_ref, &X_ref); + CHKERRQ(ierr); + + /* Warmup */ + benchmarkPETSc(opflow_ref, X_ref, 5); + double petsc_ms = benchmarkPETSc(opflow_ref, X_ref, niters); + + int petsc_nnz = 0; + { + MatInfo info; + ierr = MatGetInfo(opflow_ref->Jac_Ge, MAT_LOCAL, &info); + petsc_nnz = (int)info.nz_used; + } + + /* ---------------------------------------------------------------- + * GPU path: HIOPSPARSEGPU + PBPOLRAJAHIOPSPARSE + * ---------------------------------------------------------------- */ + OPFLOW opflow_gpu; + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow_gpu); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow_gpu, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow_gpu, OPFLOWMODEL_PBPOLRAJAHIOPSPARSE); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow_gpu, OPFLOWSOLVER_HIOPSPARSEGPU); + CHKERRQ(ierr); + ierr = OPFLOWSetInitializationType(opflow_gpu, OPFLOWINIT_FROMFILE); + CHKERRQ(ierr); +#ifdef 
EXAGO_ENABLE_GPU + ierr = OPFLOWSetHIOPComputeMode(opflow_gpu, "GPU"); + CHKERRQ(ierr); +#endif + ierr = OPFLOWSetUp(opflow_gpu); + CHKERRQ(ierr); + + Vec X_gpu; + ierr = OPFLOWGetSolution(opflow_gpu, &X_gpu); + CHKERRQ(ierr); + + int nx = opflow_gpu->nx; + int nnz_eq = opflow_gpu->nnz_eqjacsp; + +#if defined(EXAGO_ENABLE_RAJA) + auto &resmgr = umpire::ResourceManager::getInstance(); + umpire::Allocator h_allocator = resmgr.getAllocator("HOST"); + + double *x_host; + ierr = VecGetArray(X_gpu, &x_host); + CHKERRQ(ierr); + + double *values_dev; + double *x_dev; + +#ifdef EXAGO_ENABLE_GPU + umpire::Allocator d_allocator = resmgr.getAllocator("DEVICE"); + x_dev = static_cast(d_allocator.allocate(nx * sizeof(double))); + values_dev = + static_cast(d_allocator.allocate(nnz_eq * sizeof(double))); + + umpire::util::AllocationRecord record_x{ + x_host, sizeof(double) * nx, h_allocator.getAllocationStrategy()}; + resmgr.registerAllocation(x_host, record_x); + resmgr.copy(x_dev, x_host); +#else + x_dev = x_host; + values_dev = + static_cast(h_allocator.allocate(nnz_eq * sizeof(double))); +#endif + + /* Run sparsity phase once (not timed — this is one-time setup) */ + { + int *iRow_dev, *jCol_dev; +#ifdef EXAGO_ENABLE_GPU + iRow_dev = static_cast(d_allocator.allocate(nnz_eq * sizeof(int))); + jCol_dev = static_cast(d_allocator.allocate(nnz_eq * sizeof(int))); +#else + iRow_dev = static_cast(h_allocator.allocate(nnz_eq * sizeof(int))); + jCol_dev = static_cast(h_allocator.allocate(nnz_eq * sizeof(int))); +#endif + ierr = (*opflow_gpu->modelops.computesparseequalityconstraintjacobianhiop)( + opflow_gpu, x_dev, iRow_dev, jCol_dev, NULL); + CHKERRQ(ierr); +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(iRow_dev); + d_allocator.deallocate(jCol_dev); +#else + h_allocator.deallocate(iRow_dev); + h_allocator.deallocate(jCol_dev); +#endif + } + + /* Warmup the values kernel */ + for (int i = 0; i < 5; i++) { + ierr = 
(*opflow_gpu->modelops.computesparseequalityconstraintjacobianhiop)( + opflow_gpu, x_dev, NULL, NULL, values_dev); + CHKERRQ(ierr); + } + GPU_SYNC(); + + /* Timed runs */ + auto t0 = Clock::now(); + for (int iter = 0; iter < niters; iter++) { + ierr = (*opflow_gpu->modelops.computesparseequalityconstraintjacobianhiop)( + opflow_gpu, x_dev, NULL, NULL, values_dev); + CHKERRQ(ierr); + } + GPU_SYNC(); + auto t1 = Clock::now(); + double gpu_ms = Ms(t1 - t0).count() / niters; + +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(x_dev); + d_allocator.deallocate(values_dev); +#else + h_allocator.deallocate(values_dev); +#endif + + ierr = VecRestoreArray(X_gpu, &x_host); + CHKERRQ(ierr); +#else + double gpu_ms = 0.0; +#endif + + /* ---------------------------------------------------------------- + * Print results + * ---------------------------------------------------------------- */ + printf("\n"); + printf("================================================================\n"); + printf(" Equality Constraint Jacobian — Performance Comparison\n"); + printf("================================================================\n"); + printf(" Network: %s\n", file.c_str()); + printf(" Buses: %d\n", opflow_gpu->ps->nbus); + printf(" Variables (nx): %d\n", nx); + printf(" Eq constraints: %d\n", opflow_gpu->nconeq); + printf(" nnz (PETSc): %d\n", petsc_nnz); + printf(" nnz (GPU): %d\n", nnz_eq); + printf(" Iterations: %d\n", niters); + printf("----------------------------------------------------------------\n"); + printf(" %-20s %12s %12s\n", "", "PETSc (CPU)", "RAJA (GPU)"); + printf(" %-20s %12s %12s\n", "", "-----------", "----------"); + printf(" %-20s %10.4f ms %10.4f ms\n", "Avg time/call", petsc_ms, gpu_ms); + if (gpu_ms > 0.0) { + double speedup = petsc_ms / gpu_ms; + printf(" %-20s %10s %9.2fx\n", "Speedup", "", speedup); + } + printf("================================================================\n\n"); + + ierr = OPFLOWDestroy(&opflow_ref); + CHKERRQ(ierr); + ierr = 
OPFLOWDestroy(&opflow_gpu); + CHKERRQ(ierr); + + ExaGOFinalize(); + return 0; +} From a5227ded3edbe1765fada72476de468a4979c19b Mon Sep 17 00:00:00 2001 From: kswirydo Date: Thu, 23 Apr 2026 12:34:58 -0400 Subject: [PATCH 2/3] Add comments and annotations to eq Jacobian GPU port --- src/opflow/model/power_bal_hiop/paramsrajahiop.h | 8 ++++---- src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp | 6 +++--- .../model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp | 5 +++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.h b/src/opflow/model/power_bal_hiop/paramsrajahiop.h index 13692c65..4b31740b 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.h +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.h @@ -61,7 +61,7 @@ struct BUSParamsRajaHiop { int *ineqjacsp_idx_dev_; /* KS: device counterpart of ineqjacsp_idx_ */ int *genoffset_dev_; /* KS: dev counterpart of genoffset */ int *ngenONbus_dev_; /* KS: dev counterpart of ngenONbus */ - int *eqjacsp_selfidx_dev_; + int *eqjacsp_selfidx_dev_; /* KS: eqjacsp_selfidx device counterpart */ int allocate(OPFLOW); int destroy(OPFLOW); @@ -123,7 +123,7 @@ struct GENParamsRajaHiop { double *pgs_dev_; /* real power output setpoint */ double *apf_dev_; /* KS: device counterpart of apf */ double *vs_dev_; /* KS: device counterpart of vs */ - int *xpsetidx_dev_; + int *xpsetidx_dev_; /* KS: xpsetidx_ GPU counterpart */ int *isrenewable_dev_; /* Is renewable generator? 
*/ int *xidx_dev_; /* starting locations in X vector */ @@ -248,8 +248,8 @@ struct LINEParamsRajaHiop { linelimidx_dev_; /* Indices for subset of lines that have finite limits */ int *ineqjacsp_idx_dev_; /* KS: Position in flat sparse ineq Jacobian array */ int *xslackidx_dev_; /* Starting location of slack variables in X vector */ - int *eqjacsp_idx_dev_; - int *eqjacsp_diag_idx_dev_; + int *eqjacsp_idx_dev_; /* KS: GPU counterpart for eqjacsp_idx_ */ + int *eqjacsp_diag_idx_dev_; /* KS: GPU counterpart for eqjacsp_diag_idx_*/ int *isdcline_dev_; int allocate(OPFLOW); diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp index 4a3b3806..75836574 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp @@ -613,7 +613,7 @@ extern PetscErrorCode OPFLOWSolutionCallback_PBPOLRAJAHIOPSPARSE( /** * Empty stub for the equality constraint Jacobian in the RAJA sparse model. - * Will be replaced with a full RAJA kernel implementation. + * Can be deleted if needed (dead code). 
Left for safety (it takes valid parameters and does nothing so it is harmless) */ PetscErrorCode OPFLOWComputeEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, @@ -676,9 +676,9 @@ PetscErrorCode OPFLOWModelCreate_PBPOLRAJAHIOPSPARSE(OPFLOW opflow) { opflow->modelops.computegradientarray = OPFLOWComputeGradientArray_PBPOLRAJAHIOPSPARSE; opflow->modelops.solutiontops = OPFLOWSolutionToPS_PBPOLRAJAHIOPSPARSE; - opflow->modelops.setup = OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE; + opflow->modelops.setup = OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE; opflow->modelops.computeequalityconstraintjacobian = - OPFLOWComputeEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE; + OPFLOWComputeEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE; /* KS: biggest change */ opflow->modelops.computesparseequalityconstraintjacobianhiop = OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE; opflow->modelops.computeinequalityconstraintjacobian = diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp index 39512265..ec3243d1 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp @@ -703,8 +703,8 @@ OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE( // does not change during the optimization. if (iJacS_dev != NULL && jJacS_dev != NULL) { /* Compute sparsity pattern on host, matching the flat-array layout - defined during setup in OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE. */ - + defined during setup in OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE. 
+*/ roffset = 0; coffset = 0; @@ -727,6 +727,7 @@ OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE( LINEParamsRajaHiop *lineparams = &pbpolrajahiopsparse->lineparams; int geni = 0, loadi = 0; +/*KS: not worth movint this to the gpu */ for (int ibus = 0; ibus < ps->nbus; ibus++) { PSBUS bus = &ps->bus[ibus]; int P_row = roffset + busparams->gidx[ibus]; From 0e94e1ccfa41e47e1ab682e8d7f3e3745502883b Mon Sep 17 00:00:00 2001 From: pelesh Date: Thu, 23 Apr 2026 21:44:29 +0000 Subject: [PATCH 3/3] Apply pre-commit fixes --- src/opflow/CMakeLists.txt | 3 +-- .../model/power_bal_hiop/paramsrajahiop.h | 16 +++++++-------- .../power_bal_hiop/pbpolrajahiopsparse.cpp | 9 ++++++--- .../pbpolrajahiopsparsekernels.cpp | 4 ++-- .../equality/CMakeLists.txt | 13 ++++++++---- .../equality/test_eqjac_compare.cpp | 20 +++++++++---------- .../equality/test_eqjac_perf.cpp | 9 +++++---- 7 files changed, 40 insertions(+), 34 deletions(-) diff --git a/src/opflow/CMakeLists.txt b/src/opflow/CMakeLists.txt index fd767aea..863f45f4 100644 --- a/src/opflow/CMakeLists.txt +++ b/src/opflow/CMakeLists.txt @@ -41,8 +41,7 @@ if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_CUDA) set_source_files_properties( model/power_bal_hiop/pbpolrajahiopkernels.cpp model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp - model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp PROPERTIES LANGUAGE - CUDA + model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp PROPERTIES LANGUAGE CUDA ) endif() diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.h b/src/opflow/model/power_bal_hiop/paramsrajahiop.h index 4b31740b..06dddc91 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.h +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.h @@ -31,9 +31,9 @@ struct BUSParamsRajaHiop { int *hesssp_idx; /* KS: Hessian indices */ int *ispv; /* KS: ispv[i] = 1 if bus is PV bus */ int *gineqidx; /* KS: starting position of bus ineq constraints */ - int *ineqjacsp_idx; /* KS: index in flat sparse ineq Jacobian array */ - int
*genoffset; /* KS: Offset into flattened gen array for this bus */ - int *ngenONbus; /* KS: Number of ON generators on this bus */ + int *ineqjacsp_idx; /* KS: index in flat sparse ineq Jacobian array */ + int *genoffset; /* KS: Offset into flattened gen array for this bus */ + int *ngenONbus; /* KS: Number of ON generators on this bus */ int *eqjacsp_selfidx; /* Flat-array position for bus self-admittance in eq Jacobian. [2*i] = P-row base, [2*i+1] = Q-row base */ @@ -87,8 +87,8 @@ struct GENParamsRajaHiop { double *pgs; /* real power output setpoint */ double *apf; /* generator AGC participation factor */ double *vs; /* voltage setpoint */ - int *xpsetidx; /* Starting locations in X vector for set-point variables */ - int *isrenewable; /* Is renewable generator? */ + int *xpsetidx; /* Starting locations in X vector for set-point variables */ + int *isrenewable; /* Is renewable generator? */ int *xidx; /* starting locations in X vector */ int *xpdevidx; /* KS: tarting locations of deviation variables in X vector */ @@ -123,7 +123,7 @@ struct GENParamsRajaHiop { double *pgs_dev_; /* real power output setpoint */ double *apf_dev_; /* KS: device counterpart of apf */ double *vs_dev_; /* KS: device counterpart of vs */ - int *xpsetidx_dev_; /* KS: xpsetidx_ GPU counterpart */ + int *xpsetidx_dev_; /* KS: xpsetidx_ GPU counterpart */ int *isrenewable_dev_; /* Is renewable generator? 
*/ int *xidx_dev_; /* starting locations in X vector */ @@ -218,7 +218,7 @@ struct LINEParamsRajaHiop { int *linelimidx; /* Indices for subset of lines that have finite limits */ int *ineqjacsp_idx; /* KS: Position in flat sparse ineq Jacobian array */ int *xslackidx; /* Starting location of slack variables in X vector */ - int *eqjacsp_idx; /* Flat-array offset for off-diagonal eq Jacobian entries */ + int *eqjacsp_idx; /* Flat-array offset for off-diagonal eq Jacobian entries */ int *eqjacsp_diag_idx; /* Flat-array positions for diagonal entries per line [4*l+0]=from P-row, [4*l+1]=from Q-row, [4*l+2]=to P-row, [4*l+3]=to Q-row */ @@ -247,7 +247,7 @@ struct LINEParamsRajaHiop { int * linelimidx_dev_; /* Indices for subset of lines that have finite limits */ int *ineqjacsp_idx_dev_; /* KS: Position in flat sparse ineq Jacobian array */ - int *xslackidx_dev_; /* Starting location of slack variables in X vector */ + int *xslackidx_dev_; /* Starting location of slack variables in X vector */ int *eqjacsp_idx_dev_; /* KS: GPU counterpart for eqjacsp_idx_ */ int *eqjacsp_diag_idx_dev_; /* KS: GPU counterpart for eqjacsp_diag_idx_*/ int *isdcline_dev_; diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp index 75836574..ada701a5 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp @@ -613,7 +613,8 @@ extern PetscErrorCode OPFLOWSolutionCallback_PBPOLRAJAHIOPSPARSE( /** * Empty stub for the equality constraint Jacobian in the RAJA sparse model. - * Can be deleted if needed (dead code). Left for safety (it takes valid parameters and does nothing so it is harmless) + * Can be deleted if needed (dead code). 
Left for safety (it takes valid + * parameters and does nothing so it is harmless) */ PetscErrorCode OPFLOWComputeEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, @@ -676,9 +677,11 @@ PetscErrorCode OPFLOWModelCreate_PBPOLRAJAHIOPSPARSE(OPFLOW opflow) { opflow->modelops.computegradientarray = OPFLOWComputeGradientArray_PBPOLRAJAHIOPSPARSE; opflow->modelops.solutiontops = OPFLOWSolutionToPS_PBPOLRAJAHIOPSPARSE; - opflow->modelops.setup = OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE; + opflow->modelops.setup = OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE; opflow->modelops.computeequalityconstraintjacobian = - OPFLOWComputeEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE; /* KS: biggest change */ + OPFLOWComputeEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE; /* KS: + biggest + change */ opflow->modelops.computesparseequalityconstraintjacobianhiop = OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE; opflow->modelops.computeinequalityconstraintjacobian = diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp index ec3243d1..174a9464 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp @@ -703,7 +703,7 @@ OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE( // does not change during the optimization. if (iJacS_dev != NULL && jJacS_dev != NULL) { /* Compute sparsity pattern on host, matching the flat-array layout - defined during setup in OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE. + defined during setup in OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE. 
*/ roffset = 0; coffset = 0; @@ -727,7 +727,7 @@ OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE( LINEParamsRajaHiop *lineparams = &pbpolrajahiopsparse->lineparams; int geni = 0, loadi = 0; -/*KS: not worth movint this to the gpu */ + /*KS: not worth moving this to the gpu */ for (int ibus = 0; ibus < ps->nbus; ibus++) { PSBUS bus = &ps->bus[ibus]; int P_row = roffset + busparams->gidx[ibus]; diff --git a/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt b/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt index 35f1a74e..0131169b 100644 --- a/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt +++ b/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt @@ -80,10 +80,15 @@ if(EXAGO_RUN_TESTS) if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIOP) set(prefix_net ${CMAKE_SOURCE_DIR}/datafiles/unit/opflow/objective/) exago_add_test( - NAME UNIT_TESTS_EQJAC_GPU_VS_PETSC_testx3 - DEPENDS HIOP_SPARSE - COMMAND ${RUNCMD} $ - -netfile ${prefix_net}testx3.m + NAME + UNIT_TESTS_EQJAC_GPU_VS_PETSC_testx3 + DEPENDS + HIOP_SPARSE + COMMAND + ${RUNCMD} + $ + -netfile + ${prefix_net}testx3.m ) endif() endif() diff --git a/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_compare.cpp b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_compare.cpp index 84ae4f5f..c2fd7cb9 100644 --- a/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_compare.cpp +++ b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_compare.cpp @@ -151,8 +151,8 @@ int main(int argc, char **argv) { double *values_dev = static_cast(d_allocator.allocate(nnz_eq * sizeof(double))); - umpire::util::AllocationRecord record_x{ - x_host, sizeof(double) * nx, h_allocator.getAllocationStrategy()}; + umpire::util::AllocationRecord record_x{x_host, sizeof(double) * nx, + h_allocator.getAllocationStrategy()}; resmgr.registerAllocation(x_host, record_x); resmgr.copy(x_dev, x_host); #else @@ -198,10 +198,10 @@ int main(int argc, char **argv) { double
worst_ref = 0, worst_gpu = 0; const double tol = 1e-6; - printf(" %-8s %-8s %16s %16s %12s %s\n", "Row", "Col", "PETSc (ref)", - "GPU", "AbsErr", "Status"); - printf(" %-8s %-8s %16s %16s %12s %s\n", "---", "---", "-----------", - "---", "------", "------"); + printf(" %-8s %-8s %16s %16s %12s %s\n", "Row", "Col", "PETSc (ref)", "GPU", + "AbsErr", "Status"); + printf(" %-8s %-8s %16s %16s %12s %s\n", "---", "---", "-----------", "---", + "------", "------"); for (const auto &e : ref_entries) { auto key = std::make_pair(e.row, e.col); @@ -215,8 +215,7 @@ int main(int argc, char **argv) { } double abs_err = fabs(gpu_val - e.val); - double rel_err = - (fabs(e.val) > 1e-12) ? abs_err / fabs(e.val) : abs_err; + double rel_err = (fabs(e.val) > 1e-12) ? abs_err / fabs(e.val) : abs_err; const char *status; if (!found) { @@ -269,9 +268,8 @@ int main(int argc, char **argv) { printf(" Max relative err: %.2e\n", max_rel_err); printf(" Tolerance: %.2e\n", tol); printf(" RESULT: %s\n", - (n_mismatch == 0 && n_missing_gpu == 0 && n_extra_gpu == 0) - ? "PASS" - : "FAIL"); + (n_mismatch == 0 && n_missing_gpu == 0 && n_extra_gpu == 0) ? 
"PASS" + : "FAIL"); printf("============================================================\n\n"); int result = diff --git a/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_perf.cpp b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_perf.cpp index d1615ab4..d42d0e53 100644 --- a/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_perf.cpp +++ b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_perf.cpp @@ -143,8 +143,8 @@ int main(int argc, char **argv) { values_dev = static_cast(d_allocator.allocate(nnz_eq * sizeof(double))); - umpire::util::AllocationRecord record_x{ - x_host, sizeof(double) * nx, h_allocator.getAllocationStrategy()}; + umpire::util::AllocationRecord record_x{x_host, sizeof(double) * nx, + h_allocator.getAllocationStrategy()}; resmgr.registerAllocation(x_host, record_x); resmgr.copy(x_dev, x_host); #else @@ -227,9 +227,10 @@ int main(int argc, char **argv) { printf(" %-20s %10.4f ms %10.4f ms\n", "Avg time/call", petsc_ms, gpu_ms); if (gpu_ms > 0.0) { double speedup = petsc_ms / gpu_ms; - printf(" %-20s %10s %9.2fx\n", "Speedup", "", speedup); + printf(" %-20s %10s %9.2fx\n", "Speedup", "", speedup); } - printf("================================================================\n\n"); + printf( + "================================================================\n\n"); ierr = OPFLOWDestroy(&opflow_ref); CHKERRQ(ierr);