From 5861e99c59ed1bc9886e7f58a713fbdd0064dfc3 Mon Sep 17 00:00:00 2001 From: kswirydo Date: Thu, 16 Apr 2026 15:49:20 -0400 Subject: [PATCH 1/8] I am not going to lie, Cursor agent heavily helped me with this. Replace PETSc-based inequality Jacobian with GPU RAJA kernels Move the inequality constraint Jacobian computation for the HiOp sparse GPU solver entirely to the device, eliminating the per-iteration host back and forth (copy to host, PETSc compute, MatGetRow extraction, values copy back to device). Elimiate PETSc use from this part of the code. Three RAJA kernels now compute directly into device memory: - Generator set-point constraints (AGC) - Voltage-reactive-power bounds (FIXED_WITHIN_QBOUNDS) - Line flow limits (Sf^2/St^2 derivatives + slack variables) Supporting changes: - Analytical NNZ counting replaces PETSc MatGetInfo at solver setup - New device-side parameter fields (apf, vs, xpdevidx, xslackidx, bus-to-gen mapping) added to *ParamsRajaHiop structs - Sparse position indices assigned at model setup for all three contribution types Includes validation test (test_ineqjac_gpu) that solves with IPOPT, then compares PETSc and GPU Jacobian values at the converged solution. Optional -benchmark flag for performance comparison. Made-with: Cursor --- src/opflow/CMakeLists.txt | 6 +- .../model/power_bal_hiop/paramsrajahiop.cpp | 82 ++++ .../model/power_bal_hiop/paramsrajahiop.h | 32 +- .../power_bal_hiop/pbpolrajahiopsparse.cpp | 185 +++----- .../pbpolrajahiopsparse_gpu.cpp | 250 +++++++++++ .../pbpolrajahiopsparse_gpu.hpp | 30 ++ .../pbpolrajahiopsparsekernels.cpp | 43 +- .../solver/hiop/opflow_hiopsparsegpu.cpp | 17 +- tests/unit/CMakeLists.txt | 27 ++ tests/unit/test_ineqjac_gpu.cpp | 394 ++++++++++++++++++ 10 files changed, 884 insertions(+), 182 deletions(-) create mode 100644 src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp create mode 100644 src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp create mode 100644 tests/unit/test_ineqjac_gpu.cpp diff --git a/src/opflow/CMakeLists.txt b/src/opflow/CMakeLists.txt index d0553657..fd767aea 100644 --- a/src/opflow/CMakeLists.txt +++ b/src/opflow/CMakeLists.txt @@ -20,6 +20,7 @@ if(EXAGO_ENABLE_RAJA) set(OPFLOW_FORM_SRC ${OPFLOW_FORM_SRC} model/power_bal_hiop/pbpolrajahiopsparse.cpp model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp + model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp ) endif() endif() @@ -39,8 +40,9 @@ set_source_files_properties(${OPFLOW_SRC} PROPERTIES LANGUAGE CXX) if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_CUDA) set_source_files_properties( model/power_bal_hiop/pbpolrajahiopkernels.cpp - model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp PROPERTIES LANGUAGE - CUDA + model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp + model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp PROPERTIES LANGUAGE + CUDA ) endif() diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp b/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp index 27e2cfa0..583f273b 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp @@ -27,6 +27,11 @@ int BUSParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(jacsp_idx); h_allocator_.deallocate(jacsq_idx); } + h_allocator_.deallocate(ispv); + h_allocator_.deallocate(gineqidx); + h_allocator_.deallocate(ineqjacsp_idx); + h_allocator_.deallocate(genoffset); + h_allocator_.deallocate(ngenONbus); #ifdef EXAGO_ENABLE_GPU d_allocator_.deallocate(isref_dev_); @@ -46,6 +51,11 @@ int BUSParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(jacsp_idx_dev_); d_allocator_.deallocate(jacsq_idx_dev_); } + d_allocator_.deallocate(ispv_dev_); + d_allocator_.deallocate(gineqidx_dev_); + d_allocator_.deallocate(ineqjacsp_idx_dev_); + d_allocator_.deallocate(genoffset_dev_); + d_allocator_.deallocate(ngenONbus_dev_); #endif return 0; @@ -80,6 +90,11 @@ int BUSParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(jacsq_idx_dev_, jacsq_idx); resmgr.copy(powerimbalance_penalty_dev_, powerimbalance_penalty); } + resmgr.copy(ispv_dev_, ispv); + resmgr.copy(gineqidx_dev_, gineqidx); + resmgr.copy(ineqjacsp_idx_dev_, ineqjacsp_idx); + resmgr.copy(genoffset_dev_, genoffset); + resmgr.copy(ngenONbus_dev_, ngenONbus); #else isref_dev_ = isref; isisolated_dev_ = isisolated; @@ -96,6 +111,11 @@ int BUSParamsRajaHiop::copy(OPFLOW opflow) { jacsp_idx_dev_ = jacsp_idx; jacsq_idx_dev_ = jacsq_idx; powerimbalance_penalty_dev_ = powerimbalance_penalty; + ispv_dev_ = ispv; + gineqidx_dev_ = gineqidx; + ineqjacsp_idx_dev_ = ineqjacsp_idx; + genoffset_dev_ = genoffset; + ngenONbus_dev_ = ngenONbus; #endif return 0; } @@ -132,18 +152,31 @@ int BUSParamsRajaHiop::allocate(OPFLOW opflow) { jacsp_idx = paramAlloc(h_allocator_, nbus); jacsq_idx = paramAlloc(h_allocator_, nbus); } + ispv = paramAlloc(h_allocator_, nbus); + gineqidx = paramAlloc(h_allocator_, nbus); + ineqjacsp_idx = paramAlloc(h_allocator_, nbus); + genoffset = paramAlloc(h_allocator_, nbus); + ngenONbus = paramAlloc(h_allocator_, nbus); /* Memzero arrays */ resmgr.memset(isref, 0, nbus * sizeof(int)); resmgr.memset(ispvpq, 0, nbus * sizeof(int)); resmgr.memset(isisolated, 0, nbus * sizeof(int)); + resmgr.memset(ispv, 0, nbus * sizeof(int)); + resmgr.memset(gineqidx, 0, nbus * sizeof(int)); + resmgr.memset(ineqjacsp_idx, 0, nbus * sizeof(int)); + resmgr.memset(genoffset, 0, nbus * sizeof(int)); + resmgr.memset(ngenONbus, 0, nbus * sizeof(int)); + int genoff = 0; for (int i = 0; i < nbus; i++) { bus = &ps->bus[i]; loc = bus->startxVloc; xidx[i] = opflow->idxn2sd_map[loc]; gidx[i] = bus->starteqloc; + genoffset[i] = genoff; + genoff += bus->ngenON; if (bus->ide == REF_BUS) isref[i] = 1; @@ -152,6 +185,12 @@ int BUSParamsRajaHiop::allocate(OPFLOW opflow) { else ispvpq[i] = 1; + if (bus->ide == PV_BUS) + ispv[i] = 1; + + ngenONbus[i] = bus->ngenON; + gineqidx[i] = bus->startineqloc; + if (opflow->genbusvoltagetype == FIXED_AT_SETPOINT) { if (bus->ide == REF_BUS || bus->ide == PV_BUS) { /* Hold voltage at reference and PV buses */ @@ -200,6 +239,11 @@ int BUSParamsRajaHiop::allocate(OPFLOW opflow) { jacsp_idx_dev_ = paramAlloc(d_allocator_, nbus); jacsq_idx_dev_ = paramAlloc(d_allocator_, nbus); } + ispv_dev_ = paramAlloc(d_allocator_, nbus); + gineqidx_dev_ = paramAlloc(d_allocator_, nbus); + ineqjacsp_idx_dev_ = paramAlloc(d_allocator_, nbus); + genoffset_dev_ = paramAlloc(d_allocator_, nbus); + ngenONbus_dev_ = paramAlloc(d_allocator_, nbus); #endif return 0; } @@ -231,6 +275,8 @@ int LINEParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(gineqidx_dev_, gineqidx); resmgr.copy(gbineqidx_dev_, gbineqidx); resmgr.copy(linelimidx_dev_, linelimidx); + resmgr.copy(ineqjacsp_idx_dev_, ineqjacsp_idx); + resmgr.copy(xslackidx_dev_, xslackidx); } #else Gff_dev_ = Gff; @@ -250,6 +296,8 @@ int LINEParamsRajaHiop::copy(OPFLOW opflow) { gineqidx_dev_ = gineqidx; gbineqidx_dev_ = gbineqidx; linelimidx_dev_ = linelimidx; + ineqjacsp_idx_dev_ = ineqjacsp_idx; + xslackidx_dev_ = xslackidx; } #endif return 0; @@ -277,6 +325,8 @@ int LINEParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(gineqidx); h_allocator_.deallocate(gbineqidx); h_allocator_.deallocate(linelimidx); + h_allocator_.deallocate(ineqjacsp_idx); + h_allocator_.deallocate(xslackidx); } #ifdef EXAGO_ENABLE_GPU @@ -301,6 +351,8 @@ int LINEParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(gineqidx_dev_); d_allocator_.deallocate(gbineqidx_dev_); d_allocator_.deallocate(linelimidx_dev_); + d_allocator_.deallocate(ineqjacsp_idx_dev_); + d_allocator_.deallocate(xslackidx_dev_); } #endif @@ -348,6 +400,8 @@ int LINEParamsRajaHiop::allocate(OPFLOW opflow) { linelimidx = paramAlloc(h_allocator_, nlinelim); gineqidx = paramAlloc(h_allocator_, nlinelim); gbineqidx = paramAlloc(h_allocator_, nlinelim); + ineqjacsp_idx = paramAlloc(h_allocator_, nlinelim); + xslackidx = paramAlloc(h_allocator_, nlinelim); } PetscInt j = 0; @@ -391,6 +445,9 @@ int LINEParamsRajaHiop::allocate(OPFLOW opflow) { gbineqidx[j] = opflow->nconeq + line->startineqloc; gineqidx[j] = line->startineqloc; linelimidx[j] = linei; + if (opflow->allow_lineflow_violation) { + xslackidx[j] = opflow->idxn2sd_map[line->startxslackloc]; + } j++; } @@ -420,6 +477,8 @@ int LINEParamsRajaHiop::allocate(OPFLOW opflow) { gineqidx_dev_ = paramAlloc(d_allocator_, nlinelim); gbineqidx_dev_ = paramAlloc(d_allocator_, nlinelim); linelimidx_dev_ = paramAlloc(d_allocator_, nlinelim); + ineqjacsp_idx_dev_ = paramAlloc(d_allocator_, nlinelim); + xslackidx_dev_ = paramAlloc(d_allocator_, nlinelim); } #endif return 0; @@ -558,7 +617,10 @@ int GENParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(qt); h_allocator_.deallocate(qb); h_allocator_.deallocate(isrenewable); + h_allocator_.deallocate(apf); + h_allocator_.deallocate(vs); h_allocator_.deallocate(xidx); + h_allocator_.deallocate(xpdevidx); h_allocator_.deallocate(gidxbus); h_allocator_.deallocate(eqjacspbus_idx); h_allocator_.deallocate(eqjacsqbus_idx); @@ -581,7 +643,10 @@ int GENParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(qt_dev_); d_allocator_.deallocate(qb_dev_); d_allocator_.deallocate(isrenewable_dev_); + d_allocator_.deallocate(apf_dev_); + d_allocator_.deallocate(vs_dev_); d_allocator_.deallocate(xidx_dev_); + d_allocator_.deallocate(xpdevidx_dev_); d_allocator_.deallocate(gidxbus_dev_); d_allocator_.deallocate(eqjacspbus_idx_dev_); d_allocator_.deallocate(eqjacsqbus_idx_dev_); @@ -615,8 +680,11 @@ int GENParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(qt_dev_, qt); resmgr.copy(qb_dev_, qb); resmgr.copy(isrenewable_dev_, isrenewable); + resmgr.copy(apf_dev_, apf); + resmgr.copy(vs_dev_, vs); resmgr.copy(xidx_dev_, xidx); + resmgr.copy(xpdevidx_dev_, xpdevidx); resmgr.copy(gidxbus_dev_, gidxbus); resmgr.copy(eqjacspbus_idx_dev_, eqjacspbus_idx); @@ -639,7 +707,10 @@ int GENParamsRajaHiop::copy(OPFLOW opflow) { qt_dev_ = qt; qb_dev_ = qb; isrenewable_dev_ = isrenewable; + apf_dev_ = apf; + vs_dev_ = vs; xidx_dev_ = xidx; + xpdevidx_dev_ = xpdevidx; gidxbus_dev_ = gidxbus; eqjacspbus_idx_dev_ = eqjacspbus_idx; eqjacsqbus_idx_dev_ = eqjacsqbus_idx; @@ -682,8 +753,11 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { qt = paramAlloc(h_allocator_, ngenON); qb = paramAlloc(h_allocator_, ngenON); isrenewable = paramAlloc(h_allocator_, ngenON); + apf = paramAlloc(h_allocator_, ngenON); + vs = paramAlloc(h_allocator_, ngenON); xidx = paramAlloc(h_allocator_, ngenON); + xpdevidx = paramAlloc(h_allocator_, ngenON); gidxbus = paramAlloc(h_allocator_, ngenON); eqjacspbus_idx = paramAlloc(h_allocator_, ngenON); @@ -720,11 +794,16 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { qt[geni] = gen->qt; qb[geni] = gen->qb; isrenewable[geni] = (int)gen->isrenewable; + apf[geni] = gen->apf; + vs[geni] = gen->vs; if (opflow->has_gensetpoint) { pgs[geni] = gen->pgs; } xidx[geni] = opflow->idxn2sd_map[loc]; + xpdevidx[geni] = (opflow->has_gensetpoint && !gen->isrenewable) + ? opflow->idxn2sd_map[gen->startxpdevloc] + : -1; gidxbus[geni] = gloc; if (opflow->has_gensetpoint) { geqidxgen[geni] = gen->starteqloc; @@ -748,8 +827,11 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { qt_dev_ = paramAlloc(d_allocator_, ngenON); qb_dev_ = paramAlloc(d_allocator_, ngenON); isrenewable_dev_ = paramAlloc(d_allocator_, ngenON); + apf_dev_ = paramAlloc(d_allocator_, ngenON); + vs_dev_ = paramAlloc(d_allocator_, ngenON); xidx_dev_ = paramAlloc(d_allocator_, ngenON); + xpdevidx_dev_ = paramAlloc(d_allocator_, ngenON); gidxbus_dev_ = paramAlloc(d_allocator_, ngenON); eqjacspbus_idx_dev_ = paramAlloc(d_allocator_, ngenON); diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.h b/src/opflow/model/power_bal_hiop/paramsrajahiop.h index 91f1fb9f..6121a4c6 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.h +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.h @@ -28,7 +28,12 @@ struct BUSParamsRajaHiop { vector */ int *jacsp_idx; /* Location number in the sparse Jacobian for Pimb */ int *jacsq_idx; /* Location number in the sparse Jacobian for Qimb */ - int *hesssp_idx; /* Location number in the Hessian */ + int *hesssp_idx; /* KS: Hessian indices */ + int *ispv; /* KS: ispv[i] = 1 if bus is PV bus */ + int *gineqidx; /* KS: starting position of bus ineq constraints */ + int *ineqjacsp_idx; /* KS: index in flat sparse ineq Jacobian array */ + int *genoffset; /* KS: Offset into flattened gen array for this bus */ + int *ngenONbus; /* KS: Number of ON generators on this bus */ // Device data int *isref_dev_; /* isref[i] = 1 if bus is reference bus */ @@ -49,6 +54,11 @@ struct BUSParamsRajaHiop { int *jacsp_idx_dev_; /* Location number in the sparse Jacobian for Pimb */ int *jacsq_idx_dev_; /* Location number in the sparse Jacobian for Qimb */ int *hesssp_idx_dev_; /* Location number in the Hessian */ + int *ispv_dev_; /* KS: dev counterpart of ispv */ + int *gineqidx_dev_; /* KS: dev counterpart of gineqidx */ + int *ineqjacsp_idx_dev_; /* KS: device counterpart of ineqjacsp_idx_ */ + int *genoffset_dev_; /* KS: dev counterpart of genoffset */ + int *ngenONbus_dev_; /* KS: dev counterpart of ngenONbus */ int allocate(OPFLOW); int destroy(OPFLOW); @@ -72,9 +82,12 @@ struct GENParamsRajaHiop { double *qt; /* min. reactive power gen. limits */ double *qb; /* max. reactive power gen. limits */ double *pgs; /* real power output setpoint */ + double *apf; /* generator AGC participation factor */ + double *vs; /* voltage setpoint */ int *isrenewable; /* Is renewable generator? */ - int *xidx; /* starting locations in X vector */ + int *xidx; /* starting locations in X vector */ + int *xpdevidx; /* KS: tarting locations of deviation variables in X vector */ int * gidxbus; /* starting locations in constraint vector for bus constraints */ int *geqidxgen; /* starting locations in equality constraint vector for gen @@ -104,9 +117,12 @@ struct GENParamsRajaHiop { double *qt_dev_; /* min. reactive power gen. limits */ double *qb_dev_; /* max. reactive power gen. limits */ double *pgs_dev_; /* real power output setpoint */ + double *apf_dev_; /* KS: device counterpart of apf */ + double *vs_dev_; /* KS: device counterpart of vs */ int *isrenewable_dev_; /* Is renewable generator? */ int *xidx_dev_; /* starting locations in X vector */ + int *xpdevidx_dev_; /* KS: device coutnerpart of xpdevidx*/ int *gidxbus_dev_; /* starting locations in constraint vector for bus constraints */ int *geqidxgen_dev_; /* starting locations in equality constraint vector for @@ -143,7 +159,7 @@ struct LOADParamsRajaHiop { double *pl; /* active power demand */ double *ql; /* reactive power demand */ double *loadloss_penalty; /* Penalty for load loss */ - int *xidx; /* starting location in X vector */ + int *xidx; /* KS: starting location in X vector */ int *gidx; /* starting location in constraint vector */ /* The following members are only used with HIOP */ @@ -194,7 +210,9 @@ struct LINEParamsRajaHiop { constraint */ int *gbineqidx; /* Starting location to insert contribution to inequality constraint bound */ - int *linelimidx; /* Indices for subset of lines that have finite limits */ + int *linelimidx; /* Indices for subset of lines that have finite limits */ + int *ineqjacsp_idx; /* KS: Position in flat sparse ineq Jacobian array */ + int *xslackidx; /* Starting location of slack variables in X vector */ // Device data double *Gff_dev_; /* From side self conductance */ @@ -217,7 +235,9 @@ struct LINEParamsRajaHiop { int *gbineqidx_dev_; /* Starting location to insert contribution to inequality constraint bound */ int * - linelimidx_dev_; /* Indices for subset of lines that have finite limits */ + linelimidx_dev_; /* Indices for subset of lines that have finite limits */ + int *ineqjacsp_idx_dev_; /* KS: Position in flat sparse ineq Jacobian array */ + int *xslackidx_dev_; /* Starting location of slack variables in X vector */ int allocate(OPFLOW); int destroy(OPFLOW); @@ -248,6 +268,8 @@ struct PbpolModelRajaHiop : public _p_FormPBPOLRAJAHIOP { LINEParamsRajaHiop lineparams; BUSParamsRajaHiop busparams; + int agc_xidx; /* KS: X-vector index for the AGC delta-P variable (ps->startxloc) */ + // Arrays to store Jacobian and Hessian indices and entries on CPU (used with // GPU sparse model) int *i_jaceq, diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp index b574e4bb..cac87986 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp @@ -254,151 +254,88 @@ PetscErrorCode OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE(OPFLOW opflow) { LOADParamsRajaHiop *loadparams = &pbpolrajahiopsparse->loadparams; LINEParamsRajaHiop *lineparams = &pbpolrajahiopsparse->lineparams; - /* Need to compute the number of nonzeros in equality, inequality constraint - * Jacobians and Hessian */ - int nnz_eqjac = 0, nnz_ineqjac = 0, nnz_hess = 0; - - // Find nonzero entries in equality constraint Jacobian by row. Using - // OPFLOWComputeEqualityConstraintJacobian_PBPOL() as a guide. - - PS ps = (PS)opflow->ps; - - for (int ibus = 0; ibus < ps->nbus; ++ibus) { - - PSBUS bus = &(ps->bus[ibus]); - - // Nonzero entries used by each *bus* starts here - - // no matter what, each bus uses 2 rows and 2 columns - // row 1 = real, row2 = reactive - nnz_eqjac += 2; - nnz_eqjac += 2; + PS ps = opflow->ps; + PSBUS bus; + PSGEN gen; + PSLINE line; + PetscInt i, k; - if (bus->ide == ISOLATED_BUS) { - continue; - } + /* KS: Store the AGC variable index (scalar) */ + if (opflow->use_agc) { + pbpolrajahiopsparse->agc_xidx = opflow->idxn2sd_map[ps->startxloc]; + } else { + pbpolrajahiopsparse->agc_xidx = -1; + } - if (opflow->include_powerimbalance_variables) { - // 2 more entries on both real and reactive - nnz_eqjac += 4; - } + /* KS: Compute the number of nonzeros in equality, inequality constraint + * Jacobians and Hessian. Equality and Hessian counts are still obtained + * from PETSc via get_sparse_blocks_info; inequality count is computed + * axplicitly so we can skip PETSc */ + int nnz_eqjacsp = 0, nnz_ineqjacsp = 0, nnz_hesssp = 0; - if (opflow->has_gensetpoint) { - for (int bgen = 0; bgen < bus->ngen; ++bgen) { - PSGEN gen; - ierr = PSBUSGetGen(bus, bgen, &gen); - CHKERRQ(ierr); - - if (!gen->status || gen->isrenewable) - continue; + /* + * KS: Count inequality Jacobian non-zeros. The traversal order must match + * the startineqloc assignment in OPFLOWModelSetUp_PBPOL: for each bus + * (bus ineq, then gen ineq), then for each line. + */ + int geni = 0, gi; + for (i = 0; i < ps->nbus; i++) { + bus = &ps->bus[i]; - // each generator uses 2 rows, 3 columns real, 1 column reactive - nnz_eqjac += 4; + /* Bus voltage-Q-bounds constraints (FIXED_WITHIN_QBOUNDS) */ + if (opflow->genbusvoltagetype == FIXED_WITHIN_QBOUNDS) { + if (bus->ide == PV_BUS || bus->ide == REF_BUS) { + busparams->ineqjacsp_idx[i] = nnz_ineqjacsp; + /* 2 rows, each with ngenON + 1 entries (one per gen Qg + one for V) */ + nnz_ineqjacsp += 2 * (bus->ngenON + 1); } } - } - - // Go through the lines - for (int iline = 0; iline <= ps->nline; ++iline) { - PSLINE line = &(ps->line[iline]); - - if (!line->status) - continue; - - // each line adds 4 (off-diagonal) entries for the "to" bus and 4 - // entries for the "from" bus. Each line also modifies 4 existing - // "to" and "from" bus entries. - nnz_eqjac += 4; - nnz_eqjac += 4; - } - // if there are lines, non-zeros were over counted - if (ps->nline > 0) { - nnz_eqjac -= 8; - } - - if (opflow->has_gensetpoint) { - for (int ibus = 0; ibus < ps->nbus; ++ibus) { - PSBUS bus = &(ps->bus[ibus]); - for (int bgen = 0; bgen < bus->ngen; ++bgen) { - PSGEN gen; - ierr = PSBUSGetGen(bus, bgen, &gen); + /* KS: Generator set-point constraints */ + gi = 0; + if (opflow->has_gensetpoint) { + for (k = 0; k < bus->ngen; k++) { + ierr = PSBUSGetGen(bus, k, &gen); CHKERRQ(ierr); - if (!gen->status) continue; - - nnz_ineqjac += 6; - } - } - } - - if (opflow->genbusvoltagetype == FIXED_WITHIN_QBOUNDS) { - for (int ibus = 0; ibus < ps->nbus; ++ibus) { - PSBUS bus = &(ps->bus[ibus]); - if (bus->ide == PV_BUS || bus->ide == REF_BUS) { - nnz_ineqjac += 2; + if (!gen->isrenewable) { + genparams->ineqjacspgen_idx[geni + gi] = nnz_ineqjacsp; + if (opflow->use_agc) { + nnz_ineqjacsp += 6; /* 2 rows x 3 entries (Pg, delPg, delP) */ + } + } + gi++; } } - } - - for (int iline = 0; iline < opflow->nlinesmon; ++iline) { - nnz_ineqjac += 8; - } - - for (int ibus = 0; ibus < ps->nbus; ++ibus) { - // reserve 2 real and 2 reactive entries for each bus - // 3 upper triangular - nnz_hess += 3; - - if (opflow->include_powerimbalance_variables) { - nnz_hess += 2; - } - } - - for (int i = 0; i < ps->ngen; ++i) { - PSGEN gen = &(ps->gen[i]); - - if (!gen->status) - continue; - - nnz_hess += 2; - - if (opflow->has_gensetpoint) { - if (gen->isrenewable) - continue; - // later ... - // if (opflow->use_agc) { - // nnz_hess += 5; - // } - } - if (opflow->genbusvoltagetype == FIXED_WITHIN_QBOUNDS) { - nnz_hess += 2; - } + geni += bus->ngenON; } - for (int iline = 0; iline < ps->nline; ++iline) { - PSLINE line = &(ps->line[iline]); - + /* Line flow constraints */ + int linej = 0; + for (i = 0; i < ps->nline; i++) { + line = &ps->line[i]; if (!line->status) continue; + if (line->isdcline) + continue; - // 3 diagonal entries for on the from-bus rows (already defined) - // 3 diagonal entries for on the to-bus rows (already defined) - // 4 off-diagonal entries in upper part - nnz_hess += 4; - } - - if (opflow->include_loadloss_variables) { - for (int iload = 0; iload < ps->nload; ++iload) { - nnz_hess += 2; + if (linej < opflow->nlinesmon && opflow->linesmon[linej] == i) { + lineparams->ineqjacsp_idx[linej] = nnz_ineqjacsp; + /* 2 rows x 4 entries (thetaf, Vmf, thetat, Vmt) */ + int entries_per_line = 8; + if (opflow->allow_lineflow_violation) { + entries_per_line += 2; /* 1 slack entry per row */ + } + nnz_ineqjacsp += entries_per_line; + linej++; } } - opflow->nnz_eqjacsp = nnz_eqjac; - opflow->nnz_ineqjacsp = nnz_ineqjac; - opflow->nnz_hesssp = nnz_hess; + opflow->nnz_eqjacsp = nnz_eqjacsp; + opflow->nnz_ineqjacsp = nnz_ineqjacsp; + opflow->nnz_hesssp = nnz_hesssp; ierr = busparams->copy(opflow); ierr = genparams->copy(opflow); diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp new file mode 100644 index 00000000..5fc79ca1 --- /dev/null +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp @@ -0,0 +1,250 @@ + +#include + +#if defined(EXAGO_ENABLE_RAJA) +#if defined(EXAGO_ENABLE_HIOP_SPARSE) + +#include +#include + +#include "pbpolrajahiopsparse_gpu.hpp" + +void ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, + const double *x_dev, + double *jacd_dev) { + PbpolModelRajaHiop *pbpolrajahiopsparse = + reinterpret_cast(opflow->model); + GENParamsRajaHiop *genparams = &pbpolrajahiopsparse->genparams; + BUSParamsRajaHiop *busparams = &pbpolrajahiopsparse->busparams; + + /* + * Generator set-point inequality Jacobian (has_gensetpoint && use_agc). + * + * PETSc reference (pbpol.cpp:1191-1232): + * Row gloc: [apf*delP - delPg, -(Pg - pt), apf*(Pg - pt)] + * Row gloc+1: [apf*delP - delPg, (pb - Pg), -apf*(pb - Pg)] + * + * Flat array layout per generator (6 entries): + * [0..2] = row 0 values (Pg, delPg, delP columns) + * [3..5] = row 1 values (Pg, delPg, delP columns) + */ + if (opflow->has_gensetpoint && opflow->use_agc) { + int *g_ineqjacsp_idx = genparams->ineqjacspgen_idx_dev_; + int *g_xidx = genparams->xidx_dev_; + int *g_xpdevidx = genparams->xpdevidx_dev_; + int *g_isrenewable = genparams->isrenewable_dev_; + double *g_apf = genparams->apf_dev_; + double *g_pt = genparams->pt_dev_; + double *g_pb = genparams->pb_dev_; + int agc_xidx = pbpolrajahiopsparse->agc_xidx; + + RAJA::forall( + RAJA::RangeSegment(0, genparams->ngenON), + RAJA_LAMBDA(RAJA::Index_type i) { + if (g_isrenewable[i]) + return; + + int base = g_ineqjacsp_idx[i]; + double Pg = x_dev[g_xidx[i]]; + double delPg = x_dev[g_xpdevidx[i]]; + double delP = x_dev[agc_xidx]; + double apf = g_apf[i]; + double pt = g_pt[i]; + double pb = g_pb[i]; + + double apf_delP_minus_delPg = apf * delP - delPg; + + /* Row 0: d/d{Pg, delPg, delP} of (apf*delP - delPg)*(Pg - pt) */ + jacd_dev[base + 0] = apf_delP_minus_delPg; + jacd_dev[base + 1] = -(Pg - pt); + jacd_dev[base + 2] = apf * (Pg - pt); + + /* Row 1: d/d{Pg, delPg, delP} of (delPg - apf*delP)*(pb - Pg) */ + jacd_dev[base + 3] = apf_delP_minus_delPg; + jacd_dev[base + 4] = pb - Pg; + jacd_dev[base + 5] = -apf * (pb - Pg); + }); + } + + /* + * FIXED_WITHIN_QBOUNDS voltage constraint Jacobian. + * + * PETSc reference (pbpol.cpp:1235-1278): + * For each PV/REF bus with ngenON generators: + * Row 0 (gloc): [Vset_0-V, Vset_1-V, ..., Qmax-Q] + * Row 1 (gloc+1): [Vset_0-V, Vset_1-V, ..., Qmin-Q] + * + * Flat array layout per bus (2 * (ngenON + 1) entries): + * Row 0: ngenON Qg derivative entries + 1 V derivative entry + * Row 1: ngenON Qg derivative entries + 1 V derivative entry + * + * This kernel parallelizes over buses. The inner loop over generators + * is sequential within each thread (variable-length per bus). + */ + if (opflow->genbusvoltagetype == FIXED_WITHIN_QBOUNDS) { + int *b_ispv = busparams->ispv_dev_; + int *b_isref = busparams->isref_dev_; + int *b_xidx = busparams->xidx_dev_; + int *b_ineqjacsp_idx = busparams->ineqjacsp_idx_dev_; + int *b_genoffset = busparams->genoffset_dev_; + int *b_ngenONbus = busparams->ngenONbus_dev_; + int *g_xidx = genparams->xidx_dev_; + double *g_qt = genparams->qt_dev_; + double *g_qb = genparams->qb_dev_; + double *g_vs = genparams->vs_dev_; + + RAJA::forall( + RAJA::RangeSegment(0, busparams->nbus), + RAJA_LAMBDA(RAJA::Index_type i) { + if (!b_ispv[i] && !b_isref[i]) + return; + + int base = b_ineqjacsp_idx[i]; + int ngen = b_ngenONbus[i]; + int goff = b_genoffset[i]; + double V = x_dev[b_xidx[i] + 1]; + + double Q = 0.0, Qmax = 0.0, Qmin = 0.0; + double Vset = 0.0; + + /* Row 0 and Row 1: Qg derivative entries */ + for (int k = 0; k < ngen; k++) { + int gidx = goff + k; + double Qg = x_dev[g_xidx[gidx] + 1]; + Q += Qg; + Qmax += g_qt[gidx]; + Qmin += g_qb[gidx]; + Vset = g_vs[gidx]; + + double dQg = Vset - V; + /* Row 0 entry for this gen's Qg */ + jacd_dev[base + k] = dQg; + /* Row 1 entry for this gen's Qg */ + jacd_dev[base + ngen + 1 + k] = dQg; + } + + /* Row 0: V derivative entry (last in this row) */ + jacd_dev[base + ngen] = Qmax - Q; + /* Row 1: V derivative entry (last in this row) */ + jacd_dev[base + ngen + 1 + ngen] = Qmin - Q; + }); + } + + /* + * Line flow constraint Jacobian. + * + * Computes dSf2/d{thetaf,Vmf,thetat,Vmt} and dSt2/d{thetaf,Vmf,thetat,Vmt} + * for each monitored line, plus optional slack variable entries (-1.0). + * + * Flat array layout per line (8 entries without slack, 10 with): + * Row 0 (Sf2): [dSf2_dthetaf, dSf2_dVmf, dSf2_dthetat, dSf2_dVmt, + * (-1.0 if slack)] + * Row 1 (St2): [dSt2_dthetaf, dSt2_dVmf, dSt2_dthetat, dSt2_dVmt, + * (-1.0 if slack)] + */ + LINEParamsRajaHiop *lineparams = &pbpolrajahiopsparse->lineparams; + + if (lineparams->nlinelim) { + double *Gff_arr = lineparams->Gff_dev_; + double *Bff_arr = lineparams->Bff_dev_; + double *Gft_arr = lineparams->Gft_dev_; + double *Bft_arr = lineparams->Bft_dev_; + double *Gtf_arr = lineparams->Gtf_dev_; + double *Btf_arr = lineparams->Btf_dev_; + double *Gtt_arr = lineparams->Gtt_dev_; + double *Btt_arr = lineparams->Btt_dev_; + int *linelimidx = lineparams->linelimidx_dev_; + int *xidxf = lineparams->xidxf_dev_; + int *xidxt = lineparams->xidxt_dev_; + int *ineqjacsp_idx = lineparams->ineqjacsp_idx_dev_; + int has_slack = (int)opflow->allow_lineflow_violation; + int row_stride = 4 + has_slack; + + RAJA::forall( + RAJA::RangeSegment(0, lineparams->nlinelim), + RAJA_LAMBDA(RAJA::Index_type i) { + int j = linelimidx[i]; + int base = ineqjacsp_idx[i]; + + double thetaf = x_dev[xidxf[j]]; + double Vmf = x_dev[xidxf[j] + 1]; + double thetat = x_dev[xidxt[j]]; + double Vmt = x_dev[xidxt[j] + 1]; + double thetaft = thetaf - thetat; + double thetatf = thetat - thetaf; + + double Gff = Gff_arr[j], Bff = Bff_arr[j]; + double Gft = Gft_arr[j], Bft = Bft_arr[j]; + double Gtf = Gtf_arr[j], Btf = Btf_arr[j]; + double Gtt = Gtt_arr[j], Btt = Btt_arr[j]; + + double sin_ft = sin(thetaft), cos_ft = cos(thetaft); + double sin_tf = sin(thetatf), cos_tf = cos(thetatf); + + double Pf = Gff * Vmf * Vmf + + Vmf * Vmt * (Gft * cos_ft + Bft * sin_ft); + double Qf = -Bff * Vmf * Vmf + + Vmf * Vmt * (-Bft * cos_ft + Gft * sin_ft); + double Pt = Gtt * Vmt * Vmt + + Vmt * Vmf * (Gtf * cos_tf + Btf * sin_tf); + double Qt = -Btt * Vmt * Vmt + + Vmt * Vmf * (-Btf * cos_tf + Gtf * sin_tf); + + double dSf2_dPf = 2 * Pf, dSf2_dQf = 2 * Qf; + double dSt2_dPt = 2 * Pt, dSt2_dQt = 2 * Qt; + + double dPf_dthetaf = Vmf * Vmt * (-Gft * sin_ft + Bft * cos_ft); + double dPf_dVmf = 2 * Gff * Vmf + + Vmt * (Gft * cos_ft + Bft * sin_ft); + double dPf_dthetat = Vmf * Vmt * (Gft * sin_ft - Bft * cos_ft); + double dPf_dVmt = Vmf * (Gft * cos_ft + Bft * sin_ft); + + double dQf_dthetaf = Vmf * Vmt * (Bft * sin_ft + Gft * cos_ft); + double dQf_dVmf = -2 * Bff * Vmf + + Vmt * (-Bft * cos_ft + Gft * sin_ft); + double dQf_dthetat = Vmf * Vmt * (-Bft * sin_ft - Gft * cos_ft); + double dQf_dVmt = Vmf * (-Bft * cos_ft + Gft * sin_ft); + + double dPt_dthetat = Vmt * Vmf * (-Gtf * sin_tf + Btf * cos_tf); + double dPt_dVmt = 2 * Gtt * Vmt + + Vmf * (Gtf * cos_tf + Btf * sin_tf); + double dPt_dthetaf = Vmt * Vmf * (Gtf * sin_tf - Btf * cos_tf); + double dPt_dVmf = Vmt * (Gtf * cos_tf + Btf * sin_tf); + + double dQt_dthetat = Vmt * Vmf * (Btf * sin_tf + Gtf * cos_tf); + double dQt_dVmt = -2 * Btt * Vmt + + Vmf * (-Btf * cos_tf + Gtf * sin_tf); + double dQt_dthetaf = Vmt * Vmf * (-Btf * sin_tf - Gtf * cos_tf); + double dQt_dVmf = Vmt * (-Btf * cos_tf + Gtf * sin_tf); + + /* Row 0 (Sf2): derivatives w.r.t. thetaf, Vmf, thetat, Vmt */ + jacd_dev[base + 0] = + dSf2_dPf * dPf_dthetaf + dSf2_dQf * dQf_dthetaf; + jacd_dev[base + 1] = + dSf2_dPf * dPf_dVmf + dSf2_dQf * dQf_dVmf; + jacd_dev[base + 2] = + dSf2_dPf * dPf_dthetat + dSf2_dQf * dQf_dthetat; + jacd_dev[base + 3] = + dSf2_dPf * dPf_dVmt + dSf2_dQf * dQf_dVmt; + + /* Row 1 (St2): derivatives w.r.t. thetaf, Vmf, thetat, Vmt */ + jacd_dev[base + row_stride + 0] = + dSt2_dPt * dPt_dthetaf + dSt2_dQt * dQt_dthetaf; + jacd_dev[base + row_stride + 1] = + dSt2_dPt * dPt_dVmf + dSt2_dQt * dQt_dVmf; + jacd_dev[base + row_stride + 2] = + dSt2_dPt * dPt_dthetat + dSt2_dQt * dQt_dthetat; + jacd_dev[base + row_stride + 3] = + dSt2_dPt * dPt_dVmt + dSt2_dQt * dQt_dVmt; + + /* Slack variable entries (Step 5) */ + if (has_slack) { + jacd_dev[base + 4] = -1.0; + jacd_dev[base + row_stride + 4] = -1.0; + } + }); + } +} + +#endif // EXAGO_ENABLE_HIOP_SPARSE +#endif // EXAGO_ENABLE_RAJA diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp new file mode 100644 index 00000000..29d82e87 --- /dev/null +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp @@ -0,0 +1,30 @@ +#include + +#if defined(EXAGO_ENABLE_RAJA) +#if defined(EXAGO_ENABLE_HIOP_SPARSE) + +#pragma once + +#include +#include "pbpolrajahiopsparse.hpp" + +/** + * GPU-only (PETSc-free) computation of inequality constraint Jacobian values. + * + * Replaces the PETSc-based path that calls + * opflow->modelops.computeinequalityconstraintjacobian followed by + * MatGetRow extraction. Writes directly into device memory using RAJA + * kernels, without PETSc Mat/Vec operations; no H2D, D2H copies back and forth. + * + * @param opflow The OPFLOW problem context + * @param x_dev Device array of variable values + * @param jacd_dev Device output array for inequality Jacobian values + * (points to the ineq portion of the sparse Jacobian, + * i.e. MJacS_dev + nnz_eqjacsp) + */ +void ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, + const double *x_dev, + double *jacd_dev); + +#endif // EXAGO_ENABLE_HIOP_SPARSE +#endif // EXAGO_ENABLE_RAJA diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp index 7a9d9983..efddb6f6 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp @@ -14,6 +14,7 @@ #include #include "pbpolrajahiopsparsekernels.hpp" #include "pbpolrajahiopsparse.hpp" +#include "pbpolrajahiopsparse_gpu.hpp" /** * @brief Set the initial guess array for the PBPOLRAJAHIOPSPARSE model. @@ -647,43 +648,11 @@ OPFLOWComputeSparseInequalityConstraintJacobian_PBPOLRAJAHIOPSPARSE( ierr = PetscLogEventBegin(opflow->ineqconsjaclogger, 0, 0, 0, 0); CHKERRQ(ierr); - ierr = VecGetArray(opflow->X, &x); - CHKERRQ(ierr); - - // Copy from device to host - umpire::Allocator h_allocator_ = resmgr.getAllocator("HOST"); - registerWith(x, opflow->nx, resmgr, h_allocator_); - resmgr.copy((double *)x, (double *)x_dev); - - ierr = VecRestoreArray(opflow->X, &x); - CHKERRQ(ierr); - - // Compute inequality constraint jacobian on the host - // The function pointer computeinequalityconstraintjacobian points to - // OPFLOWComputeInequalityConstraintJacobian_PBPOL - ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( - opflow, opflow->X, opflow->Jac_Gi); - CHKERRQ(ierr); - - ierr = MatGetSize(opflow->Jac_Gi, &nrow, &ncol); - CHKERRQ(ierr); - - values = pbpolrajahiopsparse->val_jacineq; - // Unpack PETSc matrix and copy values to the array `values` in - // PbpolModelRajaHiop struct. - for (i = 0; i < nrow; i++) { - ierr = MatGetRow(opflow->Jac_Gi, i, &nvals, &cols, &vals); - CHKERRQ(ierr); - for (j = 0; j < nvals; j++) { - values[j] = vals[j]; - } - values += nvals; - ierr = MatRestoreRow(opflow->Jac_Gi, i, &nvals, &cols, &vals); - CHKERRQ(ierr); - } - // Copy over val_jacineq to device - resmgr.copy(MJacS_dev + opflow->nnz_eqjacsp, - pbpolrajahiopsparse->val_jacineq); + /* KS: Compute inequality constraint Jacobian directly on device. + No H2D, D2H copies: x_dev is already on device, output goes + straight into the ineq portion of MJacS_dev. */ + ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE( + opflow, x_dev, MJacS_dev + opflow->nnz_eqjacsp); ierr = PetscLogEventEnd(opflow->ineqconsjaclogger, 0, 0, 0, 0); CHKERRQ(ierr); diff --git a/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp b/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp index d7a9539d..ffde17c9 100644 --- a/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp +++ b/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp @@ -69,20 +69,9 @@ bool OPFLOWHIOPSPARSEGPUInterface::get_sparse_blocks_info( nnz_sparse_Jaceq = opflow->nnz_eqjacsp = info_eq.nz_used; - nnz_sparse_Jacineq = 0; - if (opflow->Nconineq) { - ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( - opflow, opflow->X, opflow->Jac_Gi); - CHKERRQ(ierr); - ierr = - MatSetOption(opflow->Jac_Gi, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE); - CHKERRQ(ierr); - - ierr = MatGetInfo(opflow->Jac_Gi, MAT_LOCAL, &info_ineq); - CHKERRQ(ierr); - - nnz_sparse_Jacineq = opflow->nnz_ineqjacsp = info_ineq.nz_used; - } + /* KS: Use pre-computed nnz_ineqjacsp from model setup (avoids PETSc Mat + assembly just for counting non-zeros -- not sure if faster or if it scales but hey no PETSc!). */ + nnz_sparse_Jacineq = opflow->nnz_ineqjacsp; /* Compute non-zeros for Hessian */ ierr = (*opflow->modelops.computehessian)(opflow, opflow->X, opflow->Lambdae, diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index ace550c4..13c6b989 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -13,6 +13,17 @@ add_executable(test_pflow test_pflow.cpp utils/test_acopf_utils.cpp) target_link_libraries(test_pflow ExaGO::PFLOW) target_include_directories(test_pflow PRIVATE ./utils) +if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIOP_SPARSE) + if(EXAGO_ENABLE_CUDA) + set_source_files_properties(test_ineqjac_gpu.cpp PROPERTIES LANGUAGE CUDA) + endif() + add_executable(test_ineqjac_gpu test_ineqjac_gpu.cpp) + target_link_libraries(test_ineqjac_gpu ExaGO::OPFLOW) + target_include_directories( + test_ineqjac_gpu PRIVATE ${CMAKE_SOURCE_DIR}/src/opflow + ) +endif() + add_executable(test_error_handler test_error_handler.cpp) target_link_libraries(test_error_handler ExaGO::UTILS) @@ -49,6 +60,22 @@ if(EXAGO_RUN_TESTS) ${network_files} ) + if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIOP_SPARSE) + exago_add_test( + NAME + "UNIT_TEST_8_INEQJAC_GPU" + DEPENDS + HIOP + COMMAND + ${RUNCMD} + $ + -opflow_genbusvoltage + VARIABLE_WITHIN_BOUNDS + NETFILES + ${network_files} + ) + endif() + exago_add_test( NAME UNIT_TESTS_UTILS COMMAND $ ) diff --git a/tests/unit/test_ineqjac_gpu.cpp b/tests/unit/test_ineqjac_gpu.cpp new file mode 100644 index 00000000..e5dc2f75 --- /dev/null +++ b/tests/unit/test_ineqjac_gpu.cpp @@ -0,0 +1,394 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#if defined(EXAGO_ENABLE_RAJA) +#include +#include +#include +#include +#endif + +#ifdef EXAGO_ENABLE_GPU +#include +#endif + +#include "model/power_bal_hiop/paramsrajahiop.h" +#include "model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp" + +static const double TOL = 1e-8; + +static int compare_arrays(const double *ref, const double *gpu, int n, + const char *label) { + int fail = 0; + for (int i = 0; i < n; i++) { + if (std::abs(ref[i] - gpu[i]) / (1.0 + std::abs(ref[i])) > TOL) { + std::cout << " MISMATCH " << label << "[" << i << "]: PETSc=" << ref[i] + << " GPU=" << gpu[i] + << " diff=" << std::abs(ref[i] - gpu[i]) << std::endl; + fail++; + } + } + return fail; +} + +/** + * Test 8: Validate GPU inequality constraint Jacobian values against PETSc. + * + * First solves with IPOPT/PBPOL to obtain a realistic solution, then + * sets up the PBPOLRAJAHIOPSPARSE model and evaluates the inequality + * Jacobian at the converged solution using both the PETSc reference + * path and the GPU RAJA kernel. Compares element-by-element. + */ +int main(int argc, char **argv) { + PetscErrorCode ierr; + PetscBool flg; + char file_c_str[PETSC_MAX_PATH_LEN]; + std::string file; + char appname[] = "opflow"; + MPI_Comm comm = MPI_COMM_WORLD; + char help[] = "Test 8: GPU ineq Jacobian validation\n"; + int fail = 0; + + ierr = ExaGOInitialize(comm, &argc, &argv, appname, help); + if (ierr) { + fprintf(stderr, "Could not initialize ExaGO.\n"); + return ierr; + } + + ierr = PetscOptionsGetString(NULL, NULL, "-netfile", file_c_str, + PETSC_MAX_PATH_LEN, &flg); + CHKERRQ(ierr); + if (!flg) + file = "../datafiles/case9/case9mod.m"; + else + file.assign(file_c_str); + + std::cout << "=== Test 8: GPU Inequality Jacobian Validation ===" << std::endl; + std::cout << "Network file: " << file << std::endl; + + /* ------------------------------------------------------------------ + * Step 1: Solve with IPOPT/PBPOL to get a realistic solution + * ------------------------------------------------------------------ */ + OPFLOW opflow_ref; + Vec Xsol; + + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow_ref, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow_ref, OPFLOWMODEL_PBPOL); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow_ref, OPFLOWSOLVER_IPOPT); + CHKERRQ(ierr); + ierr = OPFLOWSolve(opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWGetSolution(opflow_ref, &Xsol); + CHKERRQ(ierr); + + std::cout << "IPOPT solve complete." << std::endl; + + /* Get the solution array (natural ordering) */ + PetscInt nx_ref; + ierr = VecGetSize(Xsol, &nx_ref); + CHKERRQ(ierr); + double *xsol_nat; + ierr = VecGetArray(Xsol, &xsol_nat); + CHKERRQ(ierr); + + /* ------------------------------------------------------------------ + * Step 2: Create the PBPOLRAJAHIOPSPARSE model and set up + * ------------------------------------------------------------------ */ + OPFLOW opflow; + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow, OPFLOWMODEL_PBPOLRAJAHIOPSPARSE); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow, OPFLOWSOLVER_HIOPSPARSEGPU); + CHKERRQ(ierr); + ierr = OPFLOWSetUp(opflow); + CHKERRQ(ierr); + + int nx, nconeq, nconineq; + ierr = OPFLOWGetSizes(opflow, &nx, &nconeq, &nconineq); + CHKERRQ(ierr); + + std::cout << "nx=" << nx << " nconeq=" << nconeq + << " nconineq=" << nconineq + << " nnz_ineqjacsp=" << opflow->nnz_ineqjacsp << std::endl; + + if (!nconineq) { + std::cout << "No inequality constraints -- nothing to test. PASS." + << std::endl; + ierr = VecRestoreArray(Xsol, &xsol_nat); + CHKERRQ(ierr); + ierr = OPFLOWDestroy(&opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWDestroy(&opflow); + CHKERRQ(ierr); + ExaGOFinalize(); + return 0; + } + + /* ------------------------------------------------------------------ + * Step 3: Copy IPOPT solution into opflow->X (natural ordering) + * ------------------------------------------------------------------ */ + double *x_nat; + ierr = VecGetArray(opflow->X, &x_nat); + CHKERRQ(ierr); + for (int i = 0; i < nx; i++) + x_nat[i] = xsol_nat[i]; + ierr = VecRestoreArray(opflow->X, &x_nat); + CHKERRQ(ierr); + + ierr = VecRestoreArray(Xsol, &xsol_nat); + CHKERRQ(ierr); + + std::cout << "Evaluating Jacobian at IPOPT-converged solution." << std::endl; + + /* ------------------------------------------------------------------ + * Step 4: Compute reference inequality Jacobian via PETSc. + * The first call establishes the sparsity pattern, the second + * computes the actual values at the converged solution. + * ------------------------------------------------------------------ */ + ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( + opflow, opflow->X, opflow->Jac_Gi); + CHKERRQ(ierr); + ierr = MatSetOption(opflow->Jac_Gi, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE); + CHKERRQ(ierr); + ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( + opflow, opflow->X, opflow->Jac_Gi); + CHKERRQ(ierr); + + int nnz = opflow->nnz_ineqjacsp; + + auto &resmgr = umpire::ResourceManager::getInstance(); + umpire::Allocator h_allocator = resmgr.getAllocator("HOST"); + + double *ref_vals = + static_cast(h_allocator.allocate(nnz * sizeof(double))); + + PetscInt nrow, ncol; + ierr = MatGetSize(opflow->Jac_Gi, &nrow, &ncol); + CHKERRQ(ierr); + + double *vptr = ref_vals; + for (int i = 0; i < nrow; i++) { + PetscInt nvals; + const PetscInt *cols; + const PetscScalar *vals; + ierr = MatGetRow(opflow->Jac_Gi, i, &nvals, &cols, &vals); + CHKERRQ(ierr); + for (int j = 0; j < nvals; j++) + vptr[j] = vals[j]; + vptr += nvals; + ierr = MatRestoreRow(opflow->Jac_Gi, i, &nvals, &cols, &vals); + CHKERRQ(ierr); + } + + int ref_count = (int)(vptr - ref_vals); + std::cout << "PETSc extracted " << ref_count << " ineq Jacobian values" + << " (expected " << nnz << ")" << std::endl; + + if (ref_count != nnz) { + std::cout << "FAIL: NNZ mismatch! PETSc=" << ref_count + << " analytical=" << nnz << std::endl; + fail++; + } + + /* ------------------------------------------------------------------ + * Step 5: Compute GPU inequality Jacobian at the same solution + * ------------------------------------------------------------------ */ + double *x_host; + ierr = VecGetArray(opflow->X, &x_host); + CHKERRQ(ierr); + + double *x_sd = + static_cast(h_allocator.allocate(nx * sizeof(double))); + for (int i = 0; i < nx; i++) + x_sd[opflow->idxn2sd_map[i]] = x_host[i]; + + ierr = VecRestoreArray(opflow->X, &x_host); + CHKERRQ(ierr); + + double *gpu_vals; + double *x_dev, *gpu_vals_dev; + +#ifdef EXAGO_ENABLE_GPU + umpire::Allocator d_allocator = resmgr.getAllocator("DEVICE"); + x_dev = static_cast(d_allocator.allocate(nx * sizeof(double))); + gpu_vals_dev = + static_cast(d_allocator.allocate(nnz * sizeof(double))); + resmgr.memset(gpu_vals_dev, 0, nnz * sizeof(double)); +#else + x_dev = x_sd; + gpu_vals_dev = + static_cast(h_allocator.allocate(nnz * sizeof(double))); + memset(gpu_vals_dev, 0, nnz * sizeof(double)); +#endif + + umpire::util::AllocationRecord rec_x{x_sd, sizeof(double) * nx, + h_allocator.getAllocationStrategy()}; + resmgr.registerAllocation(x_sd, rec_x); +#ifdef EXAGO_ENABLE_GPU + resmgr.copy(x_dev, x_sd); +#endif + + std::cout << "Running RAJA GPU inequality Jacobian kernel..." << std::endl; + ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(opflow, x_dev, gpu_vals_dev); + + gpu_vals = static_cast(h_allocator.allocate(nnz * sizeof(double))); +#ifdef EXAGO_ENABLE_GPU + resmgr.copy(gpu_vals, gpu_vals_dev); +#else + memcpy(gpu_vals, gpu_vals_dev, nnz * sizeof(double)); +#endif + + /* ------------------------------------------------------------------ + * Step 6: Compare + * ------------------------------------------------------------------ */ + std::cout << "Comparing " << nnz << " inequality Jacobian values..." + << std::endl; + int cmp_fail = compare_arrays(ref_vals, gpu_vals, nnz, "ineqjac"); + fail += cmp_fail; + + if (cmp_fail == 0) + std::cout << "PASS: All " << nnz + << " inequality Jacobian values match within tol=" << TOL + << std::endl; + else + std::cout << "FAIL: " << cmp_fail << " of " << nnz << " values differ" + << std::endl; + + /* ------------------------------------------------------------------ + * Step 7: Performance comparison (enabled with -benchmark flag) + * ------------------------------------------------------------------ */ + PetscBool run_benchmark = PETSC_FALSE; + ierr = PetscOptionsGetBool(NULL, NULL, "-benchmark", NULL, &run_benchmark); + CHKERRQ(ierr); + + if (run_benchmark) { + int niters = 1000; + PetscInt bench_nrow, bench_ncol; + ierr = MatGetSize(opflow->Jac_Gi, &bench_nrow, &bench_ncol); + CHKERRQ(ierr); + + double *bench_vals = + static_cast(h_allocator.allocate(nnz * sizeof(double))); + + std::cout << "\n=== Performance Benchmark (" << niters + << " iterations) ===" << std::endl; + + /* --- PETSc path: compute + MatGetRow extraction + copy to device --- */ + { + double *bench_dev; + size_t nnz_bytes = nnz * sizeof(double); +#ifdef EXAGO_ENABLE_GPU + bench_dev = static_cast(d_allocator.allocate(nnz_bytes)); +#else + bench_dev = static_cast(h_allocator.allocate(nnz_bytes)); +#endif + + auto t0 = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < niters; iter++) { + ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( + opflow, opflow->X, opflow->Jac_Gi); + + double *vp = bench_vals; + for (int i = 0; i < bench_nrow; i++) { + PetscInt nv; + const PetscInt *c; + const PetscScalar *v; + MatGetRow(opflow->Jac_Gi, i, &nv, &c, &v); + for (int j = 0; j < nv; j++) + vp[j] = v[j]; + vp += nv; + MatRestoreRow(opflow->Jac_Gi, i, &nv, &c, &v); + } +#ifdef EXAGO_ENABLE_GPU + (void)hipMemcpy(bench_dev, bench_vals, nnz_bytes, + hipMemcpyHostToDevice); +#else + memcpy(bench_dev, bench_vals, nnz_bytes); +#endif + } + auto t1 = std::chrono::high_resolution_clock::now(); + double petsc_us = + std::chrono::duration(t1 - t0).count() / niters; + std::cout << " PETSc path (compute + MatGetRow + copy): " << petsc_us + << " us/iter" << std::endl; + +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(bench_dev); +#else + h_allocator.deallocate(bench_dev); +#endif + } + + /* --- GPU path: RAJA kernels, no copies --- */ + { + double *bench_dev; +#ifdef EXAGO_ENABLE_GPU + bench_dev = + static_cast(d_allocator.allocate(nnz * sizeof(double))); +#else + bench_dev = + static_cast(h_allocator.allocate(nnz * sizeof(double))); +#endif + +#ifdef EXAGO_ENABLE_GPU + (void)hipDeviceSynchronize(); +#endif + auto t0 = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < niters; iter++) { + ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(opflow, x_dev, bench_dev); + } +#ifdef EXAGO_ENABLE_GPU + (void)hipDeviceSynchronize(); +#endif + auto t1 = std::chrono::high_resolution_clock::now(); + double gpu_us = + std::chrono::duration(t1 - t0).count() / niters; + std::cout << " GPU path (RAJA kernels, no copies): " << gpu_us + << " us/iter" << std::endl; + +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(bench_dev); +#else + h_allocator.deallocate(bench_dev); +#endif + } + + h_allocator.deallocate(bench_vals); + std::cout << "=== End Benchmark ===" << std::endl; + } + + /* ------------------------------------------------------------------ + * Cleanup + * ------------------------------------------------------------------ */ + h_allocator.deallocate(ref_vals); + h_allocator.deallocate(gpu_vals); + h_allocator.deallocate(x_sd); +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(x_dev); + d_allocator.deallocate(gpu_vals_dev); +#else + h_allocator.deallocate(gpu_vals_dev); +#endif + + ierr = OPFLOWDestroy(&opflow); + CHKERRQ(ierr); + ierr = OPFLOWDestroy(&opflow_ref); + CHKERRQ(ierr); + ExaGOFinalize(); + + return fail; +} From a34a169bf4a9d6f0bc8ca87099f8ba49ca4a37e7 Mon Sep 17 00:00:00 2001 From: pelesh Date: Thu, 16 Apr 2026 21:04:28 +0000 Subject: [PATCH 2/8] Apply pre-commmit fixes --- src/opflow/CMakeLists.txt | 3 +- .../model/power_bal_hiop/paramsrajahiop.cpp | 4 +- .../model/power_bal_hiop/paramsrajahiop.h | 37 +++++++-------- .../pbpolrajahiopsparse_gpu.cpp | 46 ++++++++----------- .../pbpolrajahiopsparse_gpu.hpp | 4 +- .../solver/hiop/opflow_hiopsparsegpu.cpp | 3 +- tests/unit/test_ineqjac_gpu.cpp | 10 ++-- 7 files changed, 51 insertions(+), 56 deletions(-) diff --git a/src/opflow/CMakeLists.txt b/src/opflow/CMakeLists.txt index fd767aea..863f45f4 100644 --- a/src/opflow/CMakeLists.txt +++ b/src/opflow/CMakeLists.txt @@ -41,8 +41,7 @@ if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_CUDA) set_source_files_properties( model/power_bal_hiop/pbpolrajahiopkernels.cpp model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp - model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp PROPERTIES LANGUAGE - CUDA + model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp PROPERTIES LANGUAGE CUDA ) endif() diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp b/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp index 583f273b..fbe44e4e 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp @@ -802,8 +802,8 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { xidx[geni] = opflow->idxn2sd_map[loc]; xpdevidx[geni] = (opflow->has_gensetpoint && !gen->isrenewable) - ? opflow->idxn2sd_map[gen->startxpdevloc] - : -1; + ? opflow->idxn2sd_map[gen->startxpdevloc] + : -1; gidxbus[geni] = gloc; if (opflow->has_gensetpoint) { geqidxgen[geni] = gen->starteqloc; diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.h b/src/opflow/model/power_bal_hiop/paramsrajahiop.h index 6121a4c6..d274f990 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.h +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.h @@ -28,12 +28,12 @@ struct BUSParamsRajaHiop { vector */ int *jacsp_idx; /* Location number in the sparse Jacobian for Pimb */ int *jacsq_idx; /* Location number in the sparse Jacobian for Qimb */ - int *hesssp_idx; /* KS: Hessian indices */ - int *ispv; /* KS: ispv[i] = 1 if bus is PV bus */ - int *gineqidx; /* KS: starting position of bus ineq constraints */ - int *ineqjacsp_idx; /* KS: index in flat sparse ineq Jacobian array */ - int *genoffset; /* KS: Offset into flattened gen array for this bus */ - int *ngenONbus; /* KS: Number of ON generators on this bus */ + int *hesssp_idx; /* KS: Hessian indices */ + int *ispv; /* KS: ispv[i] = 1 if bus is PV bus */ + int *gineqidx; /* KS: starting position of bus ineq constraints */ + int *ineqjacsp_idx; /* KS: index in flat sparse ineq Jacobian array */ + int *genoffset; /* KS: Offset into flattened gen array for this bus */ + int *ngenONbus; /* KS: Number of ON generators on this bus */ // Device data int *isref_dev_; /* isref[i] = 1 if bus is reference bus */ @@ -51,14 +51,14 @@ struct BUSParamsRajaHiop { X vector */ int *gidx_dev_; /* starting locations for bus balance equations in constraint vector */ - int *jacsp_idx_dev_; /* Location number in the sparse Jacobian for Pimb */ - int *jacsq_idx_dev_; /* Location number in the sparse Jacobian for Qimb */ - int *hesssp_idx_dev_; /* Location number in the Hessian */ - int *ispv_dev_; /* KS: dev counterpart of ispv */ - int *gineqidx_dev_; /* KS: dev counterpart of gineqidx */ + int *jacsp_idx_dev_; /* Location number in the sparse Jacobian for Pimb */ + int *jacsq_idx_dev_; /* Location number in the sparse Jacobian for Qimb */ + int *hesssp_idx_dev_; /* Location number in the Hessian */ + int *ispv_dev_; /* KS: dev counterpart of ispv */ + int *gineqidx_dev_; /* KS: dev counterpart of gineqidx */ int *ineqjacsp_idx_dev_; /* KS: device counterpart of ineqjacsp_idx_ */ - int *genoffset_dev_; /* KS: dev counterpart of genoffset */ - int *ngenONbus_dev_; /* KS: dev counterpart of ngenONbus */ + int *genoffset_dev_; /* KS: dev counterpart of genoffset */ + int *ngenONbus_dev_; /* KS: dev counterpart of ngenONbus */ int allocate(OPFLOW); int destroy(OPFLOW); @@ -208,8 +208,8 @@ struct LINEParamsRajaHiop { contribution in constraints vector */ int *gineqidx; /* Starting location to insert contribution to inequality constraint */ - int *gbineqidx; /* Starting location to insert contribution to inequality - constraint bound */ + int *gbineqidx; /* Starting location to insert contribution to inequality + constraint bound */ int *linelimidx; /* Indices for subset of lines that have finite limits */ int *ineqjacsp_idx; /* KS: Position in flat sparse ineq Jacobian array */ int *xslackidx; /* Starting location of slack variables in X vector */ @@ -235,9 +235,9 @@ struct LINEParamsRajaHiop { int *gbineqidx_dev_; /* Starting location to insert contribution to inequality constraint bound */ int * - linelimidx_dev_; /* Indices for subset of lines that have finite limits */ + linelimidx_dev_; /* Indices for subset of lines that have finite limits */ int *ineqjacsp_idx_dev_; /* KS: Position in flat sparse ineq Jacobian array */ - int *xslackidx_dev_; /* Starting location of slack variables in X vector */ + int *xslackidx_dev_; /* Starting location of slack variables in X vector */ int allocate(OPFLOW); int destroy(OPFLOW); @@ -268,7 +268,8 @@ struct PbpolModelRajaHiop : public _p_FormPBPOLRAJAHIOP { LINEParamsRajaHiop lineparams; BUSParamsRajaHiop busparams; - int agc_xidx; /* KS: X-vector index for the AGC delta-P variable (ps->startxloc) */ + int agc_xidx; /* KS: X-vector index for the AGC delta-P variable + (ps->startxloc) */ // Arrays to store Jacobian and Hessian indices and entries on CPU (used with // GPU sparse model) diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp index 5fc79ca1..05705a35 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp @@ -10,8 +10,8 @@ #include "pbpolrajahiopsparse_gpu.hpp" void ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, - const double *x_dev, - double *jacd_dev) { + const double *x_dev, + double *jacd_dev) { PbpolModelRajaHiop *pbpolrajahiopsparse = reinterpret_cast(opflow->model); GENParamsRajaHiop *genparams = &pbpolrajahiopsparse->genparams; @@ -181,51 +181,45 @@ void ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, double sin_ft = sin(thetaft), cos_ft = cos(thetaft); double sin_tf = sin(thetatf), cos_tf = cos(thetatf); - double Pf = Gff * Vmf * Vmf + - Vmf * Vmt * (Gft * cos_ft + Bft * sin_ft); - double Qf = -Bff * Vmf * Vmf + - Vmf * Vmt * (-Bft * cos_ft + Gft * sin_ft); - double Pt = Gtt * Vmt * Vmt + - Vmt * Vmf * (Gtf * cos_tf + Btf * sin_tf); - double Qt = -Btt * Vmt * Vmt + - Vmt * Vmf * (-Btf * cos_tf + Gtf * sin_tf); + double Pf = + Gff * Vmf * Vmf + Vmf * Vmt * (Gft * cos_ft + Bft * sin_ft); + double Qf = + -Bff * Vmf * Vmf + Vmf * Vmt * (-Bft * cos_ft + Gft * sin_ft); + double Pt = + Gtt * Vmt * Vmt + Vmt * Vmf * (Gtf * cos_tf + Btf * sin_tf); + double Qt = + -Btt * Vmt * Vmt + Vmt * Vmf * (-Btf * cos_tf + Gtf * sin_tf); double dSf2_dPf = 2 * Pf, dSf2_dQf = 2 * Qf; double dSt2_dPt = 2 * Pt, dSt2_dQt = 2 * Qt; double dPf_dthetaf = Vmf * Vmt * (-Gft * sin_ft + Bft * cos_ft); - double dPf_dVmf = 2 * Gff * Vmf + - Vmt * (Gft * cos_ft + Bft * sin_ft); + double dPf_dVmf = 2 * Gff * Vmf + Vmt * (Gft * cos_ft + Bft * sin_ft); double dPf_dthetat = Vmf * Vmt * (Gft * sin_ft - Bft * cos_ft); double dPf_dVmt = Vmf * (Gft * cos_ft + Bft * sin_ft); double dQf_dthetaf = Vmf * Vmt * (Bft * sin_ft + Gft * cos_ft); - double dQf_dVmf = -2 * Bff * Vmf + - Vmt * (-Bft * cos_ft + Gft * sin_ft); + double dQf_dVmf = + -2 * Bff * Vmf + Vmt * (-Bft * cos_ft + Gft * sin_ft); double dQf_dthetat = Vmf * Vmt * (-Bft * sin_ft - Gft * cos_ft); double dQf_dVmt = Vmf * (-Bft * cos_ft + Gft * sin_ft); double dPt_dthetat = Vmt * Vmf * (-Gtf * sin_tf + Btf * cos_tf); - double dPt_dVmt = 2 * Gtt * Vmt + - Vmf * (Gtf * cos_tf + Btf * sin_tf); + double dPt_dVmt = 2 * Gtt * Vmt + Vmf * (Gtf * cos_tf + Btf * sin_tf); double dPt_dthetaf = Vmt * Vmf * (Gtf * sin_tf - Btf * cos_tf); double dPt_dVmf = Vmt * (Gtf * cos_tf + Btf * sin_tf); double dQt_dthetat = Vmt * Vmf * (Btf * sin_tf + Gtf * cos_tf); - double dQt_dVmt = -2 * Btt * Vmt + - Vmf * (-Btf * cos_tf + Gtf * sin_tf); + double dQt_dVmt = + -2 * Btt * Vmt + Vmf * (-Btf * cos_tf + Gtf * sin_tf); double dQt_dthetaf = Vmt * Vmf * (-Btf * sin_tf - Gtf * cos_tf); double dQt_dVmf = Vmt * (-Btf * cos_tf + Gtf * sin_tf); /* Row 0 (Sf2): derivatives w.r.t. thetaf, Vmf, thetat, Vmt */ - jacd_dev[base + 0] = - dSf2_dPf * dPf_dthetaf + dSf2_dQf * dQf_dthetaf; - jacd_dev[base + 1] = - dSf2_dPf * dPf_dVmf + dSf2_dQf * dQf_dVmf; - jacd_dev[base + 2] = - dSf2_dPf * dPf_dthetat + dSf2_dQf * dQf_dthetat; - jacd_dev[base + 3] = - dSf2_dPf * dPf_dVmt + dSf2_dQf * dQf_dVmt; + jacd_dev[base + 0] = dSf2_dPf * dPf_dthetaf + dSf2_dQf * dQf_dthetaf; + jacd_dev[base + 1] = dSf2_dPf * dPf_dVmf + dSf2_dQf * dQf_dVmf; + jacd_dev[base + 2] = dSf2_dPf * dPf_dthetat + dSf2_dQf * dQf_dthetat; + jacd_dev[base + 3] = dSf2_dPf * dPf_dVmt + dSf2_dQf * dQf_dVmt; /* Row 1 (St2): derivatives w.r.t. thetaf, Vmf, thetat, Vmt */ jacd_dev[base + row_stride + 0] = diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp index 29d82e87..938686a7 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp @@ -23,8 +23,8 @@ * i.e. MJacS_dev + nnz_eqjacsp) */ void ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, - const double *x_dev, - double *jacd_dev); + const double *x_dev, + double *jacd_dev); #endif // EXAGO_ENABLE_HIOP_SPARSE #endif // EXAGO_ENABLE_RAJA diff --git a/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp b/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp index ffde17c9..d768e738 100644 --- a/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp +++ b/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp @@ -70,7 +70,8 @@ bool OPFLOWHIOPSPARSEGPUInterface::get_sparse_blocks_info( nnz_sparse_Jaceq = opflow->nnz_eqjacsp = info_eq.nz_used; /* KS: Use pre-computed nnz_ineqjacsp from model setup (avoids PETSc Mat - assembly just for counting non-zeros -- not sure if faster or if it scales but hey no PETSc!). */ + assembly just for counting non-zeros -- not sure if faster or if it scales + but hey no PETSc!). */ nnz_sparse_Jacineq = opflow->nnz_ineqjacsp; /* Compute non-zeros for Hessian */ diff --git a/tests/unit/test_ineqjac_gpu.cpp b/tests/unit/test_ineqjac_gpu.cpp index e5dc2f75..1492010e 100644 --- a/tests/unit/test_ineqjac_gpu.cpp +++ b/tests/unit/test_ineqjac_gpu.cpp @@ -30,8 +30,8 @@ static int compare_arrays(const double *ref, const double *gpu, int n, for (int i = 0; i < n; i++) { if (std::abs(ref[i] - gpu[i]) / (1.0 + std::abs(ref[i])) > TOL) { std::cout << " MISMATCH " << label << "[" << i << "]: PETSc=" << ref[i] - << " GPU=" << gpu[i] - << " diff=" << std::abs(ref[i] - gpu[i]) << std::endl; + << " GPU=" << gpu[i] << " diff=" << std::abs(ref[i] - gpu[i]) + << std::endl; fail++; } } @@ -70,7 +70,8 @@ int main(int argc, char **argv) { else file.assign(file_c_str); - std::cout << "=== Test 8: GPU Inequality Jacobian Validation ===" << std::endl; + std::cout << "=== Test 8: GPU Inequality Jacobian Validation ===" + << std::endl; std::cout << "Network file: " << file << std::endl; /* ------------------------------------------------------------------ @@ -121,8 +122,7 @@ int main(int argc, char **argv) { ierr = OPFLOWGetSizes(opflow, &nx, &nconeq, &nconineq); CHKERRQ(ierr); - std::cout << "nx=" << nx << " nconeq=" << nconeq - << " nconineq=" << nconineq + std::cout << "nx=" << nx << " nconeq=" << nconeq << " nconineq=" << nconineq << " nnz_ineqjacsp=" << opflow->nnz_ineqjacsp << std::endl; if (!nconineq) { From 89c12adddf121a17756b2043861ccc4ac24728c0 Mon Sep 17 00:00:00 2001 From: Slaven Peles Date: Mon, 20 Apr 2026 15:21:11 -0400 Subject: [PATCH 3/8] Remove redundant message from CMake. --- tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt b/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt index 4c9e1630..cd2f33b2 100644 --- a/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt +++ b/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt @@ -71,7 +71,6 @@ if(EXAGO_RUN_TESTS) set(testname "UNIT_TESTS_EQUALITY_CONSTRAINT_JACOBIAN_${net}_${solver}_${model}" ) - message(STATUS "Setting up test: ${testname}") exago_add_test( NAME ${testname} From 888bd8726980349d77bcbbfc5f4b762f07d9a75a Mon Sep 17 00:00:00 2001 From: Slaven Peles Date: Mon, 20 Apr 2026 15:23:25 -0400 Subject: [PATCH 4/8] Remove/guard HIP specific code in test_ineqjac_gpu.cpp --- tests/unit/test_ineqjac_gpu.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_ineqjac_gpu.cpp b/tests/unit/test_ineqjac_gpu.cpp index 1492010e..a9c49764 100644 --- a/tests/unit/test_ineqjac_gpu.cpp +++ b/tests/unit/test_ineqjac_gpu.cpp @@ -314,8 +314,7 @@ int main(int argc, char **argv) { MatRestoreRow(opflow->Jac_Gi, i, &nv, &c, &v); } #ifdef EXAGO_ENABLE_GPU - (void)hipMemcpy(bench_dev, bench_vals, nnz_bytes, - hipMemcpyHostToDevice); + resmgr.copy(bench_dev, bench_vals); #else memcpy(bench_dev, bench_vals, nnz_bytes); #endif @@ -344,14 +343,14 @@ int main(int argc, char **argv) { static_cast(h_allocator.allocate(nnz * sizeof(double))); #endif -#ifdef EXAGO_ENABLE_GPU +#ifdef EXAGO_ENABLE_HIP (void)hipDeviceSynchronize(); #endif auto t0 = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < niters; iter++) { ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(opflow, x_dev, bench_dev); } -#ifdef EXAGO_ENABLE_GPU +#ifdef EXAGO_ENABLE_HIP (void)hipDeviceSynchronize(); #endif auto t1 = std::chrono::high_resolution_clock::now(); From 580192c7394477290738e69b495a774ebcd263bf Mon Sep 17 00:00:00 2001 From: Slaven Peles Date: Mon, 20 Apr 2026 15:42:13 -0400 Subject: [PATCH 5/8] Fix names in inequality Jacobian test. --- tests/unit/CMakeLists.txt | 2 +- tests/unit/test_ineqjac_gpu.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 13c6b989..b052d727 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -63,7 +63,7 @@ if(EXAGO_RUN_TESTS) if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIOP_SPARSE) exago_add_test( NAME - "UNIT_TEST_8_INEQJAC_GPU" + "UNIT_TEST_INEQJAC_GPU" DEPENDS HIOP COMMAND diff --git a/tests/unit/test_ineqjac_gpu.cpp b/tests/unit/test_ineqjac_gpu.cpp index a9c49764..0c984e49 100644 --- a/tests/unit/test_ineqjac_gpu.cpp +++ b/tests/unit/test_ineqjac_gpu.cpp @@ -39,7 +39,7 @@ static int compare_arrays(const double *ref, const double *gpu, int n, } /** - * Test 8: Validate GPU inequality constraint Jacobian values against PETSc. + * @brief Validate GPU inequality constraint Jacobian values against PETSc. * * First solves with IPOPT/PBPOL to obtain a realistic solution, then * sets up the PBPOLRAJAHIOPSPARSE model and evaluates the inequality From a5ad9adc8974b20479e15bcd41cd6f7ee0a3b95c Mon Sep 17 00:00:00 2001 From: Nicholson Koukpaizan Date: Mon, 20 Apr 2026 14:29:53 -0400 Subject: [PATCH 6/8] Add benchmark timing for CPU-only evaluation. --- tests/unit/test_ineqjac_gpu.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/unit/test_ineqjac_gpu.cpp b/tests/unit/test_ineqjac_gpu.cpp index 0c984e49..19d3514d 100644 --- a/tests/unit/test_ineqjac_gpu.cpp +++ b/tests/unit/test_ineqjac_gpu.cpp @@ -332,6 +332,23 @@ int main(int argc, char **argv) { #endif } + /* --- PETSc path: CPU compute only --- */ + { + auto t0 = std::chrono::high_resolution_clock::now(); + + for (int iter = 0; iter < niters; iter++) { + ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( + opflow, opflow->X, opflow->Jac_Gi); + } + + auto t1 = std::chrono::high_resolution_clock::now(); + + double petsc_us = + std::chrono::duration(t1 - t0).count() / niters; + std::cout << " PETSc path (compute only): " << petsc_us + << " us/iter" << std::endl; + } + /* --- GPU path: RAJA kernels, no copies --- */ { double *bench_dev; From 6bb54fcd2ed820d9150ed78cd0c5ed676b72cc1a Mon Sep 17 00:00:00 2001 From: Slaven Peles Date: Mon, 20 Apr 2026 16:22:33 -0400 Subject: [PATCH 7/8] format fix --- tests/unit/test_ineqjac_gpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_ineqjac_gpu.cpp b/tests/unit/test_ineqjac_gpu.cpp index 19d3514d..6de8f12a 100644 --- a/tests/unit/test_ineqjac_gpu.cpp +++ b/tests/unit/test_ineqjac_gpu.cpp @@ -333,7 +333,7 @@ int main(int argc, char **argv) { } /* --- PETSc path: CPU compute only --- */ - { + { auto t0 = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < niters; iter++) { From 60328e8459ddb4c6c855a55a09412e853502b563 Mon Sep 17 00:00:00 2001 From: Slaven Peles Date: Mon, 20 Apr 2026 16:31:06 -0400 Subject: [PATCH 8/8] [skip ci] Remove redundant include. --- tests/unit/test_ineqjac_gpu.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/unit/test_ineqjac_gpu.cpp b/tests/unit/test_ineqjac_gpu.cpp index 6de8f12a..bc473c70 100644 --- a/tests/unit/test_ineqjac_gpu.cpp +++ b/tests/unit/test_ineqjac_gpu.cpp @@ -15,10 +15,6 @@ #include #endif -#ifdef EXAGO_ENABLE_GPU -#include -#endif - #include "model/power_bal_hiop/paramsrajahiop.h" #include "model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp"