diff --git a/src/opflow/CMakeLists.txt b/src/opflow/CMakeLists.txt index d0553657..863f45f4 100644 --- a/src/opflow/CMakeLists.txt +++ b/src/opflow/CMakeLists.txt @@ -20,6 +20,7 @@ if(EXAGO_ENABLE_RAJA) set(OPFLOW_FORM_SRC ${OPFLOW_FORM_SRC} model/power_bal_hiop/pbpolrajahiopsparse.cpp model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp + model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp ) endif() endif() @@ -39,8 +40,8 @@ set_source_files_properties(${OPFLOW_SRC} PROPERTIES LANGUAGE CXX) if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_CUDA) set_source_files_properties( model/power_bal_hiop/pbpolrajahiopkernels.cpp - model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp PROPERTIES LANGUAGE - CUDA + model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp + model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp PROPERTIES LANGUAGE CUDA ) endif() diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp b/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp index 27e2cfa0..fbe44e4e 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp @@ -27,6 +27,11 @@ int BUSParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(jacsp_idx); h_allocator_.deallocate(jacsq_idx); } + h_allocator_.deallocate(ispv); + h_allocator_.deallocate(gineqidx); + h_allocator_.deallocate(ineqjacsp_idx); + h_allocator_.deallocate(genoffset); + h_allocator_.deallocate(ngenONbus); #ifdef EXAGO_ENABLE_GPU d_allocator_.deallocate(isref_dev_); @@ -46,6 +51,11 @@ int BUSParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(jacsp_idx_dev_); d_allocator_.deallocate(jacsq_idx_dev_); } + d_allocator_.deallocate(ispv_dev_); + d_allocator_.deallocate(gineqidx_dev_); + d_allocator_.deallocate(ineqjacsp_idx_dev_); + d_allocator_.deallocate(genoffset_dev_); + d_allocator_.deallocate(ngenONbus_dev_); #endif return 0; @@ -80,6 +90,11 @@ int BUSParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(jacsq_idx_dev_, jacsq_idx); 
resmgr.copy(powerimbalance_penalty_dev_, powerimbalance_penalty); } + resmgr.copy(ispv_dev_, ispv); + resmgr.copy(gineqidx_dev_, gineqidx); + resmgr.copy(ineqjacsp_idx_dev_, ineqjacsp_idx); + resmgr.copy(genoffset_dev_, genoffset); + resmgr.copy(ngenONbus_dev_, ngenONbus); #else isref_dev_ = isref; isisolated_dev_ = isisolated; @@ -96,6 +111,11 @@ int BUSParamsRajaHiop::copy(OPFLOW opflow) { jacsp_idx_dev_ = jacsp_idx; jacsq_idx_dev_ = jacsq_idx; powerimbalance_penalty_dev_ = powerimbalance_penalty; + ispv_dev_ = ispv; + gineqidx_dev_ = gineqidx; + ineqjacsp_idx_dev_ = ineqjacsp_idx; + genoffset_dev_ = genoffset; + ngenONbus_dev_ = ngenONbus; #endif return 0; } @@ -132,18 +152,31 @@ int BUSParamsRajaHiop::allocate(OPFLOW opflow) { jacsp_idx = paramAlloc(h_allocator_, nbus); jacsq_idx = paramAlloc(h_allocator_, nbus); } + ispv = paramAlloc(h_allocator_, nbus); + gineqidx = paramAlloc(h_allocator_, nbus); + ineqjacsp_idx = paramAlloc(h_allocator_, nbus); + genoffset = paramAlloc(h_allocator_, nbus); + ngenONbus = paramAlloc(h_allocator_, nbus); /* Memzero arrays */ resmgr.memset(isref, 0, nbus * sizeof(int)); resmgr.memset(ispvpq, 0, nbus * sizeof(int)); resmgr.memset(isisolated, 0, nbus * sizeof(int)); + resmgr.memset(ispv, 0, nbus * sizeof(int)); + resmgr.memset(gineqidx, 0, nbus * sizeof(int)); + resmgr.memset(ineqjacsp_idx, 0, nbus * sizeof(int)); + resmgr.memset(genoffset, 0, nbus * sizeof(int)); + resmgr.memset(ngenONbus, 0, nbus * sizeof(int)); + int genoff = 0; for (int i = 0; i < nbus; i++) { bus = &ps->bus[i]; loc = bus->startxVloc; xidx[i] = opflow->idxn2sd_map[loc]; gidx[i] = bus->starteqloc; + genoffset[i] = genoff; + genoff += bus->ngenON; if (bus->ide == REF_BUS) isref[i] = 1; @@ -152,6 +185,12 @@ int BUSParamsRajaHiop::allocate(OPFLOW opflow) { else ispvpq[i] = 1; + if (bus->ide == PV_BUS) + ispv[i] = 1; + + ngenONbus[i] = bus->ngenON; + gineqidx[i] = bus->startineqloc; + if (opflow->genbusvoltagetype == FIXED_AT_SETPOINT) { if (bus->ide == REF_BUS || 
bus->ide == PV_BUS) { /* Hold voltage at reference and PV buses */ @@ -200,6 +239,11 @@ int BUSParamsRajaHiop::allocate(OPFLOW opflow) { jacsp_idx_dev_ = paramAlloc(d_allocator_, nbus); jacsq_idx_dev_ = paramAlloc(d_allocator_, nbus); } + ispv_dev_ = paramAlloc(d_allocator_, nbus); + gineqidx_dev_ = paramAlloc(d_allocator_, nbus); + ineqjacsp_idx_dev_ = paramAlloc(d_allocator_, nbus); + genoffset_dev_ = paramAlloc(d_allocator_, nbus); + ngenONbus_dev_ = paramAlloc(d_allocator_, nbus); #endif return 0; } @@ -231,6 +275,8 @@ int LINEParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(gineqidx_dev_, gineqidx); resmgr.copy(gbineqidx_dev_, gbineqidx); resmgr.copy(linelimidx_dev_, linelimidx); + resmgr.copy(ineqjacsp_idx_dev_, ineqjacsp_idx); + resmgr.copy(xslackidx_dev_, xslackidx); } #else Gff_dev_ = Gff; @@ -250,6 +296,8 @@ int LINEParamsRajaHiop::copy(OPFLOW opflow) { gineqidx_dev_ = gineqidx; gbineqidx_dev_ = gbineqidx; linelimidx_dev_ = linelimidx; + ineqjacsp_idx_dev_ = ineqjacsp_idx; + xslackidx_dev_ = xslackidx; } #endif return 0; @@ -277,6 +325,8 @@ int LINEParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(gineqidx); h_allocator_.deallocate(gbineqidx); h_allocator_.deallocate(linelimidx); + h_allocator_.deallocate(ineqjacsp_idx); + h_allocator_.deallocate(xslackidx); } #ifdef EXAGO_ENABLE_GPU @@ -301,6 +351,8 @@ int LINEParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(gineqidx_dev_); d_allocator_.deallocate(gbineqidx_dev_); d_allocator_.deallocate(linelimidx_dev_); + d_allocator_.deallocate(ineqjacsp_idx_dev_); + d_allocator_.deallocate(xslackidx_dev_); } #endif @@ -348,6 +400,8 @@ int LINEParamsRajaHiop::allocate(OPFLOW opflow) { linelimidx = paramAlloc(h_allocator_, nlinelim); gineqidx = paramAlloc(h_allocator_, nlinelim); gbineqidx = paramAlloc(h_allocator_, nlinelim); + ineqjacsp_idx = paramAlloc(h_allocator_, nlinelim); + xslackidx = paramAlloc(h_allocator_, nlinelim); } PetscInt j = 0; @@ -391,6 +445,9 @@ int 
LINEParamsRajaHiop::allocate(OPFLOW opflow) { gbineqidx[j] = opflow->nconeq + line->startineqloc; gineqidx[j] = line->startineqloc; linelimidx[j] = linei; + if (opflow->allow_lineflow_violation) { + xslackidx[j] = opflow->idxn2sd_map[line->startxslackloc]; + } j++; } @@ -420,6 +477,8 @@ int LINEParamsRajaHiop::allocate(OPFLOW opflow) { gineqidx_dev_ = paramAlloc(d_allocator_, nlinelim); gbineqidx_dev_ = paramAlloc(d_allocator_, nlinelim); linelimidx_dev_ = paramAlloc(d_allocator_, nlinelim); + ineqjacsp_idx_dev_ = paramAlloc(d_allocator_, nlinelim); + xslackidx_dev_ = paramAlloc(d_allocator_, nlinelim); } #endif return 0; @@ -558,7 +617,10 @@ int GENParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(qt); h_allocator_.deallocate(qb); h_allocator_.deallocate(isrenewable); + h_allocator_.deallocate(apf); + h_allocator_.deallocate(vs); h_allocator_.deallocate(xidx); + h_allocator_.deallocate(xpdevidx); h_allocator_.deallocate(gidxbus); h_allocator_.deallocate(eqjacspbus_idx); h_allocator_.deallocate(eqjacsqbus_idx); @@ -581,7 +643,10 @@ int GENParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(qt_dev_); d_allocator_.deallocate(qb_dev_); d_allocator_.deallocate(isrenewable_dev_); + d_allocator_.deallocate(apf_dev_); + d_allocator_.deallocate(vs_dev_); d_allocator_.deallocate(xidx_dev_); + d_allocator_.deallocate(xpdevidx_dev_); d_allocator_.deallocate(gidxbus_dev_); d_allocator_.deallocate(eqjacspbus_idx_dev_); d_allocator_.deallocate(eqjacsqbus_idx_dev_); @@ -615,8 +680,11 @@ int GENParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(qt_dev_, qt); resmgr.copy(qb_dev_, qb); resmgr.copy(isrenewable_dev_, isrenewable); + resmgr.copy(apf_dev_, apf); + resmgr.copy(vs_dev_, vs); resmgr.copy(xidx_dev_, xidx); + resmgr.copy(xpdevidx_dev_, xpdevidx); resmgr.copy(gidxbus_dev_, gidxbus); resmgr.copy(eqjacspbus_idx_dev_, eqjacspbus_idx); @@ -639,7 +707,10 @@ int GENParamsRajaHiop::copy(OPFLOW opflow) { qt_dev_ = qt; qb_dev_ = qb; isrenewable_dev_ = 
isrenewable; + apf_dev_ = apf; + vs_dev_ = vs; xidx_dev_ = xidx; + xpdevidx_dev_ = xpdevidx; gidxbus_dev_ = gidxbus; eqjacspbus_idx_dev_ = eqjacspbus_idx; eqjacsqbus_idx_dev_ = eqjacsqbus_idx; @@ -682,8 +753,11 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { qt = paramAlloc(h_allocator_, ngenON); qb = paramAlloc(h_allocator_, ngenON); isrenewable = paramAlloc(h_allocator_, ngenON); + apf = paramAlloc(h_allocator_, ngenON); + vs = paramAlloc(h_allocator_, ngenON); xidx = paramAlloc(h_allocator_, ngenON); + xpdevidx = paramAlloc(h_allocator_, ngenON); gidxbus = paramAlloc(h_allocator_, ngenON); eqjacspbus_idx = paramAlloc(h_allocator_, ngenON); @@ -720,11 +794,16 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { qt[geni] = gen->qt; qb[geni] = gen->qb; isrenewable[geni] = (int)gen->isrenewable; + apf[geni] = gen->apf; + vs[geni] = gen->vs; if (opflow->has_gensetpoint) { pgs[geni] = gen->pgs; } xidx[geni] = opflow->idxn2sd_map[loc]; + xpdevidx[geni] = (opflow->has_gensetpoint && !gen->isrenewable) + ? 
opflow->idxn2sd_map[gen->startxpdevloc] + : -1; gidxbus[geni] = gloc; if (opflow->has_gensetpoint) { geqidxgen[geni] = gen->starteqloc; @@ -748,8 +827,11 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { qt_dev_ = paramAlloc(d_allocator_, ngenON); qb_dev_ = paramAlloc(d_allocator_, ngenON); isrenewable_dev_ = paramAlloc(d_allocator_, ngenON); + apf_dev_ = paramAlloc(d_allocator_, ngenON); + vs_dev_ = paramAlloc(d_allocator_, ngenON); xidx_dev_ = paramAlloc(d_allocator_, ngenON); + xpdevidx_dev_ = paramAlloc(d_allocator_, ngenON); gidxbus_dev_ = paramAlloc(d_allocator_, ngenON); eqjacspbus_idx_dev_ = paramAlloc(d_allocator_, ngenON); diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.h b/src/opflow/model/power_bal_hiop/paramsrajahiop.h index 91f1fb9f..d274f990 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.h +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.h @@ -28,7 +28,12 @@ struct BUSParamsRajaHiop { vector */ int *jacsp_idx; /* Location number in the sparse Jacobian for Pimb */ int *jacsq_idx; /* Location number in the sparse Jacobian for Qimb */ - int *hesssp_idx; /* Location number in the Hessian */ + int *hesssp_idx; /* KS: Hessian indices */ + int *ispv; /* KS: ispv[i] = 1 if bus is PV bus */ + int *gineqidx; /* KS: starting position of bus ineq constraints */ + int *ineqjacsp_idx; /* KS: index in flat sparse ineq Jacobian array */ + int *genoffset; /* KS: Offset into flattened gen array for this bus */ + int *ngenONbus; /* KS: Number of ON generators on this bus */ // Device data int *isref_dev_; /* isref[i] = 1 if bus is reference bus */ @@ -46,9 +51,14 @@ struct BUSParamsRajaHiop { X vector */ int *gidx_dev_; /* starting locations for bus balance equations in constraint vector */ - int *jacsp_idx_dev_; /* Location number in the sparse Jacobian for Pimb */ - int *jacsq_idx_dev_; /* Location number in the sparse Jacobian for Qimb */ - int *hesssp_idx_dev_; /* Location number in the Hessian */ + int *jacsp_idx_dev_; /* Location 
number in the sparse Jacobian for Pimb */ + int *jacsq_idx_dev_; /* Location number in the sparse Jacobian for Qimb */ + int *hesssp_idx_dev_; /* Location number in the Hessian */ + int *ispv_dev_; /* KS: dev counterpart of ispv */ + int *gineqidx_dev_; /* KS: dev counterpart of gineqidx */ + int *ineqjacsp_idx_dev_; /* KS: device counterpart of ineqjacsp_idx_ */ + int *genoffset_dev_; /* KS: dev counterpart of genoffset */ + int *ngenONbus_dev_; /* KS: dev counterpart of ngenONbus */ int allocate(OPFLOW); int destroy(OPFLOW); @@ -72,9 +82,12 @@ struct GENParamsRajaHiop { double *qt; /* min. reactive power gen. limits */ double *qb; /* max. reactive power gen. limits */ double *pgs; /* real power output setpoint */ + double *apf; /* generator AGC participation factor */ + double *vs; /* voltage setpoint */ int *isrenewable; /* Is renewable generator? */ - int *xidx; /* starting locations in X vector */ + int *xidx; /* starting locations in X vector */ + int *xpdevidx; /* KS: starting locations of deviation variables in X vector */ int * gidxbus; /* starting locations in constraint vector for bus constraints */ int *geqidxgen; /* starting locations in equality constraint vector for gen @@ -104,9 +117,12 @@ struct GENParamsRajaHiop { double *qt_dev_; /* min. reactive power gen. limits */ double *qb_dev_; /* max. reactive power gen. limits */ double *pgs_dev_; /* real power output setpoint */ + double *apf_dev_; /* KS: device counterpart of apf */ + double *vs_dev_; /* KS: device counterpart of vs */ int *isrenewable_dev_; /* Is renewable generator? 
*/ int *xidx_dev_; /* starting locations in X vector */ + int *xpdevidx_dev_; /* KS: device counterpart of xpdevidx*/ int *gidxbus_dev_; /* starting locations in constraint vector for bus constraints */ int *geqidxgen_dev_; /* starting locations in equality constraint vector for @@ -143,7 +159,7 @@ struct LOADParamsRajaHiop { double *pl; /* active power demand */ double *ql; /* reactive power demand */ double *loadloss_penalty; /* Penalty for load loss */ - int *xidx; /* starting location in X vector */ + int *xidx; /* KS: starting location in X vector */ int *gidx; /* starting location in constraint vector */ /* The following members are only used with HIOP */ @@ -192,9 +208,11 @@ struct LINEParamsRajaHiop { contribution in constraints vector */ int *gineqidx; /* Starting location to insert contribution to inequality constraint */ - int *gbineqidx; /* Starting location to insert contribution to inequality - constraint bound */ - int *linelimidx; /* Indices for subset of lines that have finite limits */ + int *gbineqidx; /* Starting location to insert contribution to inequality + constraint bound */ + int *linelimidx; /* Indices for subset of lines that have finite limits */ + int *ineqjacsp_idx; /* KS: Position in flat sparse ineq Jacobian array */ + int *xslackidx; /* Starting location of slack variables in X vector */ // Device data double *Gff_dev_; /* From side self conductance */ @@ -218,6 +236,8 @@ constraint bound */ int * linelimidx_dev_; /* Indices for subset of lines that have finite limits */ + int *ineqjacsp_idx_dev_; /* KS: Position in flat sparse ineq Jacobian array */ + int *xslackidx_dev_; /* Starting location of slack variables in X vector */ int allocate(OPFLOW); int destroy(OPFLOW); @@ -248,6 +268,9 @@ struct PbpolModelRajaHiop : public _p_FormPBPOLRAJAHIOP { LINEParamsRajaHiop lineparams; BUSParamsRajaHiop busparams; + int agc_xidx; /* KS: X-vector index for the AGC delta-P variable + (ps->startxloc) */ + // Arrays 
to store Jacobian and Hessian indices and entries on CPU (used with // GPU sparse model) int *i_jaceq, diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp index 9624445b..46f4cc11 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp @@ -221,151 +221,88 @@ PetscErrorCode OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE(OPFLOW opflow) { LOADParamsRajaHiop *loadparams = &pbpolrajahiopsparse->loadparams; LINEParamsRajaHiop *lineparams = &pbpolrajahiopsparse->lineparams; - /* Need to compute the number of nonzeros in equality, inequality constraint - * Jacobians and Hessian */ - int nnz_eqjac = 0, nnz_ineqjac = 0, nnz_hess = 0; - - // Find nonzero entries in equality constraint Jacobian by row. Using - // OPFLOWComputeEqualityConstraintJacobian_PBPOL() as a guide. - - PS ps = (PS)opflow->ps; - - for (int ibus = 0; ibus < ps->nbus; ++ibus) { - - PSBUS bus = &(ps->bus[ibus]); - - // Nonzero entries used by each *bus* starts here - - // no matter what, each bus uses 2 rows and 2 columns - // row 1 = real, row2 = reactive - nnz_eqjac += 2; - nnz_eqjac += 2; + PS ps = opflow->ps; + PSBUS bus; + PSGEN gen; + PSLINE line; + PetscInt i, k; - if (bus->ide == ISOLATED_BUS) { - continue; - } + /* KS: Store the AGC variable index (scalar) */ + if (opflow->use_agc) { + pbpolrajahiopsparse->agc_xidx = opflow->idxn2sd_map[ps->startxloc]; + } else { + pbpolrajahiopsparse->agc_xidx = -1; + } - if (opflow->include_powerimbalance_variables) { - // 2 more entries on both real and reactive - nnz_eqjac += 4; - } + /* KS: Compute the number of nonzeros in equality, inequality constraint + * Jacobians and Hessian. 
Equality and Hessian counts are still obtained + * from PETSc via get_sparse_blocks_info; inequality count is computed + * axplicitly so we can skip PETSc */ + int nnz_eqjacsp = 0, nnz_ineqjacsp = 0, nnz_hesssp = 0; - if (opflow->has_gensetpoint) { - for (int bgen = 0; bgen < bus->ngen; ++bgen) { - PSGEN gen; - ierr = PSBUSGetGen(bus, bgen, &gen); - CHKERRQ(ierr); - - if (!gen->status || gen->isrenewable) - continue; + /* + * KS: Count inequality Jacobian non-zeros. The traversal order must match + * the startineqloc assignment in OPFLOWModelSetUp_PBPOL: for each bus + * (bus ineq, then gen ineq), then for each line. + */ + int geni = 0, gi; + for (i = 0; i < ps->nbus; i++) { + bus = &ps->bus[i]; - // each generator uses 2 rows, 3 columns real, 1 column reactive - nnz_eqjac += 4; + /* Bus voltage-Q-bounds constraints (FIXED_WITHIN_QBOUNDS) */ + if (opflow->genbusvoltagetype == FIXED_WITHIN_QBOUNDS) { + if (bus->ide == PV_BUS || bus->ide == REF_BUS) { + busparams->ineqjacsp_idx[i] = nnz_ineqjacsp; + /* 2 rows, each with ngenON + 1 entries (one per gen Qg + one for V) */ + nnz_ineqjacsp += 2 * (bus->ngenON + 1); } } - } - - // Go through the lines - for (int iline = 0; iline <= ps->nline; ++iline) { - PSLINE line = &(ps->line[iline]); - - if (!line->status) - continue; - - // each line adds 4 (off-diagonal) entries for the to bus and 4 - // entries for the from bus. Each line also modifies 4 existing - // to and from bus entries. 
- nnz_eqjac += 4; - nnz_eqjac += 4; - } - // if there are lines, non-zeros were over counted - if (ps->nline > 0) { - nnz_eqjac -= 8; - } - - if (opflow->has_gensetpoint) { - for (int ibus = 0; ibus < ps->nbus; ++ibus) { - PSBUS bus = &(ps->bus[ibus]); - for (int bgen = 0; bgen < bus->ngen; ++bgen) { - PSGEN gen; - ierr = PSBUSGetGen(bus, bgen, &gen); + /* KS: Generator set-point constraints */ + gi = 0; + if (opflow->has_gensetpoint) { + for (k = 0; k < bus->ngen; k++) { + ierr = PSBUSGetGen(bus, k, &gen); CHKERRQ(ierr); - if (!gen->status) continue; - - nnz_ineqjac += 6; - } - } - } - - if (opflow->genbusvoltagetype == FIXED_WITHIN_QBOUNDS) { - for (int ibus = 0; ibus < ps->nbus; ++ibus) { - PSBUS bus = &(ps->bus[ibus]); - if (bus->ide == PV_BUS || bus->ide == REF_BUS) { - nnz_ineqjac += 2; + if (!gen->isrenewable) { + genparams->ineqjacspgen_idx[geni + gi] = nnz_ineqjacsp; + if (opflow->use_agc) { + nnz_ineqjacsp += 6; /* 2 rows x 3 entries (Pg, delPg, delP) */ + } + } + gi++; } } - } - - for (int iline = 0; iline < opflow->nlinesmon; ++iline) { - nnz_ineqjac += 8; - } - - for (int ibus = 0; ibus < ps->nbus; ++ibus) { - // reserve 2 real and 2 reactive entries for each bus - // 3 upper triangular - nnz_hess += 3; - - if (opflow->include_powerimbalance_variables) { - nnz_hess += 2; - } - } - - for (int i = 0; i < ps->ngen; ++i) { - PSGEN gen = &(ps->gen[i]); - - if (!gen->status) - continue; - - nnz_hess += 2; - - if (opflow->has_gensetpoint) { - if (gen->isrenewable) - continue; - // later ... 
- // if (opflow->use_agc) { - // nnz_hess += 5; - // } - } - if (opflow->genbusvoltagetype == FIXED_WITHIN_QBOUNDS) { - nnz_hess += 2; - } + geni += bus->ngenON; } - for (int iline = 0; iline < ps->nline; ++iline) { - PSLINE line = &(ps->line[iline]); - + /* Line flow constraints */ + int linej = 0; + for (i = 0; i < ps->nline; i++) { + line = &ps->line[i]; if (!line->status) continue; + if (line->isdcline) + continue; - // 3 diagonal entries for on the from-bus rows (already defined) - // 3 diagonal entries for on the to-bus rows (already defined) - // 4 off-diagonal entries in upper part - nnz_hess += 4; - } - - if (opflow->include_loadloss_variables) { - for (int iload = 0; iload < ps->nload; ++iload) { - nnz_hess += 2; + if (linej < opflow->nlinesmon && opflow->linesmon[linej] == i) { + lineparams->ineqjacsp_idx[linej] = nnz_ineqjacsp; + /* 2 rows x 4 entries (thetaf, Vmf, thetat, Vmt) */ + int entries_per_line = 8; + if (opflow->allow_lineflow_violation) { + entries_per_line += 2; /* 1 slack entry per row */ + } + nnz_ineqjacsp += entries_per_line; + linej++; } } - opflow->nnz_eqjacsp = nnz_eqjac; - opflow->nnz_ineqjacsp = nnz_ineqjac; - opflow->nnz_hesssp = nnz_hess; + opflow->nnz_eqjacsp = nnz_eqjacsp; + opflow->nnz_ineqjacsp = nnz_ineqjacsp; + opflow->nnz_hesssp = nnz_hesssp; ierr = busparams->copy(opflow); ierr = genparams->copy(opflow); diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp new file mode 100644 index 00000000..05705a35 --- /dev/null +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp @@ -0,0 +1,244 @@ + +#include + +#if defined(EXAGO_ENABLE_RAJA) +#if defined(EXAGO_ENABLE_HIOP_SPARSE) + +#include +#include + +#include "pbpolrajahiopsparse_gpu.hpp" + +void ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, + const double *x_dev, + double *jacd_dev) { + PbpolModelRajaHiop *pbpolrajahiopsparse = + reinterpret_cast(opflow->model); + 
GENParamsRajaHiop *genparams = &pbpolrajahiopsparse->genparams; + BUSParamsRajaHiop *busparams = &pbpolrajahiopsparse->busparams; + + /* + * Generator set-point inequality Jacobian (has_gensetpoint && use_agc). + * + * PETSc reference (pbpol.cpp:1191-1232): + * Row gloc: [apf*delP - delPg, -(Pg - pt), apf*(Pg - pt)] + * Row gloc+1: [apf*delP - delPg, (pb - Pg), -apf*(pb - Pg)] + * + * Flat array layout per generator (6 entries): + * [0..2] = row 0 values (Pg, delPg, delP columns) + * [3..5] = row 1 values (Pg, delPg, delP columns) + */ + if (opflow->has_gensetpoint && opflow->use_agc) { + int *g_ineqjacsp_idx = genparams->ineqjacspgen_idx_dev_; + int *g_xidx = genparams->xidx_dev_; + int *g_xpdevidx = genparams->xpdevidx_dev_; + int *g_isrenewable = genparams->isrenewable_dev_; + double *g_apf = genparams->apf_dev_; + double *g_pt = genparams->pt_dev_; + double *g_pb = genparams->pb_dev_; + int agc_xidx = pbpolrajahiopsparse->agc_xidx; + + RAJA::forall( + RAJA::RangeSegment(0, genparams->ngenON), + RAJA_LAMBDA(RAJA::Index_type i) { + if (g_isrenewable[i]) + return; + + int base = g_ineqjacsp_idx[i]; + double Pg = x_dev[g_xidx[i]]; + double delPg = x_dev[g_xpdevidx[i]]; + double delP = x_dev[agc_xidx]; + double apf = g_apf[i]; + double pt = g_pt[i]; + double pb = g_pb[i]; + + double apf_delP_minus_delPg = apf * delP - delPg; + + /* Row 0: d/d{Pg, delPg, delP} of (apf*delP - delPg)*(Pg - pt) */ + jacd_dev[base + 0] = apf_delP_minus_delPg; + jacd_dev[base + 1] = -(Pg - pt); + jacd_dev[base + 2] = apf * (Pg - pt); + + /* Row 1: d/d{Pg, delPg, delP} of (delPg - apf*delP)*(pb - Pg) */ + jacd_dev[base + 3] = apf_delP_minus_delPg; + jacd_dev[base + 4] = pb - Pg; + jacd_dev[base + 5] = -apf * (pb - Pg); + }); + } + + /* + * FIXED_WITHIN_QBOUNDS voltage constraint Jacobian. 
+ * + * PETSc reference (pbpol.cpp:1235-1278): + * For each PV/REF bus with ngenON generators: + * Row 0 (gloc): [Vset_0-V, Vset_1-V, ..., Qmax-Q] + * Row 1 (gloc+1): [Vset_0-V, Vset_1-V, ..., Qmin-Q] + * + * Flat array layout per bus (2 * (ngenON + 1) entries): + * Row 0: ngenON Qg derivative entries + 1 V derivative entry + * Row 1: ngenON Qg derivative entries + 1 V derivative entry + * + * This kernel parallelizes over buses. The inner loop over generators + * is sequential within each thread (variable-length per bus). + */ + if (opflow->genbusvoltagetype == FIXED_WITHIN_QBOUNDS) { + int *b_ispv = busparams->ispv_dev_; + int *b_isref = busparams->isref_dev_; + int *b_xidx = busparams->xidx_dev_; + int *b_ineqjacsp_idx = busparams->ineqjacsp_idx_dev_; + int *b_genoffset = busparams->genoffset_dev_; + int *b_ngenONbus = busparams->ngenONbus_dev_; + int *g_xidx = genparams->xidx_dev_; + double *g_qt = genparams->qt_dev_; + double *g_qb = genparams->qb_dev_; + double *g_vs = genparams->vs_dev_; + + RAJA::forall( + RAJA::RangeSegment(0, busparams->nbus), + RAJA_LAMBDA(RAJA::Index_type i) { + if (!b_ispv[i] && !b_isref[i]) + return; + + int base = b_ineqjacsp_idx[i]; + int ngen = b_ngenONbus[i]; + int goff = b_genoffset[i]; + double V = x_dev[b_xidx[i] + 1]; + + double Q = 0.0, Qmax = 0.0, Qmin = 0.0; + double Vset = 0.0; + + /* Row 0 and Row 1: Qg derivative entries */ + for (int k = 0; k < ngen; k++) { + int gidx = goff + k; + double Qg = x_dev[g_xidx[gidx] + 1]; + Q += Qg; + Qmax += g_qt[gidx]; + Qmin += g_qb[gidx]; + Vset = g_vs[gidx]; + + double dQg = Vset - V; + /* Row 0 entry for this gen's Qg */ + jacd_dev[base + k] = dQg; + /* Row 1 entry for this gen's Qg */ + jacd_dev[base + ngen + 1 + k] = dQg; + } + + /* Row 0: V derivative entry (last in this row) */ + jacd_dev[base + ngen] = Qmax - Q; + /* Row 1: V derivative entry (last in this row) */ + jacd_dev[base + ngen + 1 + ngen] = Qmin - Q; + }); + } + + /* + * Line flow constraint Jacobian. 
+ * + * Computes dSf2/d{thetaf,Vmf,thetat,Vmt} and dSt2/d{thetaf,Vmf,thetat,Vmt} + * for each monitored line, plus optional slack variable entries (-1.0). + * + * Flat array layout per line (8 entries without slack, 10 with): + * Row 0 (Sf2): [dSf2_dthetaf, dSf2_dVmf, dSf2_dthetat, dSf2_dVmt, + * (-1.0 if slack)] + * Row 1 (St2): [dSt2_dthetaf, dSt2_dVmf, dSt2_dthetat, dSt2_dVmt, + * (-1.0 if slack)] + */ + LINEParamsRajaHiop *lineparams = &pbpolrajahiopsparse->lineparams; + + if (lineparams->nlinelim) { + double *Gff_arr = lineparams->Gff_dev_; + double *Bff_arr = lineparams->Bff_dev_; + double *Gft_arr = lineparams->Gft_dev_; + double *Bft_arr = lineparams->Bft_dev_; + double *Gtf_arr = lineparams->Gtf_dev_; + double *Btf_arr = lineparams->Btf_dev_; + double *Gtt_arr = lineparams->Gtt_dev_; + double *Btt_arr = lineparams->Btt_dev_; + int *linelimidx = lineparams->linelimidx_dev_; + int *xidxf = lineparams->xidxf_dev_; + int *xidxt = lineparams->xidxt_dev_; + int *ineqjacsp_idx = lineparams->ineqjacsp_idx_dev_; + int has_slack = (int)opflow->allow_lineflow_violation; + int row_stride = 4 + has_slack; + + RAJA::forall( + RAJA::RangeSegment(0, lineparams->nlinelim), + RAJA_LAMBDA(RAJA::Index_type i) { + int j = linelimidx[i]; + int base = ineqjacsp_idx[i]; + + double thetaf = x_dev[xidxf[j]]; + double Vmf = x_dev[xidxf[j] + 1]; + double thetat = x_dev[xidxt[j]]; + double Vmt = x_dev[xidxt[j] + 1]; + double thetaft = thetaf - thetat; + double thetatf = thetat - thetaf; + + double Gff = Gff_arr[j], Bff = Bff_arr[j]; + double Gft = Gft_arr[j], Bft = Bft_arr[j]; + double Gtf = Gtf_arr[j], Btf = Btf_arr[j]; + double Gtt = Gtt_arr[j], Btt = Btt_arr[j]; + + double sin_ft = sin(thetaft), cos_ft = cos(thetaft); + double sin_tf = sin(thetatf), cos_tf = cos(thetatf); + + double Pf = + Gff * Vmf * Vmf + Vmf * Vmt * (Gft * cos_ft + Bft * sin_ft); + double Qf = + -Bff * Vmf * Vmf + Vmf * Vmt * (-Bft * cos_ft + Gft * sin_ft); + double Pt = + Gtt * Vmt * Vmt + Vmt * Vmf * (Gtf * 
cos_tf + Btf * sin_tf); + double Qt = + -Btt * Vmt * Vmt + Vmt * Vmf * (-Btf * cos_tf + Gtf * sin_tf); + + double dSf2_dPf = 2 * Pf, dSf2_dQf = 2 * Qf; + double dSt2_dPt = 2 * Pt, dSt2_dQt = 2 * Qt; + + double dPf_dthetaf = Vmf * Vmt * (-Gft * sin_ft + Bft * cos_ft); + double dPf_dVmf = 2 * Gff * Vmf + Vmt * (Gft * cos_ft + Bft * sin_ft); + double dPf_dthetat = Vmf * Vmt * (Gft * sin_ft - Bft * cos_ft); + double dPf_dVmt = Vmf * (Gft * cos_ft + Bft * sin_ft); + + double dQf_dthetaf = Vmf * Vmt * (Bft * sin_ft + Gft * cos_ft); + double dQf_dVmf = + -2 * Bff * Vmf + Vmt * (-Bft * cos_ft + Gft * sin_ft); + double dQf_dthetat = Vmf * Vmt * (-Bft * sin_ft - Gft * cos_ft); + double dQf_dVmt = Vmf * (-Bft * cos_ft + Gft * sin_ft); + + double dPt_dthetat = Vmt * Vmf * (-Gtf * sin_tf + Btf * cos_tf); + double dPt_dVmt = 2 * Gtt * Vmt + Vmf * (Gtf * cos_tf + Btf * sin_tf); + double dPt_dthetaf = Vmt * Vmf * (Gtf * sin_tf - Btf * cos_tf); + double dPt_dVmf = Vmt * (Gtf * cos_tf + Btf * sin_tf); + + double dQt_dthetat = Vmt * Vmf * (Btf * sin_tf + Gtf * cos_tf); + double dQt_dVmt = + -2 * Btt * Vmt + Vmf * (-Btf * cos_tf + Gtf * sin_tf); + double dQt_dthetaf = Vmt * Vmf * (-Btf * sin_tf - Gtf * cos_tf); + double dQt_dVmf = Vmt * (-Btf * cos_tf + Gtf * sin_tf); + + /* Row 0 (Sf2): derivatives w.r.t. thetaf, Vmf, thetat, Vmt */ + jacd_dev[base + 0] = dSf2_dPf * dPf_dthetaf + dSf2_dQf * dQf_dthetaf; + jacd_dev[base + 1] = dSf2_dPf * dPf_dVmf + dSf2_dQf * dQf_dVmf; + jacd_dev[base + 2] = dSf2_dPf * dPf_dthetat + dSf2_dQf * dQf_dthetat; + jacd_dev[base + 3] = dSf2_dPf * dPf_dVmt + dSf2_dQf * dQf_dVmt; + + /* Row 1 (St2): derivatives w.r.t. 
thetaf, Vmf, thetat, Vmt */ + jacd_dev[base + row_stride + 0] = + dSt2_dPt * dPt_dthetaf + dSt2_dQt * dQt_dthetaf; + jacd_dev[base + row_stride + 1] = + dSt2_dPt * dPt_dVmf + dSt2_dQt * dQt_dVmf; + jacd_dev[base + row_stride + 2] = + dSt2_dPt * dPt_dthetat + dSt2_dQt * dQt_dthetat; + jacd_dev[base + row_stride + 3] = + dSt2_dPt * dPt_dVmt + dSt2_dQt * dQt_dVmt; + + /* Slack variable entries (Step 5) */ + if (has_slack) { + jacd_dev[base + 4] = -1.0; + jacd_dev[base + row_stride + 4] = -1.0; + } + }); + } +} + +#endif // EXAGO_ENABLE_HIOP_SPARSE +#endif // EXAGO_ENABLE_RAJA diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp new file mode 100644 index 00000000..938686a7 --- /dev/null +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp @@ -0,0 +1,30 @@ +#include + +#if defined(EXAGO_ENABLE_RAJA) +#if defined(EXAGO_ENABLE_HIOP_SPARSE) + +#pragma once + +#include +#include "pbpolrajahiopsparse.hpp" + +/** + * GPU-only (PETSc-free) computation of inequality constraint Jacobian values. + * + * Replaces the PETSc-based path that calls + * opflow->modelops.computeinequalityconstraintjacobian followed by + * MatGetRow extraction. Writes directly into device memory using RAJA + * kernels, without PETSc Mat/Vec operations; no H2D, D2H copies back and forth. + * + * @param opflow The OPFLOW problem context + * @param x_dev Device array of variable values + * @param jacd_dev Device output array for inequality Jacobian values + * (points to the ineq portion of the sparse Jacobian, + * i.e. 
MJacS_dev + nnz_eqjacsp) + */ +void ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, + const double *x_dev, + double *jacd_dev); + +#endif // EXAGO_ENABLE_HIOP_SPARSE +#endif // EXAGO_ENABLE_RAJA diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp index e18ba20c..2fe94998 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp @@ -14,6 +14,7 @@ #include #include "pbpolrajahiopsparsekernels.hpp" #include "pbpolrajahiopsparse.hpp" +#include "pbpolrajahiopsparse_gpu.hpp" PetscErrorCode OPFLOWSetInitialGuessArray_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, double *x0_dev) { @@ -547,40 +548,11 @@ OPFLOWComputeSparseInequalityConstraintJacobian_PBPOLRAJAHIOPSPARSE( ierr = PetscLogEventBegin(opflow->ineqconsjaclogger, 0, 0, 0, 0); CHKERRQ(ierr); - ierr = VecGetArray(opflow->X, &x); - CHKERRQ(ierr); - - // Copy from device to host - umpire::Allocator h_allocator_ = resmgr.getAllocator("HOST"); - registerWith(x, opflow->nx, resmgr, h_allocator_); - resmgr.copy((double *)x, (double *)x_dev); - - ierr = VecRestoreArray(opflow->X, &x); - CHKERRQ(ierr); - - /* Compute inequality constraint jacobian */ - ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( - opflow, opflow->X, opflow->Jac_Gi); - CHKERRQ(ierr); - - ierr = MatGetSize(opflow->Jac_Gi, &nrow, &ncol); - CHKERRQ(ierr); - - values = pbpolrajahiopsparse->val_jacineq; - /* Copy over values */ - for (i = 0; i < nrow; i++) { - ierr = MatGetRow(opflow->Jac_Gi, i, &nvals, &cols, &vals); - CHKERRQ(ierr); - for (j = 0; j < nvals; j++) { - values[j] = vals[j]; - } - values += nvals; - ierr = MatRestoreRow(opflow->Jac_Gi, i, &nvals, &cols, &vals); - CHKERRQ(ierr); - } - // Copy over val_jacineq to device - resmgr.copy(MJacS_dev + opflow->nnz_eqjacsp, - pbpolrajahiopsparse->val_jacineq); + /* KS: Compute inequality constraint Jacobian directly 
on device. + No H2D, D2H copies: x_dev is already on device, output goes + straight into the ineq portion of MJacS_dev. */ + ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE( + opflow, x_dev, MJacS_dev + opflow->nnz_eqjacsp); ierr = PetscLogEventEnd(opflow->ineqconsjaclogger, 0, 0, 0, 0); CHKERRQ(ierr); diff --git a/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp b/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp index d7a9539d..d768e738 100644 --- a/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp +++ b/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp @@ -69,20 +69,10 @@ bool OPFLOWHIOPSPARSEGPUInterface::get_sparse_blocks_info( nnz_sparse_Jaceq = opflow->nnz_eqjacsp = info_eq.nz_used; - nnz_sparse_Jacineq = 0; - if (opflow->Nconineq) { - ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( - opflow, opflow->X, opflow->Jac_Gi); - CHKERRQ(ierr); - ierr = - MatSetOption(opflow->Jac_Gi, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE); - CHKERRQ(ierr); - - ierr = MatGetInfo(opflow->Jac_Gi, MAT_LOCAL, &info_ineq); - CHKERRQ(ierr); - - nnz_sparse_Jacineq = opflow->nnz_ineqjacsp = info_ineq.nz_used; - } + /* Use the pre-computed nnz_ineqjacsp from the model setup phase: this + avoids assembling a PETSc Mat here solely to count the inequality + Jacobian non-zeros. 
*/ + nnz_sparse_Jacineq = opflow->nnz_ineqjacsp; /* Compute non-zeros for Hessian */ ierr = (*opflow->modelops.computehessian)(opflow, opflow->X, opflow->Lambdae, diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index ace550c4..13c6b989 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -13,6 +13,17 @@ add_executable(test_pflow test_pflow.cpp utils/test_acopf_utils.cpp) target_link_libraries(test_pflow ExaGO::PFLOW) target_include_directories(test_pflow PRIVATE ./utils) +if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIOP_SPARSE) + if(EXAGO_ENABLE_CUDA) + set_source_files_properties(test_ineqjac_gpu.cpp PROPERTIES LANGUAGE CUDA) + endif() + add_executable(test_ineqjac_gpu test_ineqjac_gpu.cpp) + target_link_libraries(test_ineqjac_gpu ExaGO::OPFLOW) + target_include_directories( + test_ineqjac_gpu PRIVATE ${CMAKE_SOURCE_DIR}/src/opflow + ) +endif() + add_executable(test_error_handler test_error_handler.cpp) target_link_libraries(test_error_handler ExaGO::UTILS) @@ -49,6 +60,22 @@ if(EXAGO_RUN_TESTS) ${network_files} ) + if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIOP_SPARSE) + exago_add_test( + NAME + "UNIT_TEST_8_INEQJAC_GPU" + DEPENDS + HIOP + COMMAND + ${RUNCMD} + $ + -opflow_genbusvoltage + VARIABLE_WITHIN_BOUNDS + NETFILES + ${network_files} + ) + endif() + exago_add_test( NAME UNIT_TESTS_UTILS COMMAND $ ) diff --git a/tests/unit/test_ineqjac_gpu.cpp b/tests/unit/test_ineqjac_gpu.cpp new file mode 100644 index 00000000..6e537f63 --- /dev/null +++ b/tests/unit/test_ineqjac_gpu.cpp @@ -0,0 +1,411 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#if defined(EXAGO_ENABLE_RAJA) +#include +#include +#include +#include +#endif + +#ifdef EXAGO_ENABLE_GPU +#include +#endif + +#include "model/power_bal_hiop/paramsrajahiop.h" +#include "model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp" + +static const double TOL = 1e-8; + +static int compare_arrays(const double *ref, const double *gpu, int n, + const char 
*label) { + int fail = 0; + for (int i = 0; i < n; i++) { + if (std::abs(ref[i] - gpu[i]) / (1.0 + std::abs(ref[i])) > TOL) { + std::cout << " MISMATCH " << label << "[" << i << "]: PETSc=" << ref[i] + << " GPU=" << gpu[i] << " diff=" << std::abs(ref[i] - gpu[i]) + << std::endl; + fail++; + } + } + return fail; +} + +/** + * Test 8: Validate GPU inequality constraint Jacobian values against PETSc. + * + * First solves with IPOPT/PBPOL to obtain a realistic solution, then + * sets up the PBPOLRAJAHIOPSPARSE model and evaluates the inequality + * Jacobian at the converged solution using both the PETSc reference + * path and the GPU RAJA kernel. Compares element-by-element. + */ +int main(int argc, char **argv) { + PetscErrorCode ierr; + PetscBool flg; + char file_c_str[PETSC_MAX_PATH_LEN]; + std::string file; + char appname[] = "opflow"; + MPI_Comm comm = MPI_COMM_WORLD; + char help[] = "Test 8: GPU ineq Jacobian validation\n"; + int fail = 0; + + ierr = ExaGOInitialize(comm, &argc, &argv, appname, help); + if (ierr) { + fprintf(stderr, "Could not initialize ExaGO.\n"); + return ierr; + } + + ierr = PetscOptionsGetString(NULL, NULL, "-netfile", file_c_str, + PETSC_MAX_PATH_LEN, &flg); + CHKERRQ(ierr); + if (!flg) + file = "../datafiles/case9/case9mod.m"; + else + file.assign(file_c_str); + + std::cout << "=== Test 8: GPU Inequality Jacobian Validation ===" + << std::endl; + std::cout << "Network file: " << file << std::endl; + + /* ------------------------------------------------------------------ + * Step 1: Solve with IPOPT/PBPOL to get a realistic solution + * ------------------------------------------------------------------ */ + OPFLOW opflow_ref; + Vec Xsol; + + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow_ref, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow_ref, OPFLOWMODEL_PBPOL); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow_ref, OPFLOWSOLVER_IPOPT); + CHKERRQ(ierr); + ierr = 
OPFLOWSolve(opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWGetSolution(opflow_ref, &Xsol); + CHKERRQ(ierr); + + std::cout << "IPOPT solve complete." << std::endl; + + /* Get the solution array (natural ordering) */ + PetscInt nx_ref; + ierr = VecGetSize(Xsol, &nx_ref); + CHKERRQ(ierr); + double *xsol_nat; + ierr = VecGetArray(Xsol, &xsol_nat); + CHKERRQ(ierr); + + /* ------------------------------------------------------------------ + * Step 2: Create the PBPOLRAJAHIOPSPARSE model and set up + * ------------------------------------------------------------------ */ + OPFLOW opflow; + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow, OPFLOWMODEL_PBPOLRAJAHIOPSPARSE); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow, OPFLOWSOLVER_HIOPSPARSEGPU); + CHKERRQ(ierr); + ierr = OPFLOWSetUp(opflow); + CHKERRQ(ierr); + + int nx, nconeq, nconineq; + ierr = OPFLOWGetSizes(opflow, &nx, &nconeq, &nconineq); + CHKERRQ(ierr); + + std::cout << "nx=" << nx << " nconeq=" << nconeq << " nconineq=" << nconineq + << " nnz_ineqjacsp=" << opflow->nnz_ineqjacsp << std::endl; + + if (!nconineq) { + std::cout << "No inequality constraints -- nothing to test. PASS." 
+ << std::endl; + ierr = VecRestoreArray(Xsol, &xsol_nat); + CHKERRQ(ierr); + ierr = OPFLOWDestroy(&opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWDestroy(&opflow); + CHKERRQ(ierr); + ExaGOFinalize(); + return 0; + } + + /* ------------------------------------------------------------------ + * Step 3: Copy IPOPT solution into opflow->X (natural ordering) + * ------------------------------------------------------------------ */ + double *x_nat; + ierr = VecGetArray(opflow->X, &x_nat); + CHKERRQ(ierr); + for (int i = 0; i < nx; i++) + x_nat[i] = xsol_nat[i]; + ierr = VecRestoreArray(opflow->X, &x_nat); + CHKERRQ(ierr); + + ierr = VecRestoreArray(Xsol, &xsol_nat); + CHKERRQ(ierr); + + std::cout << "Evaluating Jacobian at IPOPT-converged solution." << std::endl; + + /* ------------------------------------------------------------------ + * Step 4: Compute reference inequality Jacobian via PETSc. + * The first call establishes the sparsity pattern, the second + * computes the actual values at the converged solution. 
+ * ------------------------------------------------------------------ */ + ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( + opflow, opflow->X, opflow->Jac_Gi); + CHKERRQ(ierr); + ierr = MatSetOption(opflow->Jac_Gi, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE); + CHKERRQ(ierr); + ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( + opflow, opflow->X, opflow->Jac_Gi); + CHKERRQ(ierr); + + int nnz = opflow->nnz_ineqjacsp; + + auto &resmgr = umpire::ResourceManager::getInstance(); + umpire::Allocator h_allocator = resmgr.getAllocator("HOST"); + + double *ref_vals = + static_cast(h_allocator.allocate(nnz * sizeof(double))); + + PetscInt nrow, ncol; + ierr = MatGetSize(opflow->Jac_Gi, &nrow, &ncol); + CHKERRQ(ierr); + + double *vptr = ref_vals; + for (int i = 0; i < nrow; i++) { + PetscInt nvals; + const PetscInt *cols; + const PetscScalar *vals; + ierr = MatGetRow(opflow->Jac_Gi, i, &nvals, &cols, &vals); + CHKERRQ(ierr); + for (int j = 0; j < nvals; j++) + vptr[j] = vals[j]; + vptr += nvals; + ierr = MatRestoreRow(opflow->Jac_Gi, i, &nvals, &cols, &vals); + CHKERRQ(ierr); + } + + int ref_count = (int)(vptr - ref_vals); + std::cout << "PETSc extracted " << ref_count << " ineq Jacobian values" + << " (expected " << nnz << ")" << std::endl; + + if (ref_count != nnz) { + std::cout << "FAIL: NNZ mismatch! 
PETSc=" << ref_count + << " analytical=" << nnz << std::endl; + fail++; + } + + /* ------------------------------------------------------------------ + * Step 5: Compute GPU inequality Jacobian at the same solution + * ------------------------------------------------------------------ */ + double *x_host; + ierr = VecGetArray(opflow->X, &x_host); + CHKERRQ(ierr); + + double *x_sd = + static_cast(h_allocator.allocate(nx * sizeof(double))); + for (int i = 0; i < nx; i++) + x_sd[opflow->idxn2sd_map[i]] = x_host[i]; + + ierr = VecRestoreArray(opflow->X, &x_host); + CHKERRQ(ierr); + + double *gpu_vals; + double *x_dev, *gpu_vals_dev; + +#ifdef EXAGO_ENABLE_GPU + umpire::Allocator d_allocator = resmgr.getAllocator("DEVICE"); + x_dev = static_cast(d_allocator.allocate(nx * sizeof(double))); + gpu_vals_dev = + static_cast(d_allocator.allocate(nnz * sizeof(double))); + resmgr.memset(gpu_vals_dev, 0, nnz * sizeof(double)); +#else + x_dev = x_sd; + gpu_vals_dev = + static_cast(h_allocator.allocate(nnz * sizeof(double))); + memset(gpu_vals_dev, 0, nnz * sizeof(double)); +#endif + + umpire::util::AllocationRecord rec_x{x_sd, sizeof(double) * nx, + h_allocator.getAllocationStrategy()}; + resmgr.registerAllocation(x_sd, rec_x); +#ifdef EXAGO_ENABLE_GPU + resmgr.copy(x_dev, x_sd); +#endif + + std::cout << "Running RAJA GPU inequality Jacobian kernel..." << std::endl; + ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(opflow, x_dev, gpu_vals_dev); + + gpu_vals = static_cast(h_allocator.allocate(nnz * sizeof(double))); +#ifdef EXAGO_ENABLE_GPU + resmgr.copy(gpu_vals, gpu_vals_dev); +#else + memcpy(gpu_vals, gpu_vals_dev, nnz * sizeof(double)); +#endif + + /* ------------------------------------------------------------------ + * Step 6: Compare + * ------------------------------------------------------------------ */ + std::cout << "Comparing " << nnz << " inequality Jacobian values..." 
+ << std::endl; + int cmp_fail = compare_arrays(ref_vals, gpu_vals, nnz, "ineqjac"); + fail += cmp_fail; + + if (cmp_fail == 0) + std::cout << "PASS: All " << nnz + << " inequality Jacobian values match within tol=" << TOL + << std::endl; + else + std::cout << "FAIL: " << cmp_fail << " of " << nnz << " values differ" + << std::endl; + + /* ------------------------------------------------------------------ + * Step 7: Performance comparison (enabled with -benchmark flag) + * ------------------------------------------------------------------ */ + PetscBool run_benchmark = PETSC_FALSE; + ierr = PetscOptionsGetBool(NULL, NULL, "-benchmark", NULL, &run_benchmark); + CHKERRQ(ierr); + + if (run_benchmark) { + int niters = 1000; + PetscInt bench_nrow, bench_ncol; + ierr = MatGetSize(opflow->Jac_Gi, &bench_nrow, &bench_ncol); + CHKERRQ(ierr); + + double *bench_vals = + static_cast(h_allocator.allocate(nnz * sizeof(double))); + + std::cout << "\n=== Performance Benchmark (" << niters + << " iterations) ===" << std::endl; + + /* --- PETSc path: compute + MatGetRow extraction + copy to device --- */ + { + double *bench_dev; + size_t nnz_bytes = nnz * sizeof(double); +#ifdef EXAGO_ENABLE_GPU + bench_dev = static_cast(d_allocator.allocate(nnz_bytes)); +#else + bench_dev = static_cast(h_allocator.allocate(nnz_bytes)); +#endif + + auto t0 = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < niters; iter++) { + ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( + opflow, opflow->X, opflow->Jac_Gi); + + double *vp = bench_vals; + for (int i = 0; i < bench_nrow; i++) { + PetscInt nv; + const PetscInt *c; + const PetscScalar *v; + MatGetRow(opflow->Jac_Gi, i, &nv, &c, &v); + for (int j = 0; j < nv; j++) + vp[j] = v[j]; + vp += nv; + MatRestoreRow(opflow->Jac_Gi, i, &nv, &c, &v); + } +#ifdef EXAGO_ENABLE_GPU + (void)hipMemcpy(bench_dev, bench_vals, nnz_bytes, + hipMemcpyHostToDevice); +#else + memcpy(bench_dev, bench_vals, nnz_bytes); +#endif + } + auto 
t1 = std::chrono::high_resolution_clock::now(); + double petsc_us = + std::chrono::duration(t1 - t0).count() / niters; + std::cout << " PETSc path (compute + MatGetRow + copy): " << petsc_us + << " us/iter" << std::endl; + +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(bench_dev); +#else + h_allocator.deallocate(bench_dev); +#endif + } + + /* --- PETSc path: CPU compute only --- */ + { + auto t0 = std::chrono::high_resolution_clock::now(); + + for (int iter = 0; iter < niters; iter++) { + ierr = (*opflow->modelops.computeinequalityconstraintjacobian)( + opflow, opflow->X, opflow->Jac_Gi); + } + + auto t1 = std::chrono::high_resolution_clock::now(); + + double petsc_us = + std::chrono::duration(t1 - t0).count() / niters; + std::cout << " PETSc path (compute only): " << petsc_us + << " us/iter" << std::endl; + } + + /* --- GPU path: RAJA kernels, no copies --- */ + { + double *bench_dev; +#ifdef EXAGO_ENABLE_GPU + bench_dev = + static_cast(d_allocator.allocate(nnz * sizeof(double))); +#else + bench_dev = + static_cast(h_allocator.allocate(nnz * sizeof(double))); +#endif + +#ifdef EXAGO_ENABLE_GPU + (void)hipDeviceSynchronize(); +#endif + auto t0 = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < niters; iter++) { + ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(opflow, x_dev, bench_dev); + } +#ifdef EXAGO_ENABLE_GPU + (void)hipDeviceSynchronize(); +#endif + auto t1 = std::chrono::high_resolution_clock::now(); + double gpu_us = + std::chrono::duration(t1 - t0).count() / niters; + std::cout << " GPU path (RAJA kernels, no copies): " << gpu_us + << " us/iter" << std::endl; + +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(bench_dev); +#else + h_allocator.deallocate(bench_dev); +#endif + } + + h_allocator.deallocate(bench_vals); + std::cout << "=== End Benchmark ===" << std::endl; + } + + /* ------------------------------------------------------------------ + * Cleanup + * ------------------------------------------------------------------ */ + 
h_allocator.deallocate(ref_vals); + h_allocator.deallocate(gpu_vals); + h_allocator.deallocate(x_sd); +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(x_dev); + d_allocator.deallocate(gpu_vals_dev); +#else + h_allocator.deallocate(gpu_vals_dev); +#endif + + ierr = OPFLOWDestroy(&opflow); + CHKERRQ(ierr); + ierr = OPFLOWDestroy(&opflow_ref); + CHKERRQ(ierr); + ExaGOFinalize(); + + return fail; +}