diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp b/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp index fbe44e4e..158445fd 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.cpp @@ -21,6 +21,7 @@ int BUSParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(bl); h_allocator_.deallocate(xidx); h_allocator_.deallocate(gidx); + h_allocator_.deallocate(eqjacsp_selfidx); if (opflow->include_powerimbalance_variables) { h_allocator_.deallocate(xidxpimb); h_allocator_.deallocate(powerimbalance_penalty); @@ -45,6 +46,7 @@ int BUSParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(bl_dev_); d_allocator_.deallocate(xidx_dev_); d_allocator_.deallocate(gidx_dev_); + d_allocator_.deallocate(eqjacsp_selfidx_dev_); if (opflow->include_powerimbalance_variables) { d_allocator_.deallocate(xidxpimb_dev_); d_allocator_.deallocate(powerimbalance_penalty_dev_); @@ -84,6 +86,7 @@ int BUSParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(xidx_dev_, xidx); resmgr.copy(gidx_dev_, gidx); + resmgr.copy(eqjacsp_selfidx_dev_, eqjacsp_selfidx); if (opflow->include_powerimbalance_variables) { resmgr.copy(xidxpimb_dev_, xidxpimb); resmgr.copy(jacsp_idx_dev_, jacsp_idx); @@ -108,6 +111,7 @@ int BUSParamsRajaHiop::copy(OPFLOW opflow) { xidx_dev_ = xidx; xidxpimb_dev_ = xidxpimb; gidx_dev_ = gidx; + eqjacsp_selfidx_dev_ = eqjacsp_selfidx; jacsp_idx_dev_ = jacsp_idx; jacsq_idx_dev_ = jacsq_idx; powerimbalance_penalty_dev_ = powerimbalance_penalty; @@ -145,6 +149,7 @@ int BUSParamsRajaHiop::allocate(OPFLOW opflow) { xidx = paramAlloc(h_allocator_, nbus); gidx = paramAlloc(h_allocator_, nbus); + eqjacsp_selfidx = paramAlloc(h_allocator_, 2 * nbus); if (opflow->include_powerimbalance_variables) { xidxpimb = paramAlloc(h_allocator_, nbus); @@ -232,6 +237,7 @@ int BUSParamsRajaHiop::allocate(OPFLOW opflow) { xidx_dev_ = paramAlloc(d_allocator_, nbus); gidx_dev_ = paramAlloc(d_allocator_, nbus); + 
eqjacsp_selfidx_dev_ = paramAlloc(d_allocator_, 2 * nbus); if (opflow->include_powerimbalance_variables) { xidxpimb_dev_ = paramAlloc(d_allocator_, nbus); @@ -278,6 +284,9 @@ int LINEParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(ineqjacsp_idx_dev_, ineqjacsp_idx); resmgr.copy(xslackidx_dev_, xslackidx); } + resmgr.copy(eqjacsp_idx_dev_, eqjacsp_idx); + resmgr.copy(eqjacsp_diag_idx_dev_, eqjacsp_diag_idx); + resmgr.copy(isdcline_dev_, isdcline); #else Gff_dev_ = Gff; Bff_dev_ = Bff; @@ -299,6 +308,9 @@ int LINEParamsRajaHiop::copy(OPFLOW opflow) { ineqjacsp_idx_dev_ = ineqjacsp_idx; xslackidx_dev_ = xslackidx; } + eqjacsp_idx_dev_ = eqjacsp_idx; + eqjacsp_diag_idx_dev_ = eqjacsp_diag_idx; + isdcline_dev_ = isdcline; #endif return 0; } @@ -328,6 +340,9 @@ int LINEParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(ineqjacsp_idx); h_allocator_.deallocate(xslackidx); } + h_allocator_.deallocate(eqjacsp_idx); + h_allocator_.deallocate(eqjacsp_diag_idx); + h_allocator_.deallocate(isdcline); #ifdef EXAGO_ENABLE_GPU // Destroy parameter arrays on the device @@ -354,6 +369,9 @@ int LINEParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(ineqjacsp_idx_dev_); d_allocator_.deallocate(xslackidx_dev_); } + d_allocator_.deallocate(eqjacsp_idx_dev_); + d_allocator_.deallocate(eqjacsp_diag_idx_dev_); + d_allocator_.deallocate(isdcline_dev_); #endif return 0; @@ -404,6 +422,10 @@ int LINEParamsRajaHiop::allocate(OPFLOW opflow) { xslackidx = paramAlloc(h_allocator_, nlinelim); } + eqjacsp_idx = paramAlloc(h_allocator_, nlineON); + eqjacsp_diag_idx = paramAlloc(h_allocator_, 4 * nlineON); + isdcline = paramAlloc(h_allocator_, nlineON); + PetscInt j = 0; /* Populate arrays */ for (i = 0; i < ps->nline; i++) { @@ -451,6 +473,7 @@ int LINEParamsRajaHiop::allocate(OPFLOW opflow) { j++; } + isdcline[linei] = (int)line->isdcline; linei++; } @@ -480,6 +503,10 @@ int LINEParamsRajaHiop::allocate(OPFLOW opflow) { ineqjacsp_idx_dev_ = paramAlloc(d_allocator_, 
nlinelim); xslackidx_dev_ = paramAlloc(d_allocator_, nlinelim); } + + eqjacsp_idx_dev_ = paramAlloc(d_allocator_, nlineON); + eqjacsp_diag_idx_dev_ = paramAlloc(d_allocator_, 4 * nlineON); + isdcline_dev_ = paramAlloc(d_allocator_, nlineON); #endif return 0; } @@ -621,6 +648,7 @@ int GENParamsRajaHiop::destroy(OPFLOW opflow) { h_allocator_.deallocate(vs); h_allocator_.deallocate(xidx); h_allocator_.deallocate(xpdevidx); + h_allocator_.deallocate(xpsetidx); h_allocator_.deallocate(gidxbus); h_allocator_.deallocate(eqjacspbus_idx); h_allocator_.deallocate(eqjacsqbus_idx); @@ -647,6 +675,7 @@ int GENParamsRajaHiop::destroy(OPFLOW opflow) { d_allocator_.deallocate(vs_dev_); d_allocator_.deallocate(xidx_dev_); d_allocator_.deallocate(xpdevidx_dev_); + d_allocator_.deallocate(xpsetidx_dev_); d_allocator_.deallocate(gidxbus_dev_); d_allocator_.deallocate(eqjacspbus_idx_dev_); d_allocator_.deallocate(eqjacsqbus_idx_dev_); @@ -685,6 +714,7 @@ int GENParamsRajaHiop::copy(OPFLOW opflow) { resmgr.copy(xidx_dev_, xidx); resmgr.copy(xpdevidx_dev_, xpdevidx); + resmgr.copy(xpsetidx_dev_, xpsetidx); resmgr.copy(gidxbus_dev_, gidxbus); resmgr.copy(eqjacspbus_idx_dev_, eqjacspbus_idx); @@ -711,6 +741,7 @@ int GENParamsRajaHiop::copy(OPFLOW opflow) { vs_dev_ = vs; xidx_dev_ = xidx; xpdevidx_dev_ = xpdevidx; + xpsetidx_dev_ = xpsetidx; gidxbus_dev_ = gidxbus; eqjacspbus_idx_dev_ = eqjacspbus_idx; eqjacsqbus_idx_dev_ = eqjacsqbus_idx; @@ -758,6 +789,7 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { xidx = paramAlloc(h_allocator_, ngenON); xpdevidx = paramAlloc(h_allocator_, ngenON); + xpsetidx = paramAlloc(h_allocator_, ngenON); gidxbus = paramAlloc(h_allocator_, ngenON); eqjacspbus_idx = paramAlloc(h_allocator_, ngenON); @@ -798,6 +830,10 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { vs[geni] = gen->vs; if (opflow->has_gensetpoint) { pgs[geni] = gen->pgs; + if (!gen->isrenewable) { + xpdevidx[geni] = opflow->idxn2sd_map[gen->startxpdevloc]; + xpsetidx[geni] = 
opflow->idxn2sd_map[gen->startxpsetloc]; + } } xidx[geni] = opflow->idxn2sd_map[loc]; @@ -832,6 +868,7 @@ int GENParamsRajaHiop::allocate(OPFLOW opflow) { xidx_dev_ = paramAlloc(d_allocator_, ngenON); xpdevidx_dev_ = paramAlloc(d_allocator_, ngenON); + xpsetidx_dev_ = paramAlloc(d_allocator_, ngenON); gidxbus_dev_ = paramAlloc(d_allocator_, ngenON); eqjacspbus_idx_dev_ = paramAlloc(d_allocator_, ngenON); diff --git a/src/opflow/model/power_bal_hiop/paramsrajahiop.h b/src/opflow/model/power_bal_hiop/paramsrajahiop.h index d274f990..65de68e6 100644 --- a/src/opflow/model/power_bal_hiop/paramsrajahiop.h +++ b/src/opflow/model/power_bal_hiop/paramsrajahiop.h @@ -29,11 +29,13 @@ struct BUSParamsRajaHiop { int *jacsp_idx; /* Location number in the sparse Jacobian for Pimb */ int *jacsq_idx; /* Location number in the sparse Jacobian for Qimb */ int *hesssp_idx; /* KS: Hessian indices */ - int *ispv; /* KS: ispv[i] = 1 if bus is PV bus */ - int *gineqidx; /* KS: starting position of bus ineq constraints */ - int *ineqjacsp_idx; /* KS: index in flat sparse ineq Jacobian array */ - int *genoffset; /* KS: Offset into flattened gen array for this bus */ - int *ngenONbus; /* KS: Number of ON generators on this bus */ + int *eqjacsp_selfidx; /* Flat-array position for bus self-admittance in eq + Jacobian. 
[2*i] = P-row base, [2*i+1] = Q-row base */ + int *ispv; /* KS: ispv[i] = 1 if bus is PV bus */ + int *gineqidx; /* KS: starting position of bus ineq constraints */ + int *ineqjacsp_idx; /* KS: index in flat sparse ineq Jacobian array */ + int *genoffset; /* KS: Offset into flattened gen array for this bus */ + int *ngenONbus; /* KS: Number of ON generators on this bus */ // Device data int *isref_dev_; /* isref[i] = 1 if bus is reference bus */ @@ -51,14 +53,15 @@ struct BUSParamsRajaHiop { X vector */ int *gidx_dev_; /* starting locations for bus balance equations in constraint vector */ - int *jacsp_idx_dev_; /* Location number in the sparse Jacobian for Pimb */ - int *jacsq_idx_dev_; /* Location number in the sparse Jacobian for Qimb */ - int *hesssp_idx_dev_; /* Location number in the Hessian */ - int *ispv_dev_; /* KS: dev counterpart of ispv */ - int *gineqidx_dev_; /* KS: dev counterpart of gineqidx */ - int *ineqjacsp_idx_dev_; /* KS: device counterpart of ineqjacsp_idx_ */ - int *genoffset_dev_; /* KS: dev counterpart of genoffset */ - int *ngenONbus_dev_; /* KS: dev counterpart of ngenONbus */ + int *jacsp_idx_dev_; /* Location number in the sparse Jacobian for Pimb */ + int *jacsq_idx_dev_; /* Location number in the sparse Jacobian for Qimb */ + int *hesssp_idx_dev_; /* Location number in the Hessian */ + int *eqjacsp_selfidx_dev_; /* KS: eqjacsp_selfidx device counterpart */ + int *ispv_dev_; /* KS: dev counterpart of ispv */ + int *gineqidx_dev_; /* KS: dev counterpart of gineqidx */ + int *ineqjacsp_idx_dev_; /* KS: device counterpart of ineqjacsp_idx_ */ + int *genoffset_dev_; /* KS: dev counterpart of genoffset */ + int *ngenONbus_dev_; /* KS: dev counterpart of ngenONbus */ int allocate(OPFLOW); int destroy(OPFLOW); @@ -88,6 +91,8 @@ struct GENParamsRajaHiop { int *xidx; /* starting locations in X vector */ int *xpdevidx; /* KS: tarting locations of deviation variables in X vector */ + int * + xpsetidx; /* KS: Starting locations in X vector for 
set-point variables */ int * gidxbus; /* starting locations in constraint vector for bus constraints */ int *geqidxgen; /* starting locations in equality constraint vector for gen @@ -123,6 +128,7 @@ struct GENParamsRajaHiop { int *xidx_dev_; /* starting locations in X vector */ int *xpdevidx_dev_; /* KS: device coutnerpart of xpdevidx*/ + int *xpsetidx_dev_; /* KS: xpsetidx device counterpart */ int *gidxbus_dev_; /* starting locations in constraint vector for bus constraints */ int *geqidxgen_dev_; /* starting locations in equality constraint vector for @@ -213,6 +219,11 @@ struct LINEParamsRajaHiop { int *linelimidx; /* Indices for subset of lines that have finite limits */ int *ineqjacsp_idx; /* KS: Position in flat sparse ineq Jacobian array */ int *xslackidx; /* Starting location of slack variables in X vector */ + int *eqjacsp_idx; /* Flat-array offset for off-diagonal eq Jacobian entries */ + int *eqjacsp_diag_idx; /* Flat-array positions for diagonal entries per line + [4*l+0]=from P-row, [4*l+1]=from Q-row, + [4*l+2]=to P-row, [4*l+3]=to Q-row */ + int *isdcline; /* isdcline[i] = 1 if line is a DC line */ // Device data double *Gff_dev_; /* From side self conductance */ @@ -238,6 +249,9 @@ struct LINEParamsRajaHiop { linelimidx_dev_; /* Indices for subset of lines that have finite limits */ int *ineqjacsp_idx_dev_; /* KS: Position in flat sparse ineq Jacobian array */ int *xslackidx_dev_; /* Starting location of slack variables in X vector */ + int *eqjacsp_idx_dev_; + int *eqjacsp_diag_idx_dev_; + int *isdcline_dev_; int allocate(OPFLOW); int destroy(OPFLOW); diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp index cac87986..2e1a9f42 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse.cpp @@ -3,6 +3,8 @@ #if defined(EXAGO_ENABLE_RAJA) #if defined(EXAGO_ENABLE_HIOP_SPARSE) +#include + #include #include 
"pbpolrajahiopsparsekernels.hpp" @@ -267,12 +269,145 @@ PetscErrorCode OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE(OPFLOW opflow) { pbpolrajahiopsparse->agc_xidx = -1; } - /* KS: Compute the number of nonzeros in equality, inequality constraint - * Jacobians and Hessian. Equality and Hessian counts are still obtained - * from PETSc via get_sparse_blocks_info; inequality count is computed - * axplicitly so we can skip PETSc */ + /* KS: Compute the number of nonzeros in equality and inequality constraint + * Jacobians. Equality count is computed explicitly for the GPU kernel. + * Inequality count is computed explicitly so we can skip PETSc. */ int nnz_eqjacsp = 0, nnz_ineqjacsp = 0, nnz_hesssp = 0; + /* ---- Equality constraint Jacobian nnz counting ---- */ + { + int geni_eq = 0, loadi_eq = 0; + for (int ibus = 0; ibus < ps->nbus; ++ibus) { + PSBUS bus_eq = &(ps->bus[ibus]); + + busparams->eqjacsp_selfidx[2 * ibus] = nnz_eqjacsp; + nnz_eqjacsp += 2; + + if (bus_eq->ide == ISOLATED_BUS) { + busparams->eqjacsp_selfidx[2 * ibus + 1] = nnz_eqjacsp; + nnz_eqjacsp += 2; + continue; + } + + if (opflow->include_powerimbalance_variables) { + busparams->jacsp_idx[ibus] = nnz_eqjacsp; + nnz_eqjacsp += 2; + } + + int gi_eq = 0; + for (int kk = 0; kk < bus_eq->ngen; kk++) { + PSGEN gen_eq; + ierr = PSBUSGetGen(bus_eq, kk, &gen_eq); + CHKERRQ(ierr); + if (!gen_eq->status) + continue; + genparams->eqjacspbus_idx[geni_eq + gi_eq] = nnz_eqjacsp; + nnz_eqjacsp += 1; + gi_eq++; + } + + if (opflow->include_loadloss_variables) { + for (int kk = 0; kk < bus_eq->nload; kk++) { + PSLOAD load_eq; + ierr = PSBUSGetLoad(bus_eq, kk, &load_eq); + CHKERRQ(ierr); + loadparams->jacsp_idx[loadi_eq + kk] = nnz_eqjacsp; + nnz_eqjacsp += 1; + } + } + + busparams->eqjacsp_selfidx[2 * ibus + 1] = nnz_eqjacsp; + nnz_eqjacsp += 2; + + if (opflow->include_powerimbalance_variables) { + busparams->jacsq_idx[ibus] = nnz_eqjacsp; + nnz_eqjacsp += 2; + } + + gi_eq = 0; + for (int kk = 0; kk < bus_eq->ngen; kk++) { + 
PSGEN gen_eq; + ierr = PSBUSGetGen(bus_eq, kk, &gen_eq); + CHKERRQ(ierr); + if (!gen_eq->status) + continue; + genparams->eqjacsqbus_idx[geni_eq + gi_eq] = nnz_eqjacsp; + nnz_eqjacsp += 1; + gi_eq++; + } + + if (opflow->include_loadloss_variables) { + for (int kk = 0; kk < bus_eq->nload; kk++) { + PSLOAD load_eq; + ierr = PSBUSGetLoad(bus_eq, kk, &load_eq); + CHKERRQ(ierr); + loadparams->jacsq_idx[loadi_eq + kk] = nnz_eqjacsp; + nnz_eqjacsp += 1; + } + } + + geni_eq += bus_eq->ngenON; + loadi_eq += bus_eq->nload; + } + + int linei_eq = 0; + std::map<std::pair<int, int>, int> buspair_to_offdiag; + for (int iline = 0; iline < ps->nline; ++iline) { + PSLINE line_eq = &(ps->line[iline]); + if (!line_eq->status) + continue; + if (!line_eq->isdcline) { + const PSBUS *connbuses_eq; + ierr = PSLINEGetConnectedBuses(line_eq, &connbuses_eq); + CHKERRQ(ierr); + int busidxf = (int)(connbuses_eq[0] - ps->bus); + int busidxt = (int)(connbuses_eq[1] - ps->bus); + + lineparams->eqjacsp_diag_idx[4 * linei_eq + 0] = + busparams->eqjacsp_selfidx[2 * busidxf]; + lineparams->eqjacsp_diag_idx[4 * linei_eq + 1] = + busparams->eqjacsp_selfidx[2 * busidxf + 1]; + lineparams->eqjacsp_diag_idx[4 * linei_eq + 2] = + busparams->eqjacsp_selfidx[2 * busidxt]; + lineparams->eqjacsp_diag_idx[4 * linei_eq + 3] = + busparams->eqjacsp_selfidx[2 * busidxt + 1]; + + auto key = std::make_pair(std::min(busidxf, busidxt), + std::max(busidxf, busidxt)); + auto it = buspair_to_offdiag.find(key); + if (it != buspair_to_offdiag.end()) { + lineparams->eqjacsp_idx[linei_eq] = it->second; + } else { + lineparams->eqjacsp_idx[linei_eq] = nnz_eqjacsp; + buspair_to_offdiag[key] = nnz_eqjacsp; + nnz_eqjacsp += 8; + } + } + linei_eq++; + } + + if (opflow->has_gensetpoint) { + geni_eq = 0; + for (int ibus = 0; ibus < ps->nbus; ++ibus) { + PSBUS bus_eq = &(ps->bus[ibus]); + int gi_eq = 0; + for (int kk = 0; kk < bus_eq->ngen; kk++) { + PSGEN gen_eq; + ierr = PSBUSGetGen(bus_eq, kk, &gen_eq); + CHKERRQ(ierr); + if (!gen_eq->status) +
continue; + if (!gen_eq->isrenewable) { + genparams->eqjacspgen_idx[geni_eq + gi_eq] = nnz_eqjacsp; + nnz_eqjacsp += 4; + } + gi_eq++; + } + geni_eq += bus_eq->ngenON; + } + } + } + /* * KS: Count inequality Jacobian non-zeros. The traversal order must match * the startineqloc assignment in OPFLOWModelSetUp_PBPOL: for each bus @@ -377,6 +512,20 @@ extern PetscErrorCode OPFLOWSolutionCallback_PBPOLRAJAHIOPSPARSE( OPFLOW, const double *, const double *, const double *, const double *, const double *, double); +/** + * Empty stub for the equality constraint Jacobian in the RAJA sparse model. + * Can be deleted if needed (dead code). + */ +PetscErrorCode +OPFLOWComputeEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, + Vec X, Mat Je) { + (void)opflow; + (void)X; + (void)Je; + PetscFunctionBegin; + PetscFunctionReturn(0); +} + /** * @brief Constructor for the PBPOLRAJAHIOPSPARSE model. * @@ -430,7 +579,7 @@ PetscErrorCode OPFLOWModelCreate_PBPOLRAJAHIOPSPARSE(OPFLOW opflow) { opflow->modelops.solutiontops = OPFLOWSolutionToPS_PBPOLRAJAHIOPSPARSE; opflow->modelops.setup = OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE; opflow->modelops.computeequalityconstraintjacobian = - OPFLOWComputeEqualityConstraintJacobian_PBPOL; + OPFLOWComputeEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE; opflow->modelops.computesparseequalityconstraintjacobianhiop = OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE; opflow->modelops.computeinequalityconstraintjacobian = diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp index 05705a35..1e9c7f0c 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.cpp @@ -240,5 +240,205 @@ void ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, } } +void ComputeEqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, + const double *x_dev, + double *jace_dev) { + 
PbpolModelRajaHiop *pbpolrajahiopsparse = + reinterpret_cast<PbpolModelRajaHiop *>(opflow->model); + BUSParamsRajaHiop *busparams = &pbpolrajahiopsparse->busparams; + GENParamsRajaHiop *genparams = &pbpolrajahiopsparse->genparams; + LOADParamsRajaHiop *loadparams = &pbpolrajahiopsparse->loadparams; + LINEParamsRajaHiop *lineparams = &pbpolrajahiopsparse->lineparams; + + /* Zero the equality Jacobian values before accumulating */ + RAJA::forall<exago_raja_exec>( + RAJA::RangeSegment(0, opflow->nnz_eqjacsp), + RAJA_LAMBDA(RAJA::Index_type i) { jace_dev[i] = 0.0; }); + + /* + * Kernel 1: Bus contributions (shunt self-admittance, power imbalance, + * generator Pg/Qg, load loss, gen setpoint). + */ + { + int *b_isisolated = busparams->isisolated_dev_; + int *b_xidx = busparams->xidx_dev_; + double *b_gl = busparams->gl_dev_; + double *b_bl = busparams->bl_dev_; + int *b_selfidx = busparams->eqjacsp_selfidx_dev_; + + RAJA::forall<exago_raja_exec>( + RAJA::RangeSegment(0, busparams->nbus), + RAJA_LAMBDA(RAJA::Index_type i) { + int pbase = b_selfidx[2 * i]; + int qbase = b_selfidx[2 * i + 1]; + + if (b_isisolated[i]) { + jace_dev[pbase + 0] = 1.0; + jace_dev[pbase + 1] = 0.0; + jace_dev[qbase + 0] = 0.0; + jace_dev[qbase + 1] = 1.0; + return; + } + + double Vm = x_dev[b_xidx[i] + 1]; + + jace_dev[pbase + 0] = 0.0; + jace_dev[pbase + 1] = 2.0 * Vm * b_gl[i]; + jace_dev[qbase + 0] = 0.0; + jace_dev[qbase + 1] = -2.0 * Vm * b_bl[i]; + }); + + if (opflow->include_powerimbalance_variables) { + int *b_jacsp = busparams->jacsp_idx_dev_; + int *b_jacsq = busparams->jacsq_idx_dev_; + RAJA::forall<exago_raja_exec>( + RAJA::RangeSegment(0, busparams->nbus), + RAJA_LAMBDA(RAJA::Index_type i) { + jace_dev[b_jacsp[i]] = 1.0; + jace_dev[b_jacsp[i] + 1] = -1.0; + jace_dev[b_jacsq[i]] = 1.0; + jace_dev[b_jacsq[i] + 1] = -1.0; + }); + } + + int *g_eqjacspbus = genparams->eqjacspbus_idx_dev_; + int *g_eqjacsqbus = genparams->eqjacsqbus_idx_dev_; + RAJA::forall<exago_raja_exec>( + RAJA::RangeSegment(0, genparams->ngenON), + RAJA_LAMBDA(RAJA::Index_type i) { +
jace_dev[g_eqjacspbus[i]] = -1.0; + jace_dev[g_eqjacsqbus[i]] = -1.0; + }); + + if (opflow->include_loadloss_variables) { + int *l_jacsp = loadparams->jacsp_idx_dev_; + int *l_jacsq = loadparams->jacsq_idx_dev_; + RAJA::forall<exago_raja_exec>( + RAJA::RangeSegment(0, loadparams->nload), + RAJA_LAMBDA(RAJA::Index_type i) { + jace_dev[l_jacsp[i]] = -1.0; + jace_dev[l_jacsq[i]] = -1.0; + }); + } + + if (opflow->has_gensetpoint) { + int *g_eqjacspgen = genparams->eqjacspgen_idx_dev_; + int *g_isrenewable = genparams->isrenewable_dev_; + RAJA::forall<exago_raja_exec>( + RAJA::RangeSegment(0, genparams->ngenON), + RAJA_LAMBDA(RAJA::Index_type i) { + if (g_isrenewable[i]) + return; + int base = g_eqjacspgen[i]; + jace_dev[base + 0] = -1.0; + jace_dev[base + 1] = 1.0; + jace_dev[base + 2] = 1.0; + jace_dev[base + 3] = 1.0; + }); + } + } + + /* + * Kernel 2: Line contributions — diagonal via atomicAdd, off-diagonal + * via atomicAdd (for parallel lines sharing positions). + */ + { + double *l_Gff = lineparams->Gff_dev_; + double *l_Bff = lineparams->Bff_dev_; + double *l_Gft = lineparams->Gft_dev_; + double *l_Bft = lineparams->Bft_dev_; + double *l_Gtf = lineparams->Gtf_dev_; + double *l_Btf = lineparams->Btf_dev_; + double *l_Gtt = lineparams->Gtt_dev_; + double *l_Btt = lineparams->Btt_dev_; + int *l_xidxf = lineparams->xidxf_dev_; + int *l_xidxt = lineparams->xidxt_dev_; + int *l_eqjacsp_idx = lineparams->eqjacsp_idx_dev_; + int *l_eqjacsp_diag = lineparams->eqjacsp_diag_idx_dev_; + int *l_isdcline = lineparams->isdcline_dev_; + + RAJA::forall<exago_raja_exec>( + RAJA::RangeSegment(0, lineparams->nlineON), + RAJA_LAMBDA(RAJA::Index_type l) { + if (l_isdcline[l]) + return; + + double thetaf = x_dev[l_xidxf[l]]; + double Vmf = x_dev[l_xidxf[l] + 1]; + double thetat = x_dev[l_xidxt[l]]; + double Vmt = x_dev[l_xidxt[l] + 1]; + double thetaft = thetaf - thetat; + double thetatf = thetat - thetaf; + + double Gff = l_Gff[l], Bff = l_Bff[l]; + double Gft = l_Gft[l], Bft = l_Bft[l]; + double Gtf = l_Gtf[l], Btf = l_Btf[l]; +
double Gtt = l_Gtt[l], Btt = l_Btt[l]; + + double sin_ft = sin(thetaft), cos_ft = cos(thetaft); + double sin_tf = sin(thetatf), cos_tf = cos(thetatf); + + /* From-bus diagonal */ + double dPf_dthetaf = Vmf * Vmt * (-Gft * sin_ft + Bft * cos_ft); + double dPf_dVmf = + 2.0 * Gff * Vmf + Vmt * (Gft * cos_ft + Bft * sin_ft); + double dQf_dthetaf = Vmf * Vmt * (Bft * sin_ft + Gft * cos_ft); + double dQf_dVmf = + -2.0 * Bff * Vmf + Vmt * (-Bft * cos_ft + Gft * sin_ft); + + int pfbase = l_eqjacsp_diag[4 * l + 0]; + int qfbase = l_eqjacsp_diag[4 * l + 1]; + + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[pfbase + 0], + dPf_dthetaf); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[pfbase + 1], dPf_dVmf); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[qfbase + 0], + dQf_dthetaf); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[qfbase + 1], dQf_dVmf); + + /* From-bus off-diagonal */ + double dPf_dthetat = Vmf * Vmt * (Gft * sin_ft - Bft * cos_ft); + double dPf_dVmt = Vmf * (Gft * cos_ft + Bft * sin_ft); + double dQf_dthetat = Vmf * Vmt * (-Bft * sin_ft - Gft * cos_ft); + double dQf_dVmt = Vmf * (-Bft * cos_ft + Gft * sin_ft); + + int obase = l_eqjacsp_idx[l]; + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[obase + 0], dPf_dthetat); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[obase + 1], dPf_dVmt); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[obase + 2], dQf_dthetat); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[obase + 3], dQf_dVmt); + + /* To-bus diagonal */ + double dPt_dthetat = Vmt * Vmf * (-Gtf * sin_tf + Btf * cos_tf); + double dPt_dVmt = + 2.0 * Gtt * Vmt + Vmf * (Gtf * cos_tf + Btf * sin_tf); + double dQt_dthetat = Vmt * Vmf * (Btf * sin_tf + Gtf * cos_tf); + double dQt_dVmt = + -2.0 * Btt * Vmt + Vmf * (-Btf * cos_tf + Gtf * sin_tf); + + int ptbase = l_eqjacsp_diag[4 * l + 2]; + int qtbase = l_eqjacsp_diag[4 * l + 3]; + + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[ptbase + 0], + dPt_dthetat); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[ptbase + 1], dPt_dVmt); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[qtbase + 0], + dQt_dthetat); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[qtbase + 1], dQt_dVmt); + + /* To-bus off-diagonal */ + double dPt_dthetaf = Vmt * Vmf * (Gtf * sin_tf -
Btf * cos_tf); + double dPt_dVmf = Vmt * (Gtf * cos_tf + Btf * sin_tf); + double dQt_dthetaf = Vmt * Vmf * (-Btf * sin_tf - Gtf * cos_tf); + double dQt_dVmf = Vmt * (-Btf * cos_tf + Gtf * sin_tf); + + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[obase + 4], dPt_dthetaf); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[obase + 5], dPt_dVmf); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[obase + 6], dQt_dthetaf); + RAJA::atomicAdd<exago_raja_atomic>(&jace_dev[obase + 7], dQt_dVmf); + }); + } +} + + #endif // EXAGO_ENABLE_HIOP_SPARSE #endif // EXAGO_ENABLE_RAJA diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp index 938686a7..8de5f4dd 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparse_gpu.hpp @@ -26,5 +26,12 @@ void ComputeIneqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, const double *x_dev, double *jacd_dev); +/** + * GPU-only (PETSc-free) computation of equality constraint Jacobian values. + */ +void ComputeEqJacValuesGPU_PBPOLRAJAHIOPSPARSE(OPFLOW opflow, + const double *x_dev, + double *jace_dev); + #endif // EXAGO_ENABLE_HIOP_SPARSE #endif // EXAGO_ENABLE_RAJA diff --git a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp index efddb6f6..174a9464 100644 --- a/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp +++ b/src/opflow/model/power_bal_hiop/pbpolrajahiopsparsekernels.cpp @@ -702,12 +702,12 @@ OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE( // This only needs to be done once since the sparsity pattern of the Jacobian // does not change during the optimization. if (iJacS_dev != NULL && jJacS_dev != NULL) { - /* Set locations only */ - + /* Compute sparsity pattern on host, matching the flat-array layout + defined during setup in OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE.
+*/ roffset = 0; coffset = 0; - // Create arrays on the host to store i, j, and val arrays umpire::Allocator h_allocator_ = resmgr.getAllocator("HOST"); pbpolrajahiopsparse->i_jaceq = @@ -717,35 +717,177 @@ OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE( pbpolrajahiopsparse->val_jaceq = (double *)(h_allocator_.allocate(opflow->nnz_eqjacsp * sizeof(double))); - iRowstart = pbpolrajahiopsparse->i_jaceq; - jColstart = pbpolrajahiopsparse->j_jaceq; + int *iRow = pbpolrajahiopsparse->i_jaceq; + int *jCol = pbpolrajahiopsparse->j_jaceq; + + PS ps = opflow->ps; + BUSParamsRajaHiop *busparams = &pbpolrajahiopsparse->busparams; + GENParamsRajaHiop *genparams = &pbpolrajahiopsparse->genparams; + LOADParamsRajaHiop *loadparams = &pbpolrajahiopsparse->loadparams; + LINEParamsRajaHiop *lineparams = &pbpolrajahiopsparse->lineparams; + + int geni = 0, loadi = 0; + /*KS: not worth movint this to the gpu */ + for (int ibus = 0; ibus < ps->nbus; ibus++) { + PSBUS bus = &ps->bus[ibus]; + int P_row = roffset + busparams->gidx[ibus]; + int Q_row = P_row + 1; + int theta_col = coffset + busparams->xidx[ibus]; + int Vm_col = theta_col + 1; + int base; + + /* P-row self-admittance */ + base = busparams->eqjacsp_selfidx[2 * ibus]; + iRow[base] = P_row; + jCol[base] = theta_col; + iRow[base + 1] = P_row; + jCol[base + 1] = Vm_col; + + if (bus->ide == ISOLATED_BUS) { + /* Q-row self-admittance for isolated bus */ + base = busparams->eqjacsp_selfidx[2 * ibus + 1]; + iRow[base] = Q_row; + jCol[base] = theta_col; + iRow[base + 1] = Q_row; + jCol[base + 1] = Vm_col; + continue; + } - // Function pointer computeequalityconstraintjacobian points to - // OPFLOWComputeEqualityConstraintJacobian_PBPOL function. - // Use function from PBPOL model to create the sparsity pattern. 
- ierr = (*opflow->modelops.computeequalityconstraintjacobian)( - opflow, opflow->X, opflow->Jac_Ge); - CHKERRQ(ierr); + /* P-row power imbalance */ + if (opflow->include_powerimbalance_variables) { + base = busparams->jacsp_idx[ibus]; + int pimb_col = coffset + busparams->xidxpimb[ibus]; + iRow[base] = P_row; + jCol[base] = pimb_col; + iRow[base + 1] = P_row; + jCol[base + 1] = pimb_col + 1; + } - ierr = MatGetSize(opflow->Jac_Ge, &nrow, &ncol); - CHKERRQ(ierr); + /* P-row gen Pg */ + int gi = 0; + for (int k = 0; k < bus->ngen; k++) { + PSGEN gen; + PSBUSGetGen(bus, k, &gen); + if (!gen->status) + continue; + base = genparams->eqjacspbus_idx[geni + gi]; + iRow[base] = P_row; + jCol[base] = coffset + genparams->xidx[geni + gi]; + gi++; + } - /* Copy over locations to triplet format */ - for (i = 0; i < nrow; i++) { - ierr = MatGetRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); - CHKERRQ(ierr); - for (j = 0; j < nvals; j++) { - iRowstart[j] = roffset + i; - jColstart[j] = coffset + cols[j]; + /* P-row load loss */ + if (opflow->include_loadloss_variables) { + for (int k = 0; k < bus->nload; k++) { + base = loadparams->jacsp_idx[loadi + k]; + iRow[base] = P_row; + jCol[base] = coffset + loadparams->xidx[loadi + k]; + } + } + + /* Q-row self-admittance */ + base = busparams->eqjacsp_selfidx[2 * ibus + 1]; + iRow[base] = Q_row; + jCol[base] = theta_col; + iRow[base + 1] = Q_row; + jCol[base + 1] = Vm_col; + + /* Q-row power imbalance */ + if (opflow->include_powerimbalance_variables) { + base = busparams->jacsq_idx[ibus]; + int pimb_col = coffset + busparams->xidxpimb[ibus]; + iRow[base] = Q_row; + jCol[base] = pimb_col + 2; + iRow[base + 1] = Q_row; + jCol[base + 1] = pimb_col + 3; + } + + /* Q-row gen Qg */ + gi = 0; + for (int k = 0; k < bus->ngen; k++) { + PSGEN gen; + PSBUSGetGen(bus, k, &gen); + if (!gen->status) + continue; + base = genparams->eqjacsqbus_idx[geni + gi]; + iRow[base] = Q_row; + jCol[base] = coffset + genparams->xidx[geni + gi] + 1; + gi++; + } + + 
/* Q-row load loss */ + if (opflow->include_loadloss_variables) { + for (int k = 0; k < bus->nload; k++) { + base = loadparams->jacsq_idx[loadi + k]; + iRow[base] = Q_row; + jCol[base] = coffset + loadparams->xidx[loadi + k] + 1; + } + } + + geni += bus->ngenON; + loadi += bus->nload; + } + + /* Line off-diagonal entries */ + for (int l = 0; l < lineparams->nlineON; l++) { + if (lineparams->isdcline[l]) + continue; + + int base = lineparams->eqjacsp_idx[l]; + int Pf_row = roffset + lineparams->geqidxf[l]; + int Qf_row = Pf_row + 1; + int Pt_row = roffset + lineparams->geqidxt[l]; + int Qt_row = Pt_row + 1; + int thetat_col = coffset + lineparams->xidxt[l]; + int Vmt_col = thetat_col + 1; + int thetaf_col = coffset + lineparams->xidxf[l]; + int Vmf_col = thetaf_col + 1; + + /* From-bus off-diagonal: Pf w.r.t. thetat, Vmt */ + iRow[base + 0] = Pf_row; + jCol[base + 0] = thetat_col; + iRow[base + 1] = Pf_row; + jCol[base + 1] = Vmt_col; + /* Qf w.r.t. thetat, Vmt */ + iRow[base + 2] = Qf_row; + jCol[base + 2] = thetat_col; + iRow[base + 3] = Qf_row; + jCol[base + 3] = Vmt_col; + /* To-bus off-diagonal: Pt w.r.t. thetaf, Vmf */ + iRow[base + 4] = Pt_row; + jCol[base + 4] = thetaf_col; + iRow[base + 5] = Pt_row; + jCol[base + 5] = Vmf_col; + /* Qt w.r.t. 
thetaf, Vmf */ + iRow[base + 6] = Qt_row; + jCol[base + 6] = thetaf_col; + iRow[base + 7] = Qt_row; + jCol[base + 7] = Vmf_col; + } + + /* Generator set-point equality constraint entries */ + if (opflow->has_gensetpoint) { + for (int g = 0; g < genparams->ngenON; g++) { + if (genparams->isrenewable[g]) + continue; + int base = genparams->eqjacspgen_idx[g]; + int row0 = roffset + genparams->geqidxgen[g]; + int row1 = row0 + 1; + int Pg_col = coffset + genparams->xidx[g]; + int delPg_col = coffset + genparams->xpdevidx[g]; + int Pset_col = coffset + genparams->xpsetidx[g]; + + iRow[base + 0] = row0; + jCol[base + 0] = Pg_col; + iRow[base + 1] = row0; + jCol[base + 1] = delPg_col; + iRow[base + 2] = row0; + jCol[base + 2] = Pset_col; + iRow[base + 3] = row1; + jCol[base + 3] = Pset_col; } - /* Increment iRow,jCol pointers */ - iRowstart += nvals; - jColstart += nvals; - ierr = MatRestoreRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); - CHKERRQ(ierr); } - // Copy over i_jaceq and j_jaceq arrays to device resmgr.copy(iJacS_dev, pbpolrajahiopsparse->i_jaceq); resmgr.copy(jJacS_dev, pbpolrajahiopsparse->j_jaceq); } @@ -754,45 +896,10 @@ OPFLOWComputeSparseEqualityConstraintJacobian_PBPOLRAJAHIOPSPARSE( ierr = PetscLogEventBegin(opflow->eqconsjaclogger, 0, 0, 0, 0); CHKERRQ(ierr); - ierr = VecGetArray(opflow->X, &x); - CHKERRQ(ierr); - - // Copy from device to host - umpire::Allocator h_allocator_ = resmgr.getAllocator("HOST"); - registerWith(x, opflow->nx, resmgr, h_allocator_); - resmgr.copy((double *)x, (double *)x_dev); - - ierr = VecRestoreArray(opflow->X, &x); - CHKERRQ(ierr); - - // Compute equality constraint jacobian on the host - // Function pointer computeequalityconstraintjacobian points to the - // implementation OPFLOWComputeEqualityConstraintJacobian_PBPOL in the - // PBPOL model. 
- ierr = (*opflow->modelops.computeequalityconstraintjacobian)( - opflow, opflow->X, opflow->Jac_Ge); - CHKERRQ(ierr); - - ierr = MatGetSize(opflow->Jac_Ge, &nrow, &ncol); - CHKERRQ(ierr); - - values = pbpolrajahiopsparse->val_jaceq; - - // Unpack PETSc matrix and copy values to the host array in the - // PbpolModelRajaHiop struct. - for (i = 0; i < nrow; i++) { - ierr = MatGetRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); - CHKERRQ(ierr); - for (j = 0; j < nvals; j++) { - values[j] = vals[j]; - } - values += nvals; - ierr = MatRestoreRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); - CHKERRQ(ierr); - } - - // Copy over val_jaceq to device - resmgr.copy(MJacS_dev, pbpolrajahiopsparse->val_jaceq); + /* KS: Compute equality constraint Jacobian directly on device. + No H2D, D2H copies: x_dev is already on device, output goes + straight into MJacS_dev. */ + ComputeEqJacValuesGPU_PBPOLRAJAHIOPSPARSE(opflow, x_dev, MJacS_dev); ierr = PetscLogEventEnd(opflow->eqconsjaclogger, 0, 0, 0, 0); CHKERRQ(ierr); diff --git a/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp b/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp index d768e738..af5e165c 100644 --- a/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp +++ b/src/opflow/solver/hiop/opflow_hiopsparsegpu.cpp @@ -56,18 +56,10 @@ bool OPFLOWHIOPSPARSEGPUInterface::get_sparse_blocks_info( nx = opflow->nx; - /* Compute nonzeros for the Jacobian */ - /* Equality constraint Jacobian */ - ierr = (*opflow->modelops.computeequalityconstraintjacobian)( - opflow, opflow->X, opflow->Jac_Ge); - CHKERRQ(ierr); - ierr = MatSetOption(opflow->Jac_Ge, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE); - CHKERRQ(ierr); - - ierr = MatGetInfo(opflow->Jac_Ge, MAT_LOCAL, &info_eq); - CHKERRQ(ierr); - - nnz_sparse_Jaceq = opflow->nnz_eqjacsp = info_eq.nz_used; + /* Use the nnz count pre-computed during model setup + (OPFLOWModelSetUp_PBPOLRAJAHIOPSPARSE). This avoids calling the + PETSc-based Jacobian function, which is no longer wired up. 
*/ + nnz_sparse_Jaceq = opflow->nnz_eqjacsp; /* KS: Use pre-computed nnz_ineqjacsp from model setup (avoids PETSc Mat assembly just for counting non-zeros -- not sure if faster or if it scales diff --git a/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt b/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt index cd2f33b2..0131169b 100644 --- a/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt +++ b/tests/unit/opflow/constraint_jacobian/equality/CMakeLists.txt @@ -1,6 +1,28 @@ if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_CUDA) set_source_files_properties(jac_eq_acopf.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(test_eqjac_compare.cpp PROPERTIES LANGUAGE CUDA) endif() +if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIP) + set_source_files_properties(test_eqjac_compare.cpp PROPERTIES LANGUAGE HIP) +endif() + +add_executable(test_eqjac_compare test_eqjac_compare.cpp) +target_link_libraries(test_eqjac_compare ExaGO::OPFLOW) +target_include_directories( + test_eqjac_compare PRIVATE ${CMAKE_SOURCE_DIR}/tests/unit/opflow +) + +if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIP) + set_source_files_properties(test_eqjac_perf.cpp PROPERTIES LANGUAGE HIP) +endif() +if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_CUDA) + set_source_files_properties(test_eqjac_perf.cpp PROPERTIES LANGUAGE CUDA) +endif() +add_executable(test_eqjac_perf test_eqjac_perf.cpp) +target_link_libraries(test_eqjac_perf ExaGO::OPFLOW) +target_include_directories( + test_eqjac_perf PRIVATE ${CMAKE_SOURCE_DIR}/tests/unit/opflow +) add_executable( jac_eq_acopf jac_eq_acopf.cpp @@ -54,6 +76,23 @@ if(EXAGO_ENABLE_RAJA) list(APPEND dependencies HIOP) endif() +if(EXAGO_RUN_TESTS) + if(EXAGO_ENABLE_RAJA AND EXAGO_ENABLE_HIOP) + set(prefix_net ${CMAKE_SOURCE_DIR}/datafiles/unit/opflow/objective/) + exago_add_test( + NAME + UNIT_TESTS_EQJAC_GPU_VS_PETSC_testx3 + DEPENDS + HIOP_SPARSE + COMMAND + ${RUNCMD} + $ + -netfile + ${prefix_net}testx3.m + ) + endif() +endif() + if(EXAGO_RUN_TESTS) foreach( 
model diff --git a/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_compare.cpp b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_compare.cpp new file mode 100644 index 00000000..c2fd7cb9 --- /dev/null +++ b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_compare.cpp @@ -0,0 +1,304 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#if defined(EXAGO_ENABLE_RAJA) +#include +#include +#include +#endif + +struct TripletEntry { + int row, col; + double val; +}; + +static void computeReferenceJacobian(OPFLOW opflow, Vec X, + std::vector &entries) { + PetscErrorCode ierr; + ierr = (*opflow->modelops.computeequalityconstraintjacobian)(opflow, X, + opflow->Jac_Ge); + + PetscInt nrow, ncol; + ierr = MatGetSize(opflow->Jac_Ge, &nrow, &ncol); + + for (PetscInt i = 0; i < nrow; i++) { + PetscInt nvals; + const PetscInt *cols; + const PetscScalar *vals; + ierr = MatGetRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); + for (PetscInt j = 0; j < nvals; j++) { + entries.push_back({(int)i, (int)cols[j], vals[j]}); + } + ierr = MatRestoreRow(opflow->Jac_Ge, i, &nvals, &cols, &vals); + } +} + +int main(int argc, char **argv) { + PetscErrorCode ierr; + PetscBool flg; + char file_c_str[PETSC_MAX_PATH_LEN]; + std::string file; + char appname[] = "opflow"; + MPI_Comm comm = MPI_COMM_WORLD; + + char help[] = "Compare PETSc vs GPU equality constraint Jacobian\n"; + + ierr = ExaGOInitialize(comm, &argc, &argv, appname, help); + if (ierr) { + fprintf(stderr, "Could not initialize ExaGO.\n"); + return ierr; + } + + ierr = PetscOptionsGetString(NULL, NULL, "-netfile", file_c_str, + PETSC_MAX_PATH_LEN, &flg); + if (!flg) + file = "../datafiles/case9/case9mod.m"; + else + file.assign(file_c_str); + + /* ---------------------------------------------------------------- + * Step 1: Set up OPFLOW with PBPOL model (PETSc path) to get + * the reference Jacobian and the initial guess X. 
+ * ---------------------------------------------------------------- */ + OPFLOW opflow_ref; + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow_ref, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow_ref, OPFLOWMODEL_PBPOL); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow_ref, OPFLOWSOLVER_IPOPT); + CHKERRQ(ierr); + ierr = OPFLOWSetInitializationType(opflow_ref, OPFLOWINIT_FROMFILE); + CHKERRQ(ierr); + ierr = OPFLOWSetUp(opflow_ref); + CHKERRQ(ierr); + + Vec X_ref; + ierr = OPFLOWGetSolution(opflow_ref, &X_ref); + CHKERRQ(ierr); + + std::vector ref_entries; + computeReferenceJacobian(opflow_ref, X_ref, ref_entries); + + /* ---------------------------------------------------------------- + * Step 2: Set up OPFLOW with HIOPSPARSE to exercise the GPU path. + * ---------------------------------------------------------------- */ + OPFLOW opflow_gpu; + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow_gpu); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow_gpu, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow_gpu, OPFLOWMODEL_PBPOLRAJAHIOPSPARSE); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow_gpu, OPFLOWSOLVER_HIOPSPARSEGPU); + CHKERRQ(ierr); + ierr = OPFLOWSetInitializationType(opflow_gpu, OPFLOWINIT_FROMFILE); + CHKERRQ(ierr); +#ifdef EXAGO_ENABLE_GPU + ierr = OPFLOWSetHIOPComputeMode(opflow_gpu, "GPU"); + CHKERRQ(ierr); +#endif + ierr = OPFLOWSetUp(opflow_gpu); + CHKERRQ(ierr); + + /* Get the initial guess vector and map to sparse-dense ordering */ + Vec X_gpu; + ierr = OPFLOWGetSolution(opflow_gpu, &X_gpu); + CHKERRQ(ierr); + + int nx = opflow_gpu->nx; + int nnz_eq = opflow_gpu->nnz_eqjacsp; + + printf("\n"); + printf("============================================================\n"); + printf(" Equality Constraint Jacobian: PETSc vs GPU Comparison\n"); + printf(" Network: %s\n", file.c_str()); + printf(" nx = %d, nconeq = %d, nnz_eqjac(GPU) = %d, nnz_eqjac(PETSc) = %d\n", + nx, 
opflow_gpu->nconeq, nnz_eq, (int)ref_entries.size()); + printf("============================================================\n\n"); + +#if defined(EXAGO_ENABLE_RAJA) + auto &resmgr = umpire::ResourceManager::getInstance(); + umpire::Allocator h_allocator = resmgr.getAllocator("HOST"); + + double *x_host; + ierr = VecGetArray(X_gpu, &x_host); + CHKERRQ(ierr); + + int *iRow, *jCol; + double *values; + iRow = static_cast<int *>(h_allocator.allocate(nnz_eq * sizeof(int))); + jCol = static_cast<int *>(h_allocator.allocate(nnz_eq * sizeof(int))); + values = static_cast<double *>(h_allocator.allocate(nnz_eq * sizeof(double))); + +#ifdef EXAGO_ENABLE_GPU + umpire::Allocator d_allocator = resmgr.getAllocator("DEVICE"); + + double *x_dev = + static_cast<double *>(d_allocator.allocate(nx * sizeof(double))); + int *iRow_dev = + static_cast<int *>(d_allocator.allocate(nnz_eq * sizeof(int))); + int *jCol_dev = + static_cast<int *>(d_allocator.allocate(nnz_eq * sizeof(int))); + double *values_dev = + static_cast<double *>(d_allocator.allocate(nnz_eq * sizeof(double))); + + umpire::util::AllocationRecord record_x{x_host, sizeof(double) * nx, + h_allocator.getAllocationStrategy()}; + resmgr.registerAllocation(x_host, record_x); + resmgr.copy(x_dev, x_host); +#else + double *x_dev = x_host; + int *iRow_dev = iRow; + int *jCol_dev = jCol; + double *values_dev = values; +#endif + + /* Call the sparse eq jac function: first for sparsity, then for values */ + ierr = (*opflow_gpu->modelops.computesparseequalityconstraintjacobianhiop)( + opflow_gpu, x_dev, iRow_dev, jCol_dev, NULL); + CHKERRQ(ierr); + + ierr = (*opflow_gpu->modelops.computesparseequalityconstraintjacobianhiop)( + opflow_gpu, x_dev, NULL, NULL, values_dev); + CHKERRQ(ierr); + +#ifdef EXAGO_ENABLE_GPU + resmgr.copy(iRow, iRow_dev); + resmgr.copy(jCol, jCol_dev); + resmgr.copy(values, values_dev); +#endif + + /* Build a map from (row, col) -> value for the GPU result */ + struct PairHash { + size_t operator()(const std::pair<int, int> &p) const { + return std::hash<long long>()(((long long)p.first << 
32) | p.second); + } + }; + std::unordered_map, double, PairHash> gpu_map; + for (int i = 0; i < nnz_eq; i++) { + auto key = std::make_pair(iRow[i], jCol[i]); + gpu_map[key] += values[i]; + } + + /* ---------------------------------------------------------------- + * Step 3: Compare and print results + * ---------------------------------------------------------------- */ + int n_match = 0, n_mismatch = 0, n_missing_gpu = 0, n_extra_gpu = 0; + double max_abs_err = 0.0, max_rel_err = 0.0; + int worst_row = -1, worst_col = -1; + double worst_ref = 0, worst_gpu = 0; + const double tol = 1e-6; + + printf(" %-8s %-8s %16s %16s %12s %s\n", "Row", "Col", "PETSc (ref)", "GPU", + "AbsErr", "Status"); + printf(" %-8s %-8s %16s %16s %12s %s\n", "---", "---", "-----------", "---", + "------", "------"); + + for (const auto &e : ref_entries) { + auto key = std::make_pair(e.row, e.col); + auto it = gpu_map.find(key); + double gpu_val = 0.0; + bool found = (it != gpu_map.end()); + + if (found) { + gpu_val = it->second; + gpu_map.erase(it); + } + + double abs_err = fabs(gpu_val - e.val); + double rel_err = (fabs(e.val) > 1e-12) ? 
abs_err / fabs(e.val) : abs_err; + const char *status; + + if (!found) { + status = "MISSING"; + n_missing_gpu++; + } else if (abs_err < tol) { + status = "OK"; + n_match++; + } else { + status = "MISMATCH"; + n_mismatch++; + } + + if (abs_err > max_abs_err) { + max_abs_err = abs_err; + max_rel_err = rel_err; + worst_row = e.row; + worst_col = e.col; + worst_ref = e.val; + worst_gpu = gpu_val; + } + + if (abs_err >= tol || !found) { + printf(" %-8d %-8d %16.8e %16.8e %12.2e %s\n", e.row, e.col, e.val, + gpu_val, abs_err, status); + } + } + + n_extra_gpu = (int)gpu_map.size(); + if (n_extra_gpu > 0) { + printf("\n Extra entries in GPU (not in PETSc reference):\n"); + for (const auto &kv : gpu_map) { + printf(" %-8d %-8d %16s %16.8e %12s EXTRA\n", kv.first.first, + kv.first.second, "n/a", kv.second, "n/a"); + } + } + + printf("\n"); + printf("============================================================\n"); + printf(" SUMMARY\n"); + printf("============================================================\n"); + printf(" PETSc nnz: %d\n", (int)ref_entries.size()); + printf(" GPU nnz: %d\n", nnz_eq); + printf(" Matching: %d\n", n_match); + printf(" Mismatched: %d\n", n_mismatch); + printf(" Missing in GPU: %d\n", n_missing_gpu); + printf(" Extra in GPU: %d\n", n_extra_gpu); + printf(" Max absolute err: %.2e at (%d, %d) ref=%.8e gpu=%.8e\n", + max_abs_err, worst_row, worst_col, worst_ref, worst_gpu); + printf(" Max relative err: %.2e\n", max_rel_err); + printf(" Tolerance: %.2e\n", tol); + printf(" RESULT: %s\n", + (n_mismatch == 0 && n_missing_gpu == 0 && n_extra_gpu == 0) ? "PASS" + : "FAIL"); + printf("============================================================\n\n"); + + int result = + (n_mismatch == 0 && n_missing_gpu == 0 && n_extra_gpu == 0) ? 
0 : 1; + + h_allocator.deallocate(iRow); + h_allocator.deallocate(jCol); + h_allocator.deallocate(values); +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(x_dev); + d_allocator.deallocate(iRow_dev); + d_allocator.deallocate(jCol_dev); + d_allocator.deallocate(values_dev); +#endif + + ierr = VecRestoreArray(X_gpu, &x_host); + CHKERRQ(ierr); +#endif // EXAGO_ENABLE_RAJA + + ierr = OPFLOWDestroy(&opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWDestroy(&opflow_gpu); + CHKERRQ(ierr); + + ExaGOFinalize(); + +#if defined(EXAGO_ENABLE_RAJA) + return result; +#else + return 0; +#endif +} diff --git a/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_perf.cpp b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_perf.cpp new file mode 100644 index 00000000..d42d0e53 --- /dev/null +++ b/tests/unit/opflow/constraint_jacobian/equality/test_eqjac_perf.cpp @@ -0,0 +1,242 @@ +#include +#include +#include +#include + +#include +#include +#include + +#if defined(EXAGO_ENABLE_RAJA) +#include +#include +#include +#endif + +#if defined(EXAGO_ENABLE_HIP) +#include +#define GPU_SYNC() hipDeviceSynchronize() +#elif defined(EXAGO_ENABLE_CUDA) +#include +#define GPU_SYNC() cudaDeviceSynchronize() +#else +#define GPU_SYNC() ((void)0) +#endif + +using Clock = std::chrono::high_resolution_clock; +using Ms = std::chrono::duration; + +static double benchmarkPETSc(OPFLOW opflow, Vec X, int niters) { + PetscErrorCode ierr; + PetscScalar *x_arr; + + ierr = VecGetArray(X, &x_arr); + auto t0 = Clock::now(); + for (int iter = 0; iter < niters; iter++) { + ierr = (*opflow->modelops.computeequalityconstraintjacobian)( + opflow, X, opflow->Jac_Ge); + } + auto t1 = Clock::now(); + ierr = VecRestoreArray(X, &x_arr); + + return Ms(t1 - t0).count() / niters; +} + +int main(int argc, char **argv) { + PetscErrorCode ierr; + PetscBool flg; + char file_c_str[PETSC_MAX_PATH_LEN]; + std::string file; + char appname[] = "opflow"; + MPI_Comm comm = MPI_COMM_WORLD; + int niters = 100; + + char help[] = 
"Benchmark PETSc vs GPU equality constraint Jacobian\n"; + + ierr = ExaGOInitialize(comm, &argc, &argv, appname, help); + if (ierr) + return ierr; + + ierr = PetscOptionsGetString(NULL, NULL, "-netfile", file_c_str, + PETSC_MAX_PATH_LEN, &flg); + if (!flg) + file = "../datafiles/case9/case9mod.m"; + else + file.assign(file_c_str); + + ierr = PetscOptionsGetInt(NULL, NULL, "-niters", &niters, &flg); + + /* ---------------------------------------------------------------- + * PETSc path: IPOPT + PBPOL + * ---------------------------------------------------------------- */ + OPFLOW opflow_ref; + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow_ref, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow_ref, OPFLOWMODEL_PBPOL); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow_ref, OPFLOWSOLVER_IPOPT); + CHKERRQ(ierr); + ierr = OPFLOWSetInitializationType(opflow_ref, OPFLOWINIT_FROMFILE); + CHKERRQ(ierr); + ierr = OPFLOWSetUp(opflow_ref); + CHKERRQ(ierr); + + Vec X_ref; + ierr = OPFLOWGetSolution(opflow_ref, &X_ref); + CHKERRQ(ierr); + + /* Warmup */ + benchmarkPETSc(opflow_ref, X_ref, 5); + double petsc_ms = benchmarkPETSc(opflow_ref, X_ref, niters); + + int petsc_nnz = 0; + { + MatInfo info; + ierr = MatGetInfo(opflow_ref->Jac_Ge, MAT_LOCAL, &info); + petsc_nnz = (int)info.nz_used; + } + + /* ---------------------------------------------------------------- + * GPU path: HIOPSPARSEGPU + PBPOLRAJAHIOPSPARSE + * ---------------------------------------------------------------- */ + OPFLOW opflow_gpu; + ierr = OPFLOWCreate(PETSC_COMM_WORLD, &opflow_gpu); + CHKERRQ(ierr); + ierr = OPFLOWReadMatPowerData(opflow_gpu, file.c_str()); + CHKERRQ(ierr); + ierr = OPFLOWSetModel(opflow_gpu, OPFLOWMODEL_PBPOLRAJAHIOPSPARSE); + CHKERRQ(ierr); + ierr = OPFLOWSetSolver(opflow_gpu, OPFLOWSOLVER_HIOPSPARSEGPU); + CHKERRQ(ierr); + ierr = OPFLOWSetInitializationType(opflow_gpu, OPFLOWINIT_FROMFILE); + CHKERRQ(ierr); +#ifdef 
EXAGO_ENABLE_GPU + ierr = OPFLOWSetHIOPComputeMode(opflow_gpu, "GPU"); + CHKERRQ(ierr); +#endif + ierr = OPFLOWSetUp(opflow_gpu); + CHKERRQ(ierr); + + Vec X_gpu; + ierr = OPFLOWGetSolution(opflow_gpu, &X_gpu); + CHKERRQ(ierr); + + int nx = opflow_gpu->nx; + int nnz_eq = opflow_gpu->nnz_eqjacsp; + +#if defined(EXAGO_ENABLE_RAJA) + auto &resmgr = umpire::ResourceManager::getInstance(); + umpire::Allocator h_allocator = resmgr.getAllocator("HOST"); + + double *x_host; + ierr = VecGetArray(X_gpu, &x_host); + CHKERRQ(ierr); + + double *values_dev; + double *x_dev; + +#ifdef EXAGO_ENABLE_GPU + umpire::Allocator d_allocator = resmgr.getAllocator("DEVICE"); + x_dev = static_cast<double *>(d_allocator.allocate(nx * sizeof(double))); + values_dev = + static_cast<double *>(d_allocator.allocate(nnz_eq * sizeof(double))); + + umpire::util::AllocationRecord record_x{x_host, sizeof(double) * nx, + h_allocator.getAllocationStrategy()}; + resmgr.registerAllocation(x_host, record_x); + resmgr.copy(x_dev, x_host); +#else + x_dev = x_host; + values_dev = + static_cast<double *>(h_allocator.allocate(nnz_eq * sizeof(double))); +#endif + + /* Run sparsity phase once (not timed — this is one-time setup) */ + { + int *iRow_dev, *jCol_dev; +#ifdef EXAGO_ENABLE_GPU + iRow_dev = static_cast<int *>(d_allocator.allocate(nnz_eq * sizeof(int))); + jCol_dev = static_cast<int *>(d_allocator.allocate(nnz_eq * sizeof(int))); +#else + iRow_dev = static_cast<int *>(h_allocator.allocate(nnz_eq * sizeof(int))); + jCol_dev = static_cast<int *>(h_allocator.allocate(nnz_eq * sizeof(int))); +#endif + ierr = (*opflow_gpu->modelops.computesparseequalityconstraintjacobianhiop)( + opflow_gpu, x_dev, iRow_dev, jCol_dev, NULL); + CHKERRQ(ierr); +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(iRow_dev); + d_allocator.deallocate(jCol_dev); +#else + h_allocator.deallocate(iRow_dev); + h_allocator.deallocate(jCol_dev); +#endif + } + + /* Warmup the values kernel */ + for (int i = 0; i < 5; i++) { + ierr = (*opflow_gpu->modelops.computesparseequalityconstraintjacobianhiop)( 
+ opflow_gpu, x_dev, NULL, NULL, values_dev); + CHKERRQ(ierr); + } + GPU_SYNC(); + + /* Timed runs */ + auto t0 = Clock::now(); + for (int iter = 0; iter < niters; iter++) { + ierr = (*opflow_gpu->modelops.computesparseequalityconstraintjacobianhiop)( + opflow_gpu, x_dev, NULL, NULL, values_dev); + CHKERRQ(ierr); + } + GPU_SYNC(); + auto t1 = Clock::now(); + double gpu_ms = Ms(t1 - t0).count() / niters; + +#ifdef EXAGO_ENABLE_GPU + d_allocator.deallocate(x_dev); + d_allocator.deallocate(values_dev); +#else + h_allocator.deallocate(values_dev); +#endif + + ierr = VecRestoreArray(X_gpu, &x_host); + CHKERRQ(ierr); +#else + double gpu_ms = 0.0; +#endif + + /* ---------------------------------------------------------------- + * Print results + * ---------------------------------------------------------------- */ + printf("\n"); + printf("================================================================\n"); + printf(" Equality Constraint Jacobian — Performance Comparison\n"); + printf("================================================================\n"); + printf(" Network: %s\n", file.c_str()); + printf(" Buses: %d\n", opflow_gpu->ps->nbus); + printf(" Variables (nx): %d\n", nx); + printf(" Eq constraints: %d\n", opflow_gpu->nconeq); + printf(" nnz (PETSc): %d\n", petsc_nnz); + printf(" nnz (GPU): %d\n", nnz_eq); + printf(" Iterations: %d\n", niters); + printf("----------------------------------------------------------------\n"); + printf(" %-20s %12s %12s\n", "", "PETSc (CPU)", "RAJA (GPU)"); + printf(" %-20s %12s %12s\n", "", "-----------", "----------"); + printf(" %-20s %10.4f ms %10.4f ms\n", "Avg time/call", petsc_ms, gpu_ms); + if (gpu_ms > 0.0) { + double speedup = petsc_ms / gpu_ms; + printf(" %-20s %10s %9.2fx\n", "Speedup", "", speedup); + } + printf( + "================================================================\n\n"); + + ierr = OPFLOWDestroy(&opflow_ref); + CHKERRQ(ierr); + ierr = OPFLOWDestroy(&opflow_gpu); + CHKERRQ(ierr); + + ExaGOFinalize(); + 
return 0; +}