diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index b2cf65a9811239..53e1b46a7d1313 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -4838,6 +4838,10 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // DoPhase(this, PHASE_UNROLL_LOOPS, &Compiler::optUnrollLoops); + // Peel loops + // + DoPhase(this, PHASE_PEEL_LOOPS, &Compiler::optPeelLoops); + // Compute dominators and exceptional entry blocks // DoPhase(this, PHASE_COMPUTE_DOMINATORS, &Compiler::fgComputeDominators); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index ee779d375939d3..2b99515b724185 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6786,10 +6786,15 @@ class Compiler PhaseStatus optCloneLoops(); void optCloneLoop(FlowGraphNaturalLoop* loop, LoopCloneContext* context); + PhaseStatus optUnrollLoops(); // Unrolls loops (needs to have cost info) bool optTryUnrollLoop(FlowGraphNaturalLoop* loop, bool* changedIR); void optRedirectPrevUnrollIteration(FlowGraphNaturalLoop* loop, BasicBlock* prevTestBlock, BasicBlock* target); void optReplaceScalarUsesWithConst(BasicBlock* block, unsigned lclNum, ssize_t cnsVal); + + PhaseStatus optPeelLoops(); + bool optPeelLoop(FlowGraphNaturalLoop* loop); + void optRemoveRedundantZeroInits(); PhaseStatus optIfConversion(); // If conversion @@ -9966,6 +9971,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX STRESS_MODE(DBL_ALN) \ STRESS_MODE(LCL_FLDS) \ STRESS_MODE(UNROLL_LOOPS) \ + STRESS_MODE(PEEL_LOOPS) \ STRESS_MODE(MAKE_CSE) \ STRESS_MODE(LEGACY_INLINE) \ STRESS_MODE(CLONE_EXPR) \ diff --git a/src/coreclr/jit/compmemkind.h b/src/coreclr/jit/compmemkind.h index 835d85f798d29b..96f18238471a12 100644 --- a/src/coreclr/jit/compmemkind.h +++ b/src/coreclr/jit/compmemkind.h @@ -50,6 +50,7 @@ CompMemKindMacro(LoopOpt) CompMemKindMacro(LoopClone) CompMemKindMacro(LoopUnroll) CompMemKindMacro(LoopHoist) +CompMemKindMacro(LoopPeel) CompMemKindMacro(Unknown) CompMemKindMacro(RangeCheck) CompMemKindMacro(CopyProp) diff --git a/src/coreclr/jit/compphases.h b/src/coreclr/jit/compphases.h index 23930985319769..c84fe5b97777ff 100644 --- a/src/coreclr/jit/compphases.h +++ b/src/coreclr/jit/compphases.h @@ -71,6 +71,7 @@ CompPhaseNameMacro(PHASE_ZERO_INITS, "Redundant zero Inits", CompPhaseNameMacro(PHASE_FIND_LOOPS, "Find loops", false, -1, false) CompPhaseNameMacro(PHASE_CLONE_LOOPS, "Clone loops", false, -1, false) CompPhaseNameMacro(PHASE_UNROLL_LOOPS, "Unroll loops", false, -1, false) +CompPhaseNameMacro(PHASE_PEEL_LOOPS, "Peel loops", false, -1, false) CompPhaseNameMacro(PHASE_MORPH_MDARR, "Morph array ops", false, -1, false) CompPhaseNameMacro(PHASE_HOIST_LOOP_CODE, "Hoist loop code", false, -1, false) CompPhaseNameMacro(PHASE_MARK_LOCAL_VARS, "Mark local vars", false, -1, false) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index fc3e2f30a3d23c..2ae5f750d0f99b 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -465,6 +465,7 @@ CONFIG_STRING(JitOnlyOptimizeRange, W("JitOnlyOptimizeRange")) // If set, all methods that do _not_ match are forced into MinOpts CONFIG_STRING(JitEnablePhysicalPromotionRange, W("JitEnablePhysicalPromotionRange")) CONFIG_STRING(JitEnableCrossBlockLocalAssertionPropRange, W("JitEnableCrossBlockLocalAssertionPropRange")) +CONFIG_STRING(JitEnableLoopPeelingRange, W("JitEnableLoopPeelingRange")) CONFIG_INTEGER(JitDoSsa, W("JitDoSsa"), 1) // Perform Static Single Assignment (SSA) numbering on the variables CONFIG_INTEGER(JitDoValueNumber, W("JitDoValueNumber"), 1) // Perform value numbering on method expressions @@ -659,6 +660,8 @@ CONFIG_INTEGER(JitEnableHeadTailMerge, W("JitEnableHeadTailMerge"), 1) // Enable physical promotion CONFIG_INTEGER(JitEnablePhysicalPromotion, W("JitEnablePhysicalPromotion"), 1) +CONFIG_INTEGER(JitEnableLoopPeeling, W("JitEnableLoopPeeling"), 0) + // Enable cross-block local assertion prop CONFIG_INTEGER(JitEnableCrossBlockLocalAssertionProp, W("JitEnableCrossBlockLocalAssertionProp"), 1) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 26b35a4d3d1a42..81e3161b7eb829 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -1794,6 +1794,134 @@ void Compiler::optReplaceScalarUsesWithConst(BasicBlock* block, unsigned lclNum, } } +//----------------------------------------------------------------------------- +// optPeelLoops: Peel loops by duplicating the loop body once. +// +// Returns: +// Suitable phase status. +// +PhaseStatus Compiler::optPeelLoops() +{ + if (m_loops->NumLoops() == 0) + { + return PhaseStatus::MODIFIED_NOTHING; + } + +// if ((JitConfig.JitEnableLoopPeeling() == 0) && !compStressCompile(STRESS_PEEL_LOOPS, 25)) +//{ +// return PhaseStatus::MODIFIED_NOTHING; +//} + +#ifdef DEBUG + static ConfigMethodRange s_range; + s_range.EnsureInit(JitConfig.JitEnableLoopPeelingRange()); + + if (!s_range.Contains(info.compMethodHash())) + { + return PhaseStatus::MODIFIED_NOTHING; + } +#endif + + unsigned numPeeled = 0; + for (FlowGraphNaturalLoop* loop : m_loops->InReversePostOrder()) + { + if (loop->GetParent() != nullptr) + { + continue; + } + + if (optPeelLoop(loop)) + { + numPeeled++; + } + } + + JITDUMP("Peeled %u loops\n", numPeeled); + if (numPeeled == 0) + { + return PhaseStatus::MODIFIED_NOTHING; + } + + fgInvalidateDfsTree(); + m_dfsTree = fgComputeDfs(); + m_loops = FlowGraphNaturalLoops::Find(m_dfsTree); + fgRenumberBlocks(); + + return PhaseStatus::MODIFIED_EVERYTHING; +} + +//----------------------------------------------------------------------------- +// optPeelLoop: Peel the specified loop by duplicating its loop body once +// before the loop. +// +// Returns: +// True if the loop was peeled and the flow graph was changed; otherwise false. +// +bool Compiler::optPeelLoop(FlowGraphNaturalLoop* loop) +{ + JITDUMP("Considering peeling "); + DBEXEC(verbose, FlowGraphNaturalLoop::Dump(loop)); + + BasicBlock* preheader = loop->EntryEdge(0)->getSourceBlock(); + + INDEBUG(const char* reason); + if (!loop->CanDuplicate(INDEBUG(&reason))) + { + JITDUMP(" Cannot peel: %s\n", reason); + return false; + } + + if (!BasicBlock::sameEHRegion(preheader, loop->GetHeader())) + { + JITDUMP(" Cannot peel: preheader and header are in different EH regions\n"); + return false; + } + + // Make a new pre-header block for the fast loop. + JITDUMP("Create new preheader block for fast loop\n"); + + BasicBlock* newPreheader = + fgNewBBafter(BBJ_ALWAYS, preheader, /*extendRegion*/ true, /*jumpDest*/ loop->GetHeader()); + JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", newPreheader->bbNum, preheader->bbNum); + newPreheader->inheritWeight(preheader); + newPreheader->SetFlags(BBF_LOOP_PREHEADER); + + if (newPreheader->JumpsToNext()) + { + newPreheader->SetFlags(BBF_NONE_QUIRK); + } + + fgAddRefPred(loop->GetHeader(), newPreheader); + + assert(preheader->KindIs(BBJ_ALWAYS)); + assert(preheader->TargetIs(loop->GetHeader())); + + preheader->RemoveFlags(BBF_LOOP_PREHEADER); + + // Now duplicate the loop blocks after the old preeheader but before the + // new preheader. This will be the singly peeled iteration. + BasicBlock* insertAfter = preheader; + BlockToBlockMap blockMap(getAllocator(CMK_LoopPeel)); + weight_t scale = 1; + if (!loop->GetHeader()->isRunRarely() && !fgProfileWeightsEqual(loop->GetHeader()->getBBWeight(this), 0)) + { + scale = preheader->getBBWeight(this) / loop->GetHeader()->getBBWeight(this); + } + + loop->Duplicate(&insertAfter, &blockMap, scale, /* bottomNeedsRedirection */ true); + + // Redirect all backedges to the new preheader we created. This removes the + // loop structure of the duplicate. + for (FlowEdge* backedge : loop->BackEdges()) + { + fgReplaceJumpTarget(blockMap[backedge->getSourceBlock()], newPreheader, blockMap[loop->GetHeader()]); + } + + // Finally, the old preheader now jumps to the first peeled iteration. + fgReplaceJumpTarget(preheader, blockMap[loop->GetHeader()], loop->GetHeader()); + return true; +} + Compiler::OptInvertCountTreeInfoType Compiler::optInvertCountTreeInfo(GenTree* tree) { class CountTreeInfoVisitor : public GenTreeVisitor