From 31303a958d24d605ed88c0d62d9ff69af3ae5b74 Mon Sep 17 00:00:00 2001
From: Bruce Forstall
Date: Thu, 16 Dec 2021 17:21:59 -0800
Subject: [PATCH] Simplify alignment block marking

Currently, loops that are candidates for alignment are determined immediately
after loop recognition. The top block of these loops is marked by setting the
`BBF_LOOP_ALIGN` flag. Later, just before codegen, the
`placeLoopAlignInstructions` phase is called to mark the blocks where the
actual align instructions will be placed. Between these two phases, the
`BBF_LOOP_ALIGN` flag needs to be maintained through various optimization
phases.

We can simplify this by deferring marking the loops to align until we call the
`placeLoopAlignInstructions` phase. Then, we don't need to worry about
maintaining the `BBF_LOOP_ALIGN` flag. The loop table is still valid at that
point, so it can be used. (Note that if we ever want to "kill" the loop table
earlier in compilation, we could simply move the `BBF_LOOP_ALIGN` marking to
that point, and still avoid all the flag maintenance.) This PR makes that
change.

To avoid too many diffs, `DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT` is reduced from
4 to 3. This is because many loops (without profile data) end up with a weight
of 4, and if those loops are cloned, the weight of the hot path gets reduced
to 3.96 (so the cold path gets at least some non-zero weight). We still want
these hot-path cloned loops to be aligned. However, decreasing this threshold
does cause some more ASP.NET loops with profile data and weights between 3 and
4 to be aligned.

Additionally, I added a trivial phase `optClearLoopIterInfo` to clear out all
the loop table info related to `LPFLG_ITER`. This data is used by loop cloning
and unrolling, but not afterwards. I still want to be able to dump the loop
table without hitting asserts about improper `LPFLG_ITER` form data, and I
don't want downstream phases to take a dependency on this data, so clearing it
out enables that.

There are a few diffs where we align more loops even without profile data
because the weight of loop top blocks increases due to various flow
optimizations (like block compaction), and now we see the larger weight before
making the alignment decision.
---
 src/coreclr/jit/block.cpp       |  19 ------
 src/coreclr/jit/block.h         |   2 -
 src/coreclr/jit/compiler.cpp    | 107 +++++++++++++++++++++++++++++---
 src/coreclr/jit/compiler.h      |   8 ++-
 src/coreclr/jit/compphases.h    |   1 +
 src/coreclr/jit/fgbasic.cpp     |  11 ----
 src/coreclr/jit/fgopt.cpp       |  10 ---
 src/coreclr/jit/loopcloning.cpp |  13 +---
 src/coreclr/jit/morph.cpp       |   2 -
 src/coreclr/jit/optimizer.cpp   |  88 +++++++------------------
 10 files changed, 126 insertions(+), 135 deletions(-)

diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp
index 9073d3dc50fe6b..2c1ccdfda21e43 100644
--- a/src/coreclr/jit/block.cpp
+++ b/src/coreclr/jit/block.cpp
@@ -1685,22 +1685,3 @@ BBswtDesc::BBswtDesc(Compiler* comp, const BBswtDesc* other)
         bbsDstTab[i] = other->bbsDstTab[i];
     }
 }
-
-//------------------------------------------------------------------------
-// unmarkLoopAlign: Unmarks the LOOP_ALIGN flag from the block and reduce the
-// loop alignment count.
-//
-// Arguments:
-//    compiler - Compiler instance
-//    reason - Reason to print in JITDUMP
-//
-void BasicBlock::unmarkLoopAlign(Compiler* compiler DEBUG_ARG(const char* reason))
-{
-    // Make sure we unmark and count just once.
-    if (isLoopAlign())
-    {
-        compiler->loopAlignCandidates--;
-        bbFlags &= ~BBF_LOOP_ALIGN;
-        JITDUMP("Unmarking LOOP_ALIGN from " FMT_BB ". Reason= %s.\n", bbNum, reason);
-    }
-}
diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h
index 81a511f4bceed6..f60d304ec38932 100644
--- a/src/coreclr/jit/block.h
+++ b/src/coreclr/jit/block.h
@@ -657,8 +657,6 @@ struct BasicBlock : private LIR::Range
         return ((bbFlags & BBF_LOOP_ALIGN) != 0);
     }

-    void unmarkLoopAlign(Compiler* comp DEBUG_ARG(const char* reason));
-
     bool hasAlign() const
     {
         return ((bbFlags & BBF_HAS_ALIGN) != 0);
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 56701ceb08fad3..2a2461e914cd0f 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -4776,6 +4776,10 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
         // Unroll loops
         //
         DoPhase(this, PHASE_UNROLL_LOOPS, &Compiler::optUnrollLoops);
+
+        // Clear loop table info that is not used after this point, and might become invalid.
+        //
+        DoPhase(this, PHASE_CLEAR_LOOP_INFO, &Compiler::optClearLoopIterInfo);
     }

 #ifdef DEBUG
@@ -5088,6 +5092,77 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl

 #if FEATURE_LOOP_ALIGN

+//------------------------------------------------------------------------
+// optIdentifyLoopsForAlignment: Determine which loops should be considered for alignment.
+//
+// All innermost loops whose block weight meets a threshold are candidates for alignment.
+// The `top` block of the loop is marked with the BBF_LOOP_ALIGN flag to indicate this
+// (the loop table itself is not changed).
+//
+// Depends on the loop table, and on block weights being set.
+//
+// Sets `loopAlignCandidates` to the number of loop candidates for alignment.
+//
+void Compiler::optIdentifyLoopsForAlignment()
+{
+    assert(codeGen->ShouldAlignLoops());
+
+    loopAlignCandidates = 0;
+
+    for (BasicBlock::loopNumber loopInd = 0; loopInd < optLoopCount; loopInd++)
+    {
+        const LoopDsc& loop = optLoopTable[loopInd];
+
+        if (loop.lpFlags & LPFLG_REMOVED)
+        {
+            // Don't need JitDump output for this common case.
+            continue;
+        }
+
+        if (loop.lpChild != BasicBlock::NOT_IN_LOOP)
+        {
+            JITDUMP("Skipping alignment for " FMT_LP "; not an innermost loop\n", loopInd);
+            continue;
+        }
+
+        if (loop.lpTop == fgFirstBB)
+        {
+            // Adding align instruction in prolog is not supported. (This currently seems unlikely since
+            // loops normally (always?) have a predecessor `head` block.)
+            // TODO: Insert an empty block before the loop, if we want to align it, so we have a place to put
+            // the align instruction.
+            JITDUMP("Skipping alignment for " FMT_LP "; loop starts in first block\n", loopInd);
+            continue;
+        }
+
+        if (loop.lpFlags & LPFLG_CONTAINS_CALL)
+        {
+            // Heuristic: it is not valuable to align loops with calls.
+            JITDUMP("Skipping alignment for " FMT_LP "; loop contains call\n", loopInd);
+            continue;
+        }
+
+        // Now we have an inner loop candidate that might need alignment
+
+        BasicBlock* top           = loop.lpTop;
+        weight_t    topWeight     = top->getBBWeight(this);
+        weight_t    compareWeight = opts.compJitAlignLoopMinBlockWeight * BB_UNITY_WEIGHT;
+        if (topWeight >= compareWeight)
+        {
+            loopAlignCandidates++;
+            assert(!top->isLoopAlign());
+            top->bbFlags |= BBF_LOOP_ALIGN;
+            JITDUMP("Aligning " FMT_LP " that starts at " FMT_BB ", weight=" FMT_WT " >= " FMT_WT ".\n", loopInd,
+                    top->bbNum, topWeight, compareWeight);
+        }
+        else
+        {
+            JITDUMP("Skipping alignment for " FMT_LP " that starts at " FMT_BB ", weight=" FMT_WT " < " FMT_WT ".\n",
+                    loopInd, top->bbNum, topWeight, compareWeight);
+        }
+    }
+}
+
 //------------------------------------------------------------------------
 // placeLoopAlignInstructions: Iterate over all the blocks and determine
 // the best position to place the 'align' instruction. Inserting 'align'
@@ -5100,26 +5175,39 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
 //
 void Compiler::placeLoopAlignInstructions()
 {
+#ifdef DEBUG
+    if (verbose)
+    {
+        printf("*************** In placeLoopAlignInstructions()\n");
+    }
+#endif
+
+    if (!codeGen->ShouldAlignLoops())
+    {
+        JITDUMP("Not aligning loops; ShouldAlignLoops is false\n");
+        return;
+    }
+
+    // Print out the current loop table that we're working on.
+    DBEXEC(verbose, optPrintLoopTable());
+
+    assert(loopAlignCandidates == 0); // We currently expect nobody has touched this yet.
+    optIdentifyLoopsForAlignment();
+
     if (loopAlignCandidates == 0)
     {
+        JITDUMP("No loop candidates\n");
         return;
     }

     int loopsToProcess = loopAlignCandidates;

-    JITDUMP("Inside placeLoopAlignInstructions for %d loops.\n", loopAlignCandidates);
+    JITDUMP("Considering %d loops\n", loopAlignCandidates);

     // Add align only if there were any loops that needed alignment
     weight_t               minBlockSoFar         = BB_MAX_WEIGHT;
     BasicBlock*            bbHavingAlign         = nullptr;
     BasicBlock::loopNumber currentAlignedLoopNum = BasicBlock::NOT_IN_LOOP;

-    if ((fgFirstBB != nullptr) && fgFirstBB->isLoopAlign())
-    {
-        // Adding align instruction in prolog is not supported
-        // hence just remove that loop from our list.
-        loopsToProcess--;
-    }
-
     for (BasicBlock* const block : Blocks())
     {
         if (currentAlignedLoopNum != BasicBlock::NOT_IN_LOOP)
         {
@@ -5178,7 +5266,8 @@ void Compiler::placeLoopAlignInstructions()

     assert(loopsToProcess == 0);
 }

-#endif
+
+#endif // FEATURE_LOOP_ALIGN

 //------------------------------------------------------------------------
 // generatePatchpointInfo: allocate and fill in patchpoint info data,
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index ea29174e057691..0b010b5501024f 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -3670,6 +3670,8 @@ class Compiler
 #endif

     BasicBlock* bbNewBasicBlock(BBjumpKinds jumpKind);
+
+    void optIdentifyLoopsForAlignment();
     void placeLoopAlignInstructions();

     /*
@@ -6987,6 +6989,8 @@ class Compiler
                               BasicBlock*   exit,
                               unsigned char exitCnt);

+    void optClearLoopIterInfo();
+
 #ifdef DEBUG
     void optPrintLoopInfo(unsigned lnum, bool printVerbose = false);
     void optPrintLoopInfo(const LoopDsc* loop, bool printVerbose = false);
@@ -7025,8 +7029,6 @@ class Compiler

     void optFindNaturalLoops();

-    void optIdentifyLoopsForAlignment();
-
     // Ensures that all the loops in the loop nest rooted at "loopInd" (an index into the loop table) are 'canonical' --
     // each loop has a unique "top."  Returns "true" iff the flowgraph has been modified.
     bool optCanonicalizeLoopNest(unsigned char loopInd);
@@ -9754,7 +9756,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 // based on experimenting with various benchmarks.

 // Default minimum loop block weight required to enable loop alignment.
-#define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 4
+#define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 3

 // By default a loop will be aligned at 32B address boundary to get better
 // performance as per architecture manuals.
diff --git a/src/coreclr/jit/compphases.h b/src/coreclr/jit/compphases.h
index e5c15c1d438a6c..f0fd12a2036838 100644
--- a/src/coreclr/jit/compphases.h
+++ b/src/coreclr/jit/compphases.h
@@ -61,6 +61,7 @@ CompPhaseNameMacro(PHASE_ZERO_INITS, "Redundant zero Inits",
 CompPhaseNameMacro(PHASE_FIND_LOOPS, "Find loops", "LOOP-FND", false, -1, false)
 CompPhaseNameMacro(PHASE_CLONE_LOOPS, "Clone loops", "LP-CLONE", false, -1, false)
 CompPhaseNameMacro(PHASE_UNROLL_LOOPS, "Unroll loops", "UNROLL", false, -1, false)
+CompPhaseNameMacro(PHASE_CLEAR_LOOP_INFO, "Clear loop info", "LP-CLEAR", false, -1, false)
 CompPhaseNameMacro(PHASE_HOIST_LOOP_CODE, "Hoist loop code", "LP-HOIST", false, -1, false)
 CompPhaseNameMacro(PHASE_MARK_LOCAL_VARS, "Mark local vars", "MARK-LCL", false, -1, false)
 CompPhaseNameMacro(PHASE_OPTIMIZE_BOOLS, "Optimize bools", "OPT-BOOL", false, -1, false)
diff --git a/src/coreclr/jit/fgbasic.cpp b/src/coreclr/jit/fgbasic.cpp
index 6b41c055422b0b..c8ed81af7b986b 100644
--- a/src/coreclr/jit/fgbasic.cpp
+++ b/src/coreclr/jit/fgbasic.cpp
@@ -4712,14 +4712,6 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable)
         {
             succBlock->bbFlags |= BBF_LOOP_HEAD;

-            if (block->isLoopAlign())
-            {
-                loopAlignCandidates++;
-                succBlock->bbFlags |= BBF_LOOP_ALIGN;
-                JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " for " FMT_LP "\n ", block->bbNum,
-                        succBlock->bbNum, block->bbNatLoopNum);
-            }
-
             if (fgDomsComputed && fgReachable(succBlock, block))
             {
                 // Mark all the reachable blocks between 'succBlock' and 'bPrev'
@@ -4867,9 +4859,6 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable)
         block->bbFlags |= BBF_REMOVED;
     }

-    // If this was marked for alignment, remove it
-    block->unmarkLoopAlign(this DEBUG_ARG("Removed block"));
-
     if (bPrev != nullptr)
     {
         switch (bPrev->bbJumpKind)
diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp
index 4e446837bcb189..e05ebbcb44ef26 100644
--- a/src/coreclr/jit/fgopt.cpp
+++ b/src/coreclr/jit/fgopt.cpp
@@ -2147,13 +2147,6 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext)
                 break;
         }

-        if (bNext->isLoopAlign())
-        {
-            block->bbFlags |= BBF_LOOP_ALIGN;
-            JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " during compacting.\n", bNext->bbNum,
-                    block->bbNum);
-        }
-
         // If we're collapsing a block created after the dominators are
         // computed, copy block number the block and reuse dominator
         // information from bNext to block.
@@ -5993,9 +5986,6 @@ bool Compiler::fgUpdateFlowGraph(bool doTailDuplication)
                 // Update the loop table if we removed the bottom of a loop, for example.
                 fgUpdateLoopsAfterCompacting(block, bNext);

-                // If this block was aligned, unmark it
-                bNext->unmarkLoopAlign(this DEBUG_ARG("Optimized jump"));
-
                 // If this is the first Cold basic block update fgFirstColdBlock
                 if (bNext == fgFirstColdBlock)
                 {
diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp
index bdd8d763aa4c7e..24243451c02162 100644
--- a/src/coreclr/jit/loopcloning.cpp
+++ b/src/coreclr/jit/loopcloning.cpp
@@ -1956,18 +1956,7 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context)
         newBlk->scaleBBWeight(slowPathWeightScaleFactor);
         blk->scaleBBWeight(fastPathWeightScaleFactor);

-// TODO: scale the pred edges of `blk`?
-
-#if FEATURE_LOOP_ALIGN
-        // If the original loop is aligned, do not align the cloned loop because cloned loop will be executed in
-        // rare scenario. Additionally, having to align cloned loop will force us to disable some VEX prefix encoding
-        // and adding compensation for over-estimated instructions.
-        if (blk->isLoopAlign())
-        {
-            newBlk->bbFlags &= ~BBF_LOOP_ALIGN;
-            JITDUMP("Removing LOOP_ALIGN flag from cloned loop in " FMT_BB "\n", newBlk->bbNum);
-        }
-#endif
+        // TODO: scale the pred edges of `blk`?

         // TODO-Cleanup: The above clones the bbNatLoopNum, which is incorrect. Eventually, we should probably insert
         // the cloned loop in the loop table. For now, however, we'll just make these blocks be part of the surrounding
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 138e46358065e2..a11f568f431b28 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -15366,8 +15366,6 @@ bool Compiler::fgFoldConditional(BasicBlock* block)

                 optMarkLoopRemoved(loopNum);

-                optLoopTable[loopNum].lpTop->unmarkLoopAlign(this DEBUG_ARG("Bogus loop"));
-
 #ifdef DEBUG
                 if (verbose)
                 {
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index a6c9ea4d85c689..dde519f18e8395 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -382,8 +382,6 @@ void Compiler::optUnmarkLoopBlocks(BasicBlock* begBlk, BasicBlock* endBlk)
     }

     JITDUMP("\n");
-
-    begBlk->unmarkLoopAlign(this DEBUG_ARG("Removed loop"));
 }

 /*****************************************************************************************************
@@ -592,6 +590,27 @@ void Compiler::optUpdateLoopsBeforeRemoveBlock(BasicBlock* block, bool skipUnmar
     }
 }

+//------------------------------------------------------------------------
+// optClearLoopIterInfo: Clear the info related to LPFLG_ITER loops in the loop table.
+// The various fields related to iterators are known to be valid for loop cloning and unrolling,
+// but become invalid afterwards. Clear the info that might be used incorrectly afterwards
+// in JitDump or by subsequent phases.
+//
+void Compiler::optClearLoopIterInfo()
+{
+    for (unsigned lnum = 0; lnum < optLoopCount; lnum++)
+    {
+        LoopDsc& loop = optLoopTable[lnum];
+        loop.lpFlags &= ~(LPFLG_ITER | LPFLG_VAR_INIT | LPFLG_CONST_INIT | LPFLG_SIMD_LIMIT | LPFLG_VAR_LIMIT |
+                          LPFLG_CONST_LIMIT | LPFLG_ARRLEN_LIMIT);
+
+        loop.lpIterTree  = nullptr;
+        loop.lpInitBlock = nullptr;
+        loop.lpConstInit = -1; // union with loop.lpVarInit
+        loop.lpTestTree  = nullptr;
+    }
+}
+
 #ifdef DEBUG

 /*****************************************************************************
@@ -2598,50 +2617,6 @@ void Compiler::optFindNaturalLoops()
 #endif // DEBUG
 }

-//------------------------------------------------------------------------
-// optIdentifyLoopsForAlignment: Determine which loops should be considered for alignment.
-//
-// All innermost loops whose block weight meets a threshold are candidates for alignment.
-// The `first` block of the loop is marked with the BBF_LOOP_ALIGN flag to indicate this
-// (the loop table itself is not changed).
-//
-// Depends on the loop table, and on block weights being set.
-//
-void Compiler::optIdentifyLoopsForAlignment()
-{
-#if FEATURE_LOOP_ALIGN
-    if (codeGen->ShouldAlignLoops())
-    {
-        for (BasicBlock::loopNumber loopInd = 0; loopInd < optLoopCount; loopInd++)
-        {
-            // An innerloop candidate that might need alignment
-            if (optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP)
-            {
-                BasicBlock* top       = optLoopTable[loopInd].lpTop;
-                weight_t    topWeight = top->getBBWeight(this);
-                if (topWeight >= (opts.compJitAlignLoopMinBlockWeight * BB_UNITY_WEIGHT))
-                {
-                    // Sometimes with JitOptRepeat > 1, we might end up finding the loops twice. In such
-                    // cases, make sure to count them just once.
-                    if (!top->isLoopAlign())
-                    {
-                        loopAlignCandidates++;
-                        top->bbFlags |= BBF_LOOP_ALIGN;
-                        JITDUMP(FMT_LP " that starts at " FMT_BB " needs alignment, weight=" FMT_WT ".\n", loopInd,
-                                top->bbNum, top->getBBWeight(this));
-                    }
-                }
-                else
-                {
-                    JITDUMP("Skip alignment for " FMT_LP " that starts at " FMT_BB " weight=" FMT_WT ".\n", loopInd,
-                            top->bbNum, topWeight);
-                }
-            }
-        }
-    }
-#endif
-}
-
 //------------------------------------------------------------------------
 // optRedirectBlock: Replace the branch successors of a block based on a block map.
 //
@@ -3912,13 +3887,6 @@ PhaseStatus Compiler::optUnrollLoops()
 #endif
         }

-#if FEATURE_LOOP_ALIGN
-        for (BasicBlock* const block : loop.LoopBlocks())
-        {
-            block->unmarkLoopAlign(this DEBUG_ARG("Unrolled loop"));
-        }
-#endif
-
         // Create the unrolled loop statement list.
         {
             // When unrolling a loop, that loop disappears (and will be removed from the loop table). Each unrolled
@@ -5031,7 +4999,6 @@ void Compiler::optFindLoops()
     {
         optFindNaturalLoops();
         optFindAndScaleGeneralLoopBlocks();
-        optIdentifyLoopsForAlignment(); // Check if any of the loops need alignment
     }

 #ifdef DEBUG
@@ -8023,19 +7990,6 @@ bool Compiler::optComputeLoopSideEffectsOfBlock(BasicBlock* blk)
 // Marks the containsCall information to "lnum" and any parent loops.
 void Compiler::AddContainsCallAllContainingLoops(unsigned lnum)
 {
-
-#if FEATURE_LOOP_ALIGN
-    // If this is the inner most loop, reset the LOOP_ALIGN flag
-    // because a loop having call will not likely to benefit from
-    // alignment
-    if (optLoopTable[lnum].lpChild == BasicBlock::NOT_IN_LOOP)
-    {
-        BasicBlock* top = optLoopTable[lnum].lpTop;
-
-        top->unmarkLoopAlign(this DEBUG_ARG("Loop with call"));
-    }
-#endif
-
     assert(0 <= lnum && lnum < optLoopCount);
     while (lnum != BasicBlock::NOT_IN_LOOP)
     {
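
Editor's note: the short, standalone C++ sketch below is not part of the patch. It only illustrates the weight-threshold reasoning from the commit message above (a cloned loop's hot path drops from weight 4 to 3.96, so it passes a minimum block weight of 3 but not 4). The 1.0 stand-in for the BB_UNITY_WEIGHT scaling and the 0.99 fast-path factor are simplified assumptions for illustration, not the JIT's actual types or values.

// Illustrative sketch only: mimics the shape of the check in
// optIdentifyLoopsForAlignment(),
//     topWeight >= opts.compJitAlignLoopMinBlockWeight * BB_UNITY_WEIGHT
// using plain doubles and assumed values.
#include <cstdio>

int main()
{
    const double unityWeight     = 1.0;        // assumed stand-in for BB_UNITY_WEIGHT scaling
    const double clonedHotWeight = 4.0 * 0.99; // hot path of a cloned weight-4 loop: 3.96

    const double thresholds[] = {4.0, 3.0};    // old vs. new DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT
    for (double minBlockWeight : thresholds)
    {
        const bool candidate = clonedHotWeight >= (minBlockWeight * unityWeight);
        printf("min block weight %.0f: cloned hot-path loop (weight %.2f) %s an alignment candidate\n",
               minBlockWeight, clonedHotWeight, candidate ? "remains" : "is not");
    }
    return 0;
}

With a threshold of 4 the 3.96-weight hot path is skipped; with 3 it remains a candidate, which is the behavior the reduced DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT is meant to preserve.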