From 8db20710469e154c19e75911672c67eb75312fe5 Mon Sep 17 00:00:00 2001
From: Bruce Forstall
Date: Mon, 26 Apr 2021 17:01:21 -0700
Subject: [PATCH 1/2] Scale cloned loop block weights

The slow path of a cloned loop is scaled to keep only 1% of the original
block weights, while the original (fast path) loop keeps 99% of the weight.
It could be argued that the cloned loop slow path, as currently implemented,
should only execute on exceptional paths, and thus should be marked
"run rarely", but don't do that for now.

There are many diffs, due to weights changing. Mostly, there are fewer CSEs
in cloned loop slow paths. E.g., the aspnet SPMI diffs are included below.

Also, add a header comment for `optRedirectBlock` (a follow-up to a previous
change). In addition, I didn't like the semantics of the `addPreds` flag --
it didn't match well with the rest of the function -- so I removed it and
made the appropriate changes in `optCloneLoop` (the only caller that used it)
to compensate.

I added code to `optEnsureUniqueHead`, only used by loop cloning, to add and
update the appropriate pred edges. It turns out, however, that this function
is currently dead code, due to other limitations in loop cloning. I'll leave
the changes anyway, in case those limitations are removed later. I verified
that this part of the change is no-diff (and that the code wasn't even
executed in our SPMI collections).

```
Summary of Code Size diffs:
(Lower is better)

Total bytes of base: 18811
Total bytes of diff: 18749
Total bytes of delta: -62 (-0.33% of base)

diff is an improvement.
```
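For illustration, here is a minimal self-contained sketch of the weight-split
arithmetic, not the JIT's actual code: `Block` and `scaleWeight` below are
hypothetical stand-ins for `BasicBlock` and `scaleBBWeight`.

```
#include <algorithm>
#include <cstdio>

// Hypothetical stand-in for the JIT's BasicBlock; only the weight matters here.
struct Block
{
    float bbWeight;

    // Multiply the block's profile weight by a scale factor, clamping at zero,
    // mirroring what the patch does via BasicBlock::scaleBBWeight.
    void scaleWeight(float scale)
    {
        bbWeight = std::max(0.0f, bbWeight * scale);
    }
};

int main()
{
    const float fastPathWeightScaleFactor = 0.99f;
    const float slowPathWeightScaleFactor = 1.0f - fastPathWeightScaleFactor;

    Block orig  = {100.0f}; // a block in the loop being cloned
    Block clone = orig;     // the slow path copy starts at the same weight

    orig.scaleWeight(fastPathWeightScaleFactor);  // fast path keeps 99%
    clone.scaleWeight(slowPathWeightScaleFactor); // slow path keeps 1%

    printf("fast: %.2f, slow: %.2f\n", orig.bbWeight, clone.bbWeight);
    // prints: fast: 99.00, slow: 1.00
}
```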
Detail diffs

```
Top file regressions (bytes):
          23 : 19019.dasm (9.91% of base)
          23 : 38588.dasm (9.91% of base)
           8 : 39269.dasm (1.15% of base)
           3 : 19897.dasm (0.94% of base)
           2 : 40501.dasm (1.43% of base)

Top file improvements (bytes):
         -37 : 13173.dasm (-2.41% of base)
         -37 : 14302.dasm (-2.41% of base)
         -17 : 31284.dasm (-0.84% of base)
         -17 : 34351.dasm (-0.83% of base)
          -5 : 31157.dasm (-0.25% of base)
          -5 : 34245.dasm (-0.25% of base)
          -3 : 38167.dasm (-0.49% of base)

12 total files with Code Size differences (7 improved, 5 regressed), 4 unchanged.

Top method regressions (bytes):
          23 ( 9.91% of base) : 19019.dasm - ObjectEqualityComparer`1:IndexOf(ref,__Canon,int,int):int:this
          23 ( 9.91% of base) : 38588.dasm - ObjectEqualityComparer`1:IndexOf(ref,__Canon,int,int):int:this
           8 ( 1.15% of base) : 39269.dasm - RuntimeMethodInfo:MakeGenericMethod(ref):MethodInfo:this
           3 ( 0.94% of base) : 19897.dasm - ControllerActionInvoker:PrepareArguments(IDictionary`2,ObjectMethodExecutor):ref
           2 ( 1.43% of base) : 40501.dasm - SslCredKey:Equals(SslCredKey):bool:this

Top method improvements (bytes):
         -37 (-2.41% of base) : 13173.dasm - DefaultTypeMap:FindConstructor(ref,ref):ConstructorInfo:this
         -37 (-2.41% of base) : 14302.dasm - DefaultTypeMap:FindConstructor(ref,ref):ConstructorInfo:this
         -17 (-0.84% of base) : 31284.dasm - EntityMaterializerInjectingExpressionVisitor:MaterializeEntity(EntityShaperExpression,ParameterExpression,ParameterExpression,ParameterExpression,ParameterExpression):Expression:this
         -17 (-0.83% of base) : 34351.dasm - EntityMaterializerInjectingExpressionVisitor:MaterializeEntity(EntityShaperExpression,ParameterExpression,ParameterExpression,ParameterExpression,ParameterExpression):Expression:this
          -5 (-0.25% of base) : 31157.dasm - EntityShaperExpression:GenerateMaterializationCondition(IEntityType,bool):LambdaExpression:this
          -5 (-0.25% of base) : 34245.dasm - EntityShaperExpression:GenerateMaterializationCondition(IEntityType,bool):LambdaExpression:this
          -3 (-0.49% of base) : 38167.dasm - RuntimeMethodInfo:Equals(Object):bool:this

Top method regressions (percentages):
          23 ( 9.91% of base) : 19019.dasm - ObjectEqualityComparer`1:IndexOf(ref,__Canon,int,int):int:this
          23 ( 9.91% of base) : 38588.dasm - ObjectEqualityComparer`1:IndexOf(ref,__Canon,int,int):int:this
           2 ( 1.43% of base) : 40501.dasm - SslCredKey:Equals(SslCredKey):bool:this
           8 ( 1.15% of base) : 39269.dasm - RuntimeMethodInfo:MakeGenericMethod(ref):MethodInfo:this
           3 ( 0.94% of base) : 19897.dasm - ControllerActionInvoker:PrepareArguments(IDictionary`2,ObjectMethodExecutor):ref

Top method improvements (percentages):
         -37 (-2.41% of base) : 13173.dasm - DefaultTypeMap:FindConstructor(ref,ref):ConstructorInfo:this
         -37 (-2.41% of base) : 14302.dasm - DefaultTypeMap:FindConstructor(ref,ref):ConstructorInfo:this
         -17 (-0.84% of base) : 31284.dasm - EntityMaterializerInjectingExpressionVisitor:MaterializeEntity(EntityShaperExpression,ParameterExpression,ParameterExpression,ParameterExpression,ParameterExpression):Expression:this
         -17 (-0.83% of base) : 34351.dasm - EntityMaterializerInjectingExpressionVisitor:MaterializeEntity(EntityShaperExpression,ParameterExpression,ParameterExpression,ParameterExpression,ParameterExpression):Expression:this
          -3 (-0.49% of base) : 38167.dasm - RuntimeMethodInfo:Equals(Object):bool:this
          -5 (-0.25% of base) : 31157.dasm - EntityShaperExpression:GenerateMaterializationCondition(IEntityType,bool):LambdaExpression:this
          -5 (-0.25% of base) : 34245.dasm - EntityShaperExpression:GenerateMaterializationCondition(IEntityType,bool):LambdaExpression:this

12 total methods with Code Size differences (7 improved, 5 regressed), 4 unchanged.
```
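As context for the `addPreds` removal in the diff below, this sketch shows the
caller-side pattern that replaces it: after `optRedirectBlock` retargets a
cloned block, the caller adds predecessor edges itself, one case per jump kind.
This is a simplified model, not the JIT source; `Block`, `JumpKind`, and
`addRefPred` are hypothetical stand-ins for `BasicBlock`, `bbJumpKind`, and
`fgAddRefPred`.

```
#include <vector>

// Simplified jump kinds; the real JIT has more (BBJ_THROW, BBJ_RETURN, etc.)
// that have no successors to add.
enum JumpKind { JK_NONE, JK_ALWAYS, JK_COND, JK_SWITCH };

struct Block
{
    JumpKind            jumpKind;
    Block*              next;          // fall-through successor
    Block*              jumpDest;      // branch target
    std::vector<Block*> switchTargets; // switch jump table
    std::vector<Block*> preds;         // predecessor list
};

static void addRefPred(Block* succ, Block* pred)
{
    succ->preds.push_back(pred); // stand-in for Compiler::fgAddRefPred
}

// After redirecting a freshly cloned block's targets, add predecessor edges
// explicitly for every successor kind, rather than relying on an addPreds
// flag inside the redirect helper.
void addPredsForClonedBlock(Block* newblk)
{
    switch (newblk->jumpKind)
    {
        case JK_NONE: // fall-through only
            addRefPred(newblk->next, newblk);
            break;
        case JK_ALWAYS: // unconditional branch
            addRefPred(newblk->jumpDest, newblk);
            break;
        case JK_COND: // both the fall-through and the branch target
            addRefPred(newblk->next, newblk);
            addRefPred(newblk->jumpDest, newblk);
            break;
        case JK_SWITCH: // every switch target
            for (Block* target : newblk->switchTargets)
            {
                addRefPred(target, newblk);
            }
            break;
    }
}
```

Keeping the pred-edge updates at the call site leaves `optRedirectBlock` with a
single responsibility (retargeting), and each caller decides whether the flow
graph's pred lists need maintaining.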
---
 src/coreclr/jit/compiler.h    |   5 +-
 src/coreclr/jit/optimizer.cpp | 112 +++++++++++++++++++++++-----------
 2 files changed, 76 insertions(+), 41 deletions(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index ee2abdb945f60f..1a3d1a8b542024 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -6533,10 +6533,7 @@ class Compiler
     // loop nested in "loopInd" that shares the same head as "loopInd".
     void optUpdateLoopHead(unsigned loopInd, BasicBlock* from, BasicBlock* to);

-    // Updates the successors of "blk": if "blk2" is a successor of "blk", and there is a mapping for "blk2->blk3" in
-    // "redirectMap", change "blk" so that "blk3" is this successor. If `addPreds` is true, add predecessor edges
-    // for the block based on its new target, whether redirected or not.
-    void optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap, const bool addPreds = false);
+    void optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap);

     // Marks the containsCall information to "lnum" and any parent loops.
     void AddContainsCallAllContainingLoops(unsigned lnum);
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index daf459faca5be9..c42566c3951a73 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2607,11 +2607,22 @@ void Compiler::optIdentifyLoopsForAlignment()
 #endif
 }

-void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap, const bool addPreds)
+//------------------------------------------------------------------------
+// optRedirectBlock: Replace the branch successors of a block based on a block map.
+//
+// Updates the successors of `blk`: if `blk2` is a branch successor of `blk`, and there is a mapping
+// for `blk2->blk3` in `redirectMap`, change `blk` so that `blk3` is this branch successor.
+//
+// Arguments:
+//     blk          - block to redirect
+//     redirectMap  - block->block map specifying how the `blk` target will be redirected.
+//
+void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap)
 {
     BasicBlock* newJumpDest = nullptr;
     switch (blk->bbJumpKind)
     {
+        case BBJ_NONE:
         case BBJ_THROW:
         case BBJ_RETURN:
         case BBJ_EHFILTERRET:
@@ -2620,13 +2631,6 @@ void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap, c
             // These have no jump destination to update.
             break;

-        case BBJ_NONE:
-            if (addPreds)
-            {
-                fgAddRefPred(blk->bbNext, blk);
-            }
-            break;
-
         case BBJ_ALWAYS:
         case BBJ_LEAVE:
         case BBJ_CALLFINALLY:
@@ -2636,15 +2640,6 @@ void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap, c
             {
                 blk->bbJumpDest = newJumpDest;
             }
-            if (addPreds)
-            {
-                fgAddRefPred(blk->bbJumpDest, blk);
-                if (blk->bbJumpKind == BBJ_COND)
-                {
-                    // Add a pred for the fall-through path.
-                    fgAddRefPred(blk->bbNext, blk);
-                }
-            }
             break;

         case BBJ_SWITCH:
@@ -2659,12 +2654,8 @@ void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap, c
                     blk->bbJumpSwt->bbsDstTab[i] = newJumpDest;
                     redirected = true;
                 }
-                if (addPreds)
-                {
-                    fgAddRefPred(switchDest, blk);
-                }
             }
-            // If any redirections happend, invalidate the switch table map for the switch.
+            // If any redirections happened, invalidate the switch table map for the switch.
             if (redirected)
             {
                 // Don't create a new map just to try to remove an entry.
@@ -5495,6 +5486,13 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context)
     // Be safe by taking the max with the head block's weight.
     ambientWeight = max(ambientWeight, loop.lpHead->bbWeight);

+    // We assume that the fast path will run 99% of the time, and thus should get 99% of the block weights.
+    // The slow path will, correspondingly, get only 1% of the block weights. It could be argued that we should
+    // mark the slow path as "run rarely", since it really shouldn't execute (given the currently optimized loop
+    // conditions) except under exceptional circumstances.
+    const BasicBlock::weight_t fastPathWeightScaleFactor = 0.99f;
+    const BasicBlock::weight_t slowPathWeightScaleFactor = 1.0f - fastPathWeightScaleFactor;
+
     // This is the containing loop, if any -- to label any blocks we create that are outside
     // the loop being cloned.
     unsigned char ambientLoop = loop.lpParent;
@@ -5612,7 +5610,7 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context)
         // NOTE: 'h' is no longer the loop head; 'h2' is!
     }

-    // Now we'll clone the blocks of the loop body.
+    // Now we'll clone the blocks of the loop body. These cloned blocks will be the slow path.

     BasicBlock* newFirst = nullptr;

     BlockToBlockMap* blockMap = new (getAllocator(CMK_LoopClone)) BlockToBlockMap(getAllocator(CMK_LoopClone));
@@ -5631,6 +5629,11 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context)
         // so clear out the cloned bbRefs field.
         newBlk->bbRefs = 0;

+        newBlk->scaleBBWeight(slowPathWeightScaleFactor);
+        blk->scaleBBWeight(fastPathWeightScaleFactor);
+
+        // TODO: scale the pred edges of `blk`?
+
 #if FEATURE_LOOP_ALIGN
         // If the original loop is aligned, do not align the cloned loop because cloned loop will be executed in
         // rare scenario. Additionally, having to align cloned loop will force us to disable some VEX prefix encoding
@@ -5672,10 +5675,38 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context)
         optCopyBlkDest(blk, newblk);

         // Now redirect the new block according to "blockMap".
-        const bool addPreds = true;
-        optRedirectBlock(newblk, blockMap, addPreds);
+        optRedirectBlock(newblk, blockMap);
+
+        // Add predecessor edges for the new successors, as well as the fall-through paths.
+        switch (newblk->bbJumpKind)
+        {
+            case BBJ_NONE:
+                fgAddRefPred(newblk->bbNext, newblk);
+                break;
+
+            case BBJ_ALWAYS:
+            case BBJ_CALLFINALLY:
+                fgAddRefPred(newblk->bbJumpDest, newblk);
+                break;
+
+            case BBJ_COND:
+                fgAddRefPred(newblk->bbNext, newblk);
+                fgAddRefPred(newblk->bbJumpDest, newblk);
+                break;
+
+            case BBJ_SWITCH:
+            {
+                for (unsigned i = 0; i < newblk->bbJumpSwt->bbsCount; i++)
+                {
+                    BasicBlock* switchDest = newblk->bbJumpSwt->bbsDstTab[i];
+                    fgAddRefPred(switchDest, newblk);
+                }
+            }
+            break;

-        blk->ensurePredListOrder(this);
+            default:
+                break;
+        }
     }

 #ifdef DEBUG
@@ -5733,6 +5764,7 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context)
     BasicBlock* slowHead = fgNewBBafter(BBJ_ALWAYS, h, /*extendRegion*/ true);
     JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", slowHead->bbNum, h->bbNum);
     slowHead->bbWeight = h->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight;
+    slowHead->scaleBBWeight(slowPathWeightScaleFactor);
     slowHead->bbNatLoopNum = ambientLoop;

     slowHead->bbJumpDest = e2;
@@ -5852,6 +5884,10 @@ BasicBlock* Compiler::optInsertLoopChoiceConditions(LoopCloneContext* context,
 // preds of the entry to this new block. Sets the weight of the newly created
 // block to "ambientWeight".
 //
+// NOTE: this is currently dead code, because it is only called by loop cloning,
+// and loop cloning only works with single-entry loops where the immediately
+// preceding head block is the only predecessor of the loop entry.
+//
 // Arguments:
 //    loopInd       - index of loop to process
 //    ambientWeight - weight to give the new head, if created.
@@ -5875,21 +5911,17 @@ void Compiler::optEnsureUniqueHead(unsigned loopInd, BasicBlock::weight_t ambien
     // and redirect the preds of the entry block to go to this.
     BasicBlock* beforeTop = t->bbPrev;

+    assert(!beforeTop->bbFallsThrough() || (beforeTop->bbNext == e));
+
     // Make sure that the new block is in the same region as the loop.
     // (We will only create loops that are entirely within a region.)
-    BasicBlock* h2 = fgNewBBafter(BBJ_ALWAYS, beforeTop, true);
+    BasicBlock* h2 = fgNewBBafter(BBJ_NONE, beforeTop, /*extendRegion*/ true);
+    assert(beforeTop->bbNext == h2);
+
     // This is in the containing loop.
     h2->bbNatLoopNum = loop.lpParent;
     h2->bbWeight = h2->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight;

-    // We don't care where it was put; splice it between beforeTop and top.
-    if (beforeTop->bbNext != h2)
-    {
-        h2->bbPrev->setNext(h2->bbNext); // Splice h2 out.
-        beforeTop->setNext(h2);          // Splice h2 in, between beforeTop and t.
-        h2->setNext(t);
-    }
-
     if (h2->bbNext != e)
     {
         h2->bbJumpKind = BBJ_ALWAYS;
@@ -5897,11 +5929,13 @@ void Compiler::optEnsureUniqueHead(unsigned loopInd, BasicBlock::weight_t ambien
     }
     BlockSetOps::Assign(this, h2->bbReach, e->bbReach);

+    fgAddRefPred(e, h2);
+
     // Redirect paths from preds of "e" to go to "h2" instead of "e".
     BlockToBlockMap* blockMap = new (getAllocator(CMK_LoopClone)) BlockToBlockMap(getAllocator(CMK_LoopClone));
     blockMap->Set(e, h2);

-    for (flowList* predEntry = e->bbPreds; predEntry; predEntry = predEntry->flNext)
+    for (flowList* predEntry = e->bbPreds; predEntry != nullptr; predEntry = predEntry->flNext)
     {
         BasicBlock* predBlock = predEntry->getBlock();

@@ -5910,10 +5944,14 @@ void Compiler::optEnsureUniqueHead(unsigned loopInd, BasicBlock::weight_t ambien
         {
             continue;
         }
+
         optRedirectBlock(predBlock, blockMap);
+
+        fgAddRefPred(h2, predBlock);
+        fgRemoveRefPred(e, predBlock);
     }

-    optUpdateLoopHead(loopInd, loop.lpHead, h2);
+    optUpdateLoopHead(loopInd, h, h2);
 }

 /*****************************************************************************

From 4d55238684dd3986216c0f0ca5f6771d314293d7 Mon Sep 17 00:00:00 2001
From: Bruce Forstall
Date: Mon, 26 Apr 2021 18:23:58 -0700
Subject: [PATCH 2/2] Formatting

---
 src/coreclr/jit/optimizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index c42566c3951a73..498bb249ff85ba 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -5632,7 +5632,7 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context)
         newBlk->scaleBBWeight(slowPathWeightScaleFactor);
         blk->scaleBBWeight(fastPathWeightScaleFactor);

-        // TODO: scale the pred edges of `blk`?
+// TODO: scale the pred edges of `blk`?

 #if FEATURE_LOOP_ALIGN
         // If the original loop is aligned, do not align the cloned loop because cloned loop will be executed in
@@ -5763,7 +5763,7 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context)
     JITDUMP("Create unique head block for slow path loop\n");
     BasicBlock* slowHead = fgNewBBafter(BBJ_ALWAYS, h, /*extendRegion*/ true);
     JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", slowHead->bbNum, h->bbNum);
-    slowHead->bbWeight = h->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight;
+    slowHead->bbWeight     = h->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight;
     slowHead->scaleBBWeight(slowPathWeightScaleFactor);
     slowHead->bbNatLoopNum = ambientLoop;

     slowHead->bbJumpDest = e2;