diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 70abd0a98ebb19..f1f76a960a4866 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -4586,11 +4586,14 @@ class CallArg , m_needPlace(false) , m_isTmp(false) , m_processed(false) + , Tag(0) { } public: CallArgABIInformation AbiInfo; + // Tag that can be used for any purpose during transformations. + int Tag; CallArg(const NewCallArg& arg) : CallArg() { diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 9bbd6fe7f9e3d7..467fa6c170e820 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -16,6 +16,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #endif #include "allocacheck.h" // for alloca +#include "algorithm.h" //------------------------------------------------------------- // fgMorphInit: prepare for running the morph phases @@ -1256,297 +1257,354 @@ void CallArgs::ArgsComplete(Compiler* comp, GenTreeCall* call) m_argsComplete = true; } -//------------------------------------------------------------------------ -// SortArgs: Sort arguments into a better passing order. -// -// Parameters: -// comp - The compiler object. -// call - The call that contains this CallArgs instance. -// sortedArgs - A table of at least `CountArgs()` entries where the sorted -// arguments are written into. -// -void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs) +struct ArgInterferenceGraphNode { - assert(m_argsComplete); + CallArg* Arg; + // Registers clobbered by placing this argument. + regMaskTP Clobbers; + // Registers that may be used by the argument (guess). + regMaskTP Uses; - JITDUMP("\nSorting the arguments:\n"); + int Index; + int LowLink; + bool OnStack; - // Shuffle the arguments around before we build the late args list. The - // idea is to move all "simple" arguments like constants and local vars to - // the end, and move the complex arguments towards the beginning. This will - // help prevent registers from being spilled by allowing us to evaluate the - // more complex arguments before the simpler arguments. We use the late - // list to keep the sorted result at this point, and the ordering ends up - // looking like: - // +------------------------------------+ <--- end of sortedArgs - // | constants | - // +------------------------------------+ - // | local var / local field | - // +------------------------------------+ - // | remaining arguments sorted by cost | - // +------------------------------------+ - // | temps (CallArg::m_needTmp == true) | - // +------------------------------------+ - // | args with calls (GTF_CALL) | - // +------------------------------------+ <--- start of sortedArgs - // + // Index of next node in the SCC. + int SccNext; +}; - unsigned argCount = 0; - for (CallArg& arg : Args()) +class ArgInterferenceGraph +{ + Compiler* m_comp; + int m_index = 0; + + ArrayStack m_nodes; + ArrayStack m_edges; + ArrayStack m_stack; + ArrayStack m_sccs; + // Registers that are used by an argument that does not also clobber that register. + regMaskTP m_regDependencies = RBM_NONE; + // Mask of registers clobbered by placing arguments. + regMaskTP m_allClobbers = RBM_NONE; + +public: + ArgInterferenceGraph(Compiler* comp, unsigned argCount) + : m_comp(comp) + , m_nodes(comp->getAllocator(CMK_CallArgs), static_cast(argCount)) + , m_edges(comp->getAllocator(CMK_CallArgs)) + , m_stack(comp->getAllocator(CMK_CallArgs)) + , m_sccs(comp->getAllocator(CMK_CallArgs), static_cast(argCount)) { - sortedArgs[argCount++] = &arg; } - // Set the beginning and end for the new argument table - unsigned curInx; - int regCount = 0; - unsigned begTab = 0; - unsigned endTab = argCount - 1; - unsigned argsRemaining = argCount; - - // First take care of arguments that are constants. - // [We use a backward iterator pattern] - // - curInx = argCount; - do + int NumSccs() { - curInx--; + return m_sccs.Height(); + } - CallArg* arg = sortedArgs[curInx]; + int FirstSccNode(int sccIndex) + { + return m_sccs.Bottom(sccIndex); + } + ArgInterferenceGraphNode& GetNode(int index) + { + return m_nodes.BottomRef(index); + } - if (arg->AbiInfo.GetRegNum() != REG_STK) + void AddNode(CallArg* arg) + { + regMaskTP clobbers = RBM_NONE; + for (unsigned i = 0; i < arg->AbiInfo.NumRegs; i++) { - regCount++; + clobbers |= genRegMask(arg->AbiInfo.GetRegNum(i)); } - assert(arg->GetLateNode() == nullptr); + ArgInterferenceGraphNode node; + node.Arg = arg; + node.Clobbers = clobbers; + node.Uses = EstimateRegisterUses(arg->GetNode()); + node.Index = -1; + node.LowLink = -1; + node.OnStack = false; + node.SccNext = UINT_MAX; + m_nodes.Push(node); - // Skip any already processed args - // - if (!arg->m_processed) - { - GenTree* argx = arg->GetEarlyNode(); + m_regDependencies |= node.Uses & ~clobbers; + m_allClobbers |= clobbers; + } - assert(argx != nullptr); - // put constants at the end of the table - // - if (argx->gtOper == GT_CNS_INT) + void FindSccs() + { + for (int i = 0; i < m_nodes.Height(); i++) + { + if (m_nodes.BottomRef(i).Index == -1) { - noway_assert(curInx <= endTab); - - arg->m_processed = true; - - // place curArgTabEntry at the endTab position by performing a swap - // - if (curInx != endTab) - { - sortedArgs[curInx] = sortedArgs[endTab]; - sortedArgs[endTab] = arg; - } - - endTab--; - argsRemaining--; + FindScc(i); } } - } while (curInx > 0); + } - if (argsRemaining > 0) + bool HasInterference() { - // Next take care of arguments that are calls. - // [We use a forward iterator pattern] - // - for (curInx = begTab; curInx <= endTab; curInx++) + return (m_allClobbers & m_regDependencies) != RBM_NONE; + } + +private: + // Implementation of Tarjan's algorithm + void FindScc(int index) + { + ArgInterferenceGraphNode& node = m_nodes.BottomRef(index); + node.Index = m_index; + node.LowLink = m_index; + m_index++; + + // Early exit for args that do not clobber any other argument. + if ((m_regDependencies & node.Clobbers) == RBM_NONE) { - CallArg* arg = sortedArgs[curInx]; + node.SccNext = index; + m_sccs.Push(index); + return; + } - // Skip any already processed args - // - if (!arg->m_processed) + uint32_t nodeStackIndex = m_stack.Height(); + m_stack.Push(index); + node.OnStack = true; + + for (int i = 0; i < m_nodes.Height(); i++) + { + if (i == index) { - GenTree* argx = arg->GetEarlyNode(); - assert(argx != nullptr); + continue; + } - // put calls at the beginning of the table - // - if (argx->gtFlags & GTF_CALL) - { - arg->m_processed = true; + ArgInterferenceGraphNode& neighbor = m_nodes.BottomRef(i); + if ((neighbor.Uses & node.Clobbers) == 0) + { + continue; + } - // place curArgTabEntry at the begTab position by performing a swap - // - if (curInx != begTab) - { - sortedArgs[curInx] = sortedArgs[begTab]; - sortedArgs[begTab] = arg; - } + if (neighbor.Index == -1) + { + FindScc(i); + node.LowLink = min(node.LowLink, neighbor.LowLink); + } + else if (neighbor.OnStack) + { + node.LowLink = min(node.LowLink, neighbor.Index); + } + } - begTab++; - argsRemaining--; - } + if (node.LowLink == node.Index) + { + // Pop and link SCC. + for (int j = m_stack.Height() - 1, i = nodeStackIndex; i < m_stack.Height(); j = i++) + { + int ni = m_stack.Bottom(i); + int nj = m_stack.Bottom(j); + ArgInterferenceGraphNode& node = m_nodes.BottomRef(ni); + node.SccNext = nj; + node.OnStack = false; } + + m_stack.Pop(m_stack.Height() - nodeStackIndex); + m_sccs.Push(index); } } - if (argsRemaining > 0) + // Estimate registers used by a tree. + regMaskTP EstimateRegisterUses(GenTree* tree) { - // Next take care arguments that are temps. - // These temps come before the arguments that are - // ordinary local vars or local fields - // since this will give them a better chance to become - // enregistered into their actual argument register. - // [We use a forward iterator pattern] - // - for (curInx = begTab; curInx <= endTab; curInx++) + struct RegisterUsesVisitor : GenTreeVisitor { - CallArg* arg = sortedArgs[curInx]; + enum + { + DoPreOrder = true, + DoLclVarsOnly = true, + }; - // Skip any already processed args - // - if (!arg->m_processed) + regMaskTP Registers = RBM_NONE; + + RegisterUsesVisitor(Compiler* comp) : GenTreeVisitor(comp) { - if (arg->m_needTmp) - { - arg->m_processed = true; + } - // place curArgTabEntry at the begTab position by performing a swap - // - if (curInx != begTab) + fgWalkResult PreOrderVisit(GenTree** use, GenTree* parent) + { + GenTreeLclVarCommon* node = (*use)->AsLclVarCommon(); + LclVarDsc* desc = m_compiler->lvaGetDesc(node); + if (!desc->lvDoNotEnregister && desc->lvIsRegArg) + { + if ((desc->GetArgReg() != REG_NA) && (desc->GetArgReg() != REG_STK)) { - sortedArgs[curInx] = sortedArgs[begTab]; - sortedArgs[begTab] = arg; + Registers |= genRegMask(desc->GetArgReg()); } - - begTab++; - argsRemaining--; +#if FEATURE_MULTIREG_ARGS + if ((desc->GetOtherArgReg() != REG_NA) && (desc->GetOtherArgReg() != REG_STK)) + { + Registers |= genRegMask(desc->GetOtherArgReg()); + } +#endif } + + return fgWalkResult::WALK_CONTINUE; } - } + }; + + RegisterUsesVisitor visitor(m_comp); + visitor.WalkTree(&tree, nullptr); + return visitor.Registers; + } +}; + +//------------------------------------------------------------------------ +// SortArgs: Sort arguments into a better passing order. +// +// Parameters: +// comp - The compiler object. +// call - The call that contains this CallArgs instance. +// sortedArgs - A table of at least `CountArgs()` entries where the sorted +// arguments are written into. +// +// Remarks: +// It is expected that arguments that interfere (in terms of side effects) +// have been marked as being evaluated into temps and that this function is +// thus free to reorder arguments freely. For arguments evaluated into temp, +// the result affects when the temp is placed. +// +void CallArgs::SortArgs(Compiler* comp, GenTreeCall* call, CallArg** sortedArgs) +{ + assert(m_argsComplete); + + JITDUMP("\nSorting the arguments:\n"); + + unsigned argCount = 0; + for (CallArg& arg : Args()) + { + sortedArgs[argCount] = &arg; + arg.Tag = static_cast(argCount); + argCount++; } - if (argsRemaining > 0) + if ((argCount <= 1) || (argCount >= 64)) { - // Next take care of local var and local field arguments. - // These are moved towards the end of the argument evaluation. - // [We use a backward iterator pattern] - // - curInx = endTab + 1; - do + JITDUMP(" Placed arguments in order (%u arguments).\n", argCount); + return; + } + + // First sort the arguments according to two heuristics: + // 1. Put constants at the end of the table. They cannot conflict with other arguments, so placing them last is + // always beneficial. + // 2. Put calls at the beginning of the table. + jitstd::sort(sortedArgs, sortedArgs + argCount, [](CallArg* l, CallArg* r) { + GenTree* lNode = l->GetNode(); + GenTree* rNode = r->GetNode(); + // Put constants at the end, they do not conflict with anything. + if (lNode->OperIsConst() != rNode->OperIsConst()) { - curInx--; + return rNode->OperIsConst(); + } - CallArg* arg = sortedArgs[curInx]; + // Put calls at the beginning. + if (lNode->IsCall() != rNode->IsCall()) + { + return lNode->IsCall(); + } - // Skip any already processed args - // - if (!arg->m_processed) - { - GenTree* argx = arg->GetEarlyNode(); - assert(argx != nullptr); + return l->Tag < r->Tag; + }); - // As a CQ heuristic, sort TYP_STRUCT args using the cost estimation below. - if (!argx->TypeIs(TYP_STRUCT) && argx->OperIs(GT_LCL_VAR, GT_LCL_FLD)) - { - noway_assert(curInx <= endTab); + if (comp->opts.OptimizationDisabled()) + { + return; + } - arg->m_processed = true; + // When optimizing also resolve conflicts due to arguments using parameters + // that may be enregistered. For example: if an argument uses a register + // 'rcx', try to ensure that 'rcx' is placed _after_ that argument. We + // create an interference graph and use Tarjan's algorithm, which will both + // find the SCCs (cycles) and reverse topologically sort the arguments, + // ensuring the above. The implementation of Tarjan's here also iterates + // neighbors in order such that arguments are not needlessly reordered when + // there are no conflicts. + ArgInterferenceGraph graph(comp, argCount); - // place curArgTabEntry at the endTab position by performing a swap - // - if (curInx != endTab) - { - sortedArgs[curInx] = sortedArgs[endTab]; - sortedArgs[endTab] = arg; - } + for (unsigned i = 0; i < argCount; i++) + { + graph.AddNode(sortedArgs[i]); + } - endTab--; - argsRemaining--; - } - } - } while (curInx > begTab); + if (!graph.HasInterference()) + { + JITDUMP(" No interference found between arguments.\n"); + return; } - // Finally, take care of all the remaining arguments. - // Note that we fill in one arg at a time using a while loop. - bool costsPrepared = false; // Only prepare tree costs once, the first time through this loop - while (argsRemaining > 0) + graph.FindSccs(); + + JITDUMP(" Arguments have register interference. Argument order and SCCs:\n"); + + unsigned curIndex = 0; + for (int scc = 0; scc < graph.NumSccs(); scc++) { - /* Find the most expensive arg remaining and evaluate it next */ + int firstNodeIndex = graph.FirstSccNode(scc); - CallArg* expensiveArg = nullptr; - unsigned expensiveArgIndex = UINT_MAX; - unsigned expensiveArgCost = 0; + unsigned sccSize = 0; + int nodeIndex = firstNodeIndex; - // [We use a forward iterator pattern] - // - for (curInx = begTab; curInx <= endTab; curInx++) + int lclVarIndex = -1; + do { - CallArg* arg = sortedArgs[curInx]; - - // Skip any already processed args - // - if (!arg->m_processed) + ArgInterferenceGraphNode& node = graph.GetNode(nodeIndex); + if (node.Arg->GetNode()->OperIs(GT_LCL_VAR)) { - GenTree* argx = arg->GetEarlyNode(); - assert(argx != nullptr); + lclVarIndex = nodeIndex; + } - // We should have already handled these kinds of args - assert((!argx->OperIs(GT_LCL_VAR, GT_LCL_FLD) || argx->TypeIs(TYP_STRUCT)) && - !argx->OperIs(GT_CNS_INT)); + assert(curIndex + sccSize < argCount); + sortedArgs[curIndex + sccSize] = node.Arg; - // This arg should either have no persistent side effects or be the last one in our table - // assert(((argx->gtFlags & GTF_PERSISTENT_SIDE_EFFECTS) == 0) || (curInx == (argCount-1))); + nodeIndex = node.SccNext; - if (argsRemaining == 1) - { - // This is the last arg to place - expensiveArgIndex = curInx; - expensiveArg = arg; - assert(begTab == endTab); - break; - } - else - { - if (!costsPrepared) - { - /* We call gtPrepareCost to measure the cost of evaluating this tree */ - comp->gtPrepareCost(argx); - } + sccSize++; - if (argx->GetCostEx() > expensiveArgCost) - { - // Remember this arg as the most expensive one that we have yet seen - expensiveArgCost = argx->GetCostEx(); - expensiveArgIndex = curInx; - expensiveArg = arg; - } - } - } - } + } while (nodeIndex != firstNodeIndex); - noway_assert(expensiveArgIndex != UINT_MAX); + // LSRA is able to break cycles by rehoming LCL_VAR parameters into + // another register. It does this better when it sees the LCL_VAR + // last. If we notice that the last arg we placed is not a LCL_VAR, + // and we had a LCL_VAR, then repeat the loop but ensure that we + // place a LCL_VAR node last. + if ((lclVarIndex != -1) && !sortedArgs[curIndex + sccSize - 1]->GetNode()->OperIs(GT_LCL_VAR)) + { + // Refill the sorted args, this time placing a LCL_VAR last. + firstNodeIndex = graph.GetNode(lclVarIndex).SccNext; - // put the most expensive arg towards the beginning of the table + sccSize = 0; + nodeIndex = firstNodeIndex; - expensiveArg->m_processed = true; + do + { + ArgInterferenceGraphNode& node = graph.GetNode(nodeIndex); + sortedArgs[curIndex + sccSize] = node.Arg; + nodeIndex = node.SccNext; + sccSize++; + } while (nodeIndex != firstNodeIndex); + } - // place expensiveArgTabEntry at the begTab position by performing a swap - // - if (expensiveArgIndex != begTab) +#ifdef DEBUG + if (comp->verbose) { - sortedArgs[expensiveArgIndex] = sortedArgs[begTab]; - sortedArgs[begTab] = expensiveArg; + printf(" ["); + for (unsigned j = 0; j < sccSize; j++) + printf(" [%06u]", sortedArgs[curIndex + j]->GetNode()->gtTreeID); + printf(" ]\n"); } +#endif - begTab++; - argsRemaining--; - - costsPrepared = true; // If we have more expensive arguments, don't re-evaluate the tree cost on the next loop + curIndex += sccSize; } - // The table should now be completely filled and thus begTab should now be adjacent to endTab - // and regArgsRemaining should be zero - assert(begTab == (endTab + 1)); - assert(argsRemaining == 0); + assert(curIndex == argCount); } //------------------------------------------------------------------------------