diff --git a/src/jit/codegenlinear.cpp b/src/jit/codegenlinear.cpp index 39c7cf9b7d3d..b7840323a28d 100644 --- a/src/jit/codegenlinear.cpp +++ b/src/jit/codegenlinear.cpp @@ -1647,8 +1647,10 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk, unsigned outArg unsigned thisFieldOffset = argOffset + fieldListPtr->gtFieldOffset; getEmitter()->emitIns_S_R(ins_Store(type), attr, reg, outArgVarNum, thisFieldOffset); - // We can't write beyond the arg area - assert((thisFieldOffset + EA_SIZE_IN_BYTES(attr)) <= compiler->lvaLclSize(outArgVarNum)); + // We can't write beyond the arg area unless this is a tail call, in which case we use + // the first stack arg as the base of the incoming arg area. + assert(putArgStk->gtCall->IsFastTailCall() || + (thisFieldOffset + EA_SIZE_IN_BYTES(attr)) <= compiler->lvaLclSize(outArgVarNum)); } } #endif // !_TARGET_X86_ diff --git a/src/jit/compiler.h b/src/jit/compiler.h index 9a7e06447f75..7a61e2dfc448 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -8733,7 +8733,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #if FEATURE_FASTTAILCALL size_t compArgStackSize; // Incoming argument stack size in bytes - bool compHasMultiSlotArgs; // Caller has >8 byte sized struct parameter #endif // FEATURE_FASTTAILCALL unsigned compRetBuffArg; // position of hidden return param var (0, 1) (BAD_VAR_NUM means not present); diff --git a/src/jit/jit.h b/src/jit/jit.h index acc2c0d42b6d..819b5e6c31b3 100644 --- a/src/jit/jit.h +++ b/src/jit/jit.h @@ -276,15 +276,14 @@ #define UNIX_AMD64_ABI_ONLY(x) #endif // defined(UNIX_AMD64_ABI) -#if defined(UNIX_AMD64_ABI) || !defined(_TARGET_64BIT_) || (defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)) +#if defined(UNIX_AMD64_ABI) || !defined(_TARGET_64BIT_) || defined(_TARGET_ARM64_) #define FEATURE_PUT_STRUCT_ARG_STK 1 #define PUT_STRUCT_ARG_STK_ONLY_ARG(x) , x #define PUT_STRUCT_ARG_STK_ONLY(x) x -#else // !(defined(UNIX_AMD64_ABI) && defined(_TARGET_64BIT_) && !(defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)) +#else #define PUT_STRUCT_ARG_STK_ONLY_ARG(x) #define PUT_STRUCT_ARG_STK_ONLY(x) -#endif // !(defined(UNIX_AMD64_ABI) && defined(_TARGET_64BIT_) && !(defined(_TARGET_WINDOWS_) && - // defined(_TARGET_ARM64_)) +#endif #if defined(UNIX_AMD64_ABI) #define UNIX_AMD64_ABI_ONLY_ARG(x) , x diff --git a/src/jit/lclvars.cpp b/src/jit/lclvars.cpp index 70cc7fe1757c..9cbf4e99b816 100644 --- a/src/jit/lclvars.cpp +++ b/src/jit/lclvars.cpp @@ -377,7 +377,6 @@ void Compiler::lvaInitArgs(InitVarDscInfo* varDscInfo) // We can get register usage information using codeGen->intRegState and // codeGen->floatRegState info.compArgStackSize = varDscInfo->stackArgSize; - info.compHasMultiSlotArgs = varDscInfo->hasMultiSlotStruct; #endif // FEATURE_FASTTAILCALL // The total argument size must be aligned. @@ -851,7 +850,6 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo) if (cSlots == 2) { varDsc->lvOtherArgReg = genMapRegArgNumToRegNum(firstAllocatedRegArgNum + 1, TYP_I_IMPL); - varDscInfo->hasMultiSlotStruct = true; } } #elif defined(UNIX_AMD64_ABI) @@ -864,7 +862,6 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo) { secondEightByteType = GetEightByteType(structDesc, 1); secondAllocatedRegArgNum = varDscInfo->allocRegArg(secondEightByteType, 1); - varDscInfo->hasMultiSlotStruct = true; } if (secondEightByteType != TYP_UNDEF) @@ -1004,11 +1001,7 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo) #endif // _TARGET_XXX_ #if FEATURE_FASTTAILCALL - if (cSlots > 1) - { - varDscInfo->hasMultiSlotStruct = true; - } - + varDsc->lvStkOffs = varDscInfo->stackArgSize; varDscInfo->stackArgSize += roundUp(argSize, TARGET_POINTER_SIZE); #endif // FEATURE_FASTTAILCALL } @@ -1102,6 +1095,7 @@ void Compiler::lvaInitGenericsCtxt(InitVarDscInfo* varDscInfo) // returns false. varDsc->lvOnFrame = true; #if FEATURE_FASTTAILCALL + varDsc->lvStkOffs = varDscInfo->stackArgSize; varDscInfo->stackArgSize += TARGET_POINTER_SIZE; #endif // FEATURE_FASTTAILCALL } @@ -1171,6 +1165,7 @@ void Compiler::lvaInitVarArgsHandle(InitVarDscInfo* varDscInfo) // returns false. varDsc->lvOnFrame = true; #if FEATURE_FASTTAILCALL + varDsc->lvStkOffs = varDscInfo->stackArgSize; varDscInfo->stackArgSize += TARGET_POINTER_SIZE; #endif // FEATURE_FASTTAILCALL } diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index b06591042bc3..5c04bda78b5d 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -1167,6 +1167,11 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, fgArgTabEntry* inf } } +#ifdef DEBUG + if (comp->verbose) + printf("Creating putarg at slot %d, num slots %d\n", info->slotNum, info->numSlots); +#endif + putArg = new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, TYP_VOID, arg, info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots), @@ -1892,6 +1897,70 @@ void Lowering::InsertProfTailCallHook(GenTreeCall* call, GenTree* insertionPoint BlockRange().InsertBefore(insertionPoint, profHookNode); } +// For fast tail calls it is necessary to set up stack args in the incoming arg space +// on the stack. When args passed also come from this area we may run into problems +// because we may end up overwriting the stack slot before using it. For example, +// for foo(a, b) { return bar(b, a); }, if a and b are on incoming arg space in foo +// they need to be swapped in this area for the call to bar. This function introduces +// a new temp for the specified local (assumed to be in the arg space) and updates uses. +// Note: This function may introduce new temps. +void Lowering::RehomeArgForFastTailCall(unsigned int lclNum, + GenTree* insertTempBefore, + GenTree* lookForUsesStart, + GenTreeCall* callNode) +{ + unsigned int tmpLclNum = BAD_VAR_NUM; + for (GenTree* treeNode = lookForUsesStart; treeNode != callNode; treeNode = treeNode->gtNext) + { + if (!treeNode->OperIsLocal() && !treeNode->OperIsLocalAddr()) + { + continue; + } + + // This should not be a GT_PHI_ARG. + assert(treeNode->OperGet() != GT_PHI_ARG); + + GenTreeLclVarCommon* lcl = treeNode->AsLclVarCommon(); + + if (lcl->GetLclNum() != lclNum) + { + continue; + } + + // Create tmp and use it in place of callerArgDsc + if (tmpLclNum == BAD_VAR_NUM) + { + tmpLclNum = comp->lvaGrabTemp(true DEBUGARG("Fast tail call lowering is creating a new local variable")); + + LclVarDsc* callerArgDsc = comp->lvaTable + lclNum; + var_types tmpTyp = genActualType(callerArgDsc->TypeGet()); + comp->lvaTable[tmpLclNum].lvType = tmpTyp; + comp->lvaTable[tmpLclNum].lvDoNotEnregister = comp->lvaTable[lcl->gtLclNum].lvDoNotEnregister; + GenTree* value = comp->gtNewLclvNode(lclNum, tmpTyp); + + if (tmpTyp == TYP_STRUCT) + { + comp->lvaSetStruct(tmpLclNum, comp->lvaGetStruct(lclNum), false); + GenTree* loc = new (comp, GT_LCL_VAR_ADDR) GenTreeLclVar(GT_LCL_VAR_ADDR, TYP_STRUCT, tmpLclNum); + loc->gtType = TYP_BYREF; + GenTreeBlk* storeBlk = new (comp, GT_STORE_BLK) + GenTreeBlk(GT_STORE_BLK, TYP_STRUCT, loc, value, callerArgDsc->lvExactSize); + storeBlk->gtFlags |= GTF_ASG; + BlockRange().InsertBefore(insertTempBefore, LIR::SeqTree(comp, storeBlk)); + LowerNode(storeBlk); + } + else + { + GenTree* assignExpr = comp->gtNewTempAssign(tmpLclNum, value); + ContainCheckRange(value, assignExpr); + BlockRange().InsertBefore(insertTempBefore, LIR::SeqTree(comp, assignExpr)); + } + } + + lcl->SetLclNum(tmpLclNum); + } +} + // Lower fast tail call implemented as epilog+jmp. // Also inserts PInvoke method epilog if required. void Lowering::LowerFastTailCall(GenTreeCall* call) @@ -1940,11 +2009,9 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) // of call is setup. Note that once a stack arg is setup, it cannot have nested // calls subsequently in execution order to setup other args, because the nested // call could over-write the stack arg that is setup earlier. - GenTree* firstPutArgStk = nullptr; - GenTreeArgList* args; ArrayStack putargs(comp->getAllocator(CMK_ArrayStack)); - for (args = call->gtCallArgs; args; args = args->Rest()) + for (GenTreeArgList* args = call->gtCallArgs; args; args = args->Rest()) { GenTree* tmp = args->Current(); if (tmp->OperGet() == GT_PUTARG_STK) @@ -1953,7 +2020,7 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) } } - for (args = call->gtCallLateArgs; args; args = args->Rest()) + for (GenTreeArgList* args = call->gtCallLateArgs; args; args = args->Rest()) { GenTree* tmp = args->Current(); if (tmp->OperGet() == GT_PUTARG_STK) @@ -1962,127 +2029,16 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) } } - if (!putargs.Empty()) - { - firstPutArgStk = putargs.Bottom(); - } - - // If we have a putarg_stk node, also count the number of non-standard args the - // call node has. Note that while determining whether a tail call can be fast - // tail called, we don't count non-standard args (passed in R10 or R11) since they - // don't contribute to outgoing arg space. These non-standard args are not - // accounted in caller's arg count but accounted in callee's arg count after - // fgMorphArgs(). Therefore, exclude callee's non-standard args while mapping - // callee's stack arg num to corresponding caller's stack arg num. - unsigned calleeNonStandardArgCount = call->GetNonStandardAddedArgCount(comp); - - // Say Caller(a, b, c, d, e) fast tail calls Callee(e, d, c, b, a) - // i.e. passes its arguments in reverse to Callee. During call site - // setup, after computing argument side effects, stack args are setup - // first and reg args next. In the above example, both Callers and - // Callee stack args (e and a respectively) share the same stack slot - // and are alive at the same time. The act of setting up Callee's - // stack arg will over-write the stack arg of Caller and if there are - // further uses of Caller stack arg we have to make sure that we move - // it to a temp before over-writing its slot and use temp in place of - // the corresponding Caller stack arg. - // - // For the above example, conceptually this is what is done - // tmp = e; - // Stack slot of e = a - // R9 = b, R8 = c, RDx = d - // RCX = tmp - // - // The below logic is meant to detect cases like this and introduce - // temps to set up args correctly for Callee. - - for (int i = 0; i < putargs.Height(); i++) - { - GenTree* putArgStkNode = putargs.Bottom(i); - - assert(putArgStkNode->OperGet() == GT_PUTARG_STK); - - // Get the caller arg num corresponding to this callee arg. - // Note that these two args share the same stack slot. Therefore, - // if there are further uses of corresponding caller arg, we need - // to move it to a temp and use the temp in this call tree. - // - // Note that Caller is guaranteed to have a param corresponding to - // this Callee's arg since fast tail call mechanism counts the - // stack slots required for both Caller and Callee for passing params - // and allow fast tail call only if stack slots required by Caller >= - // Callee. - fgArgTabEntry* argTabEntry = comp->gtArgEntryByNode(call, putArgStkNode); - assert(argTabEntry); - unsigned callerArgNum = argTabEntry->argNum - calleeNonStandardArgCount; - - unsigned callerArgLclNum = callerArgNum; - LclVarDsc* callerArgDsc = comp->lvaTable + callerArgLclNum; - if (callerArgDsc->lvPromoted) - { - callerArgLclNum = - callerArgDsc->lvFieldLclStart; // update the callerArgNum to the promoted struct field's lclNum - callerArgDsc = comp->lvaTable + callerArgLclNum; - } - noway_assert(callerArgDsc->lvIsParam); - - // Start searching in execution order list till we encounter call node - unsigned tmpLclNum = BAD_VAR_NUM; - var_types tmpType = TYP_UNDEF; - for (GenTree* treeNode = putArgStkNode->gtNext; treeNode != call; treeNode = treeNode->gtNext) - { - if (treeNode->OperIsLocal() || treeNode->OperIsLocalAddr()) - { - // This should not be a GT_PHI_ARG. - assert(treeNode->OperGet() != GT_PHI_ARG); - - GenTreeLclVarCommon* lcl = treeNode->AsLclVarCommon(); - - // Fast tail calling criteria permits passing of structs of size 1, 2, 4 and 8 as args. - // It is possible that the callerArgLclNum corresponds to such a struct whose stack slot - // is getting over-written by setting up of a stack arg and there are further uses of - // any of its fields if such a struct is type-dependently promoted. In this case too - // we need to introduce a temp. - if ((lcl->gtLclNum == callerArgNum) || (lcl->gtLclNum == callerArgLclNum)) - { - // Create tmp and use it in place of callerArgDsc - if (tmpLclNum == BAD_VAR_NUM) - { - // Set tmpType first before calling lvaGrabTemp, as that call invalidates callerArgDsc - tmpType = genActualType(callerArgDsc->lvaArgType()); - tmpLclNum = comp->lvaGrabTemp( - true DEBUGARG("Fast tail call lowering is creating a new local variable")); - - comp->lvaTable[tmpLclNum].lvType = tmpType; - comp->lvaTable[tmpLclNum].lvDoNotEnregister = comp->lvaTable[lcl->gtLclNum].lvDoNotEnregister; - } - - lcl->SetLclNum(tmpLclNum); - } - } - } - - // If we have created a temp, insert an embedded assignment stmnt before - // the first putargStkNode i.e. - // tmpLcl = CallerArg - if (tmpLclNum != BAD_VAR_NUM) - { - assert(tmpType != TYP_UNDEF); - GenTreeLclVar* local = new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, tmpType, callerArgLclNum); - GenTree* assignExpr = comp->gtNewTempAssign(tmpLclNum, local); - ContainCheckRange(local, assignExpr); - BlockRange().InsertBefore(firstPutArgStk, LIR::SeqTree(comp, assignExpr)); - } - } - - // Insert GT_START_NONGC node before the first GT_PUTARG_STK node. - // Note that if there are no args to be setup on stack, no need to - // insert GT_START_NONGC node. GenTree* startNonGCNode = nullptr; - if (firstPutArgStk != nullptr) + if (!putargs.Empty()) { + bool unused; + GenTree* insertionPoint = BlockRange().GetTreeRange(putargs.Bottom(), &unused).FirstNode(); + // Insert GT_START_NONGC node before we evaluate the PUTARG_STK args. + // Note that if there are no args to be setup on stack, no need to + // insert GT_START_NONGC node. startNonGCNode = new (comp, GT_START_NONGC) GenTree(GT_START_NONGC, TYP_VOID); - BlockRange().InsertBefore(firstPutArgStk, startNonGCNode); + BlockRange().InsertBefore(insertionPoint, startNonGCNode); // Gc-interruptability in the following case: // foo(a, b, c, d, e) { bar(a, b, c, d, e); } @@ -2100,6 +2056,89 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) GenTree* noOp = new (comp, GT_NO_OP) GenTree(GT_NO_OP, TYP_VOID); BlockRange().InsertBefore(startNonGCNode, noOp); } + + // Since this is a fast tailcall each PUTARG_STK will place the argument in the + // _incoming_ arg space area. This will effectively overwrite our already existing + // incoming args that live in that area. If we have later uses of those args, this + // is a problem. We introduce a defensive copy into a temp here of those args that + // potentially may cause problems. + for (int i = 0; i < putargs.Height(); i++) + { + GenTreePutArgStk* put = putargs.Bottom(i)->AsPutArgStk(); + // This put will overwrite our incoming stack args. If there are uses + // of the overwritten arg then introduce a defensive copy of it. + unsigned int overwrittenStart = put->getArgOffset(); + unsigned int overwrittenEnd = overwrittenStart + put->getArgSize(); +#if !(defined(_TARGET_WINDOWS_) && defined(_TARGET_64BIT_)) + int baseOff = -1; // Stack offset of first arg on stack +#endif +#ifdef DEBUG + if (comp->verbose) + printf("putarg%d: start: %u, size: %u\n", i, put->getArgOffset(), put->getArgSize()); +#endif + + for (unsigned callerArgLclNum = 0; callerArgLclNum < comp->info.compArgsCount; callerArgLclNum++) + { + LclVarDsc* callerArgDsc = comp->lvaTable + callerArgLclNum; +#ifdef DEBUG + if (comp->verbose) + { + printf("arg%d: IsRegArg: %u, Reg: %d, stkoffs: %d, size: %d\n", callerArgLclNum, callerArgDsc->lvIsRegArg, callerArgDsc->lvArgReg, callerArgDsc->lvStkOffs, comp->lvaLclSize(callerArgLclNum)); + } +#endif + if (callerArgDsc->lvIsRegArg) + continue; + +#if defined(_TARGET_WINDOWS_) && defined(_TARGET_64BIT_) + // On Win64, the argument position determines the stack slot uniquely, and even the + // register args take up space in the stack frame (shadow space). + unsigned int argStart = callerArgLclNum * TARGET_POINTER_SIZE; + unsigned int argEnd = argStart + static_cast(callerArgDsc->lvArgStackSize()); +#else + assert(callerArgDsc->lvStkOffs != BAD_STK_OFFS); + + if (baseOff == -1) + baseOff = callerArgDsc->lvStkOffs; + + // On all ABIs where we fast tail call the stack args should come in order. + assert(baseOff <= callerArgDsc->lvStkOffs); + + // Compute offset of this stack argument relative to the first stack arg. + // This will be its offset into the incoming arg space area. + unsigned int argStart = static_cast(callerArgDsc->lvStkOffs - baseOff); + unsigned int argEnd = argStart + comp->lvaLclSize(callerArgLclNum); +#endif + + // If ranges do not overlap then this PUTARG_STK will not mess up the arg. + if ((overwrittenEnd <= argStart) || (overwrittenStart >= argEnd)) + continue; + + // Codegen cannot handle a disjoint overlapping copy. For example, if + // we have + // bar(S16 stack, S32 stack2) + // foo(S32 stack, S32 stack2) { bar(..., stack) } + // then we may end up having to move 'stack' in foo 16 bytes ahead. + // It is possible that this PUTARG_STK is the only use, in which case we will + // need to introduce a temp + // look for uses starting from it. Note that we assume non-disjoint copies are OK. + GenTree* lookForUsesFrom = put->gtNext; + if (overwrittenStart != argStart) + lookForUsesFrom = insertionPoint; + + RehomeArgForFastTailCall(callerArgLclNum, insertionPoint, lookForUsesFrom, call); + // The call can introduce temps and invalidate the pointer. + callerArgDsc = comp->lvaTable + callerArgLclNum; + + // For promoted locals we have more work to do as its fields could also have been invalidated. + if (!callerArgDsc->lvPromoted) + continue; + + unsigned int fieldsFirst = callerArgDsc->lvFieldLclStart; + unsigned int fieldsEnd = fieldsFirst + callerArgDsc->lvFieldCnt; + for (unsigned int j = fieldsFirst; j < fieldsEnd; j++) + RehomeArgForFastTailCall(j, insertionPoint, lookForUsesFrom, call); + } + } } // Insert GT_PROF_HOOK node to emit profiler tail call hook. This should be diff --git a/src/jit/lower.h b/src/jit/lower.h index e0a7c64d2f97..fd5482a14fbd 100644 --- a/src/jit/lower.h +++ b/src/jit/lower.h @@ -137,6 +137,10 @@ class Lowering : public Phase GenTree* LowerDirectCall(GenTreeCall* call); GenTree* LowerNonvirtPinvokeCall(GenTreeCall* call); GenTree* LowerTailCallViaHelper(GenTreeCall* callNode, GenTree* callTarget); + void RehomeArgForFastTailCall(unsigned int lclNum, + GenTree* insertTempBefore, + GenTree* lookForUsesStart, + GenTreeCall* callNode); void LowerFastTailCall(GenTreeCall* callNode); void InsertProfTailCallHook(GenTreeCall* callNode, GenTree* insertionPoint); GenTree* LowerVirtualVtableCall(GenTreeCall* call); diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index b91d40845942..0445b61c09f2 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -7019,7 +7019,8 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) fgInitArgInfo(callee); fgArgInfo* argInfo = callee->fgArgInfo; - auto reportFastTailCallDecision = [this, callee](const char* msg, size_t callerStackSize, size_t calleeStackSize) { + auto reportFastTailCallDecision = [this, callee](const char* msg, size_t callerArgStackSize, + size_t calleeArgStackSize) { #if DEBUG if ((JitConfig.JitReportFastTailCallDecisions()) == 1) { @@ -7039,9 +7040,10 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) info.compFullName); } - if (callerStackSize != -1) + if (callerArgStackSize != -1) { - printf("%s (CallerStackSize: %d, CalleeStackSize: %d)\n\n", msg, callerStackSize, calleeStackSize); + printf("%s (CallerStackSize: %d, CalleeStackSize: %d)\n\n", msg, callerArgStackSize, + calleeArgStackSize); } else { @@ -7078,54 +7080,19 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) } #endif // (defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM_)) || defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)) - // Count user args while tracking whether any of them has a larger than one - // stack slot sized requirement. This requirement is required to support - // lowering the fast tail call, which, currently only supports copying - // stack slot arguments which have only one stack slot. - // - // For each struct arg, determine whether the argument would have > 1 stack - // slot if on the stack. If it has > 1 stack slot we will not fastTailCall. - // This is an implementation limitation of LowerFastTailCall that is tracked by: - // https://github.com/dotnet/coreclr/issues/12644. - - bool hasLargerThanOneStackSlotSizedStruct = false; - bool hasNonEnregisterableStructs = false; - bool hasByrefParameter = false; - size_t calleeStackSize = 0; + bool hasByrefParameter = false; + size_t calleeArgStackSize = 0; for (unsigned index = 0; index < argInfo->ArgCount(); ++index) { fgArgTabEntry* arg = argInfo->GetArgEntry(index, false); - unsigned argStackSize = arg->stackSize(); - unsigned argFloatRegCount = arg->floatRegCount(); - unsigned argIntRegCount = arg->intRegCount(); - -#if !defined(FEATURE_ARG_SPLIT) - if (argStackSize > 0) - { - assert(argFloatRegCount == 0 && argIntRegCount == 0); - } -#endif // !defined(FEATURE_ARG_SPLIT) - - unsigned countRegistersUsedForArg = argIntRegCount + argFloatRegCount; + unsigned argStackSize = arg->stackSize(); - calleeStackSize += argStackSize; + calleeArgStackSize += argStackSize; - // This exists to account for special case situations where we will not - // fast tail call. if (arg->isStruct) { - if (argStackSize > 0) - { - hasNonEnregisterableStructs = true; - } - - if (countRegistersUsedForArg > 1) - { - hasLargerThanOneStackSlotSizedStruct = true; - } - // Byref struct arguments are not allowed to fast tail call as the information // of the caller's stack is lost when the callee is compiled. if (arg->passedByRef) @@ -7136,100 +7103,42 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) } } + size_t callerArgStackSize = info.compArgStackSize; + if (callee->HasRetBufArg()) // RetBuf { // If callee has RetBuf param, caller too must have it. // Otherwise go the slow route. if (info.compRetBuffArg == BAD_VAR_NUM) { - reportFastTailCallDecision("Callee has RetBuf but caller does not.", 0, 0); + reportFastTailCallDecision("Callee has RetBuf but caller does not.", callerArgStackSize, + calleeArgStackSize); return false; } } - const unsigned maxRegArgs = MAX_REG_ARG; - size_t callerStackSize = info.compArgStackSize; - hasLargerThanOneStackSlotSizedStruct = hasLargerThanOneStackSlotSizedStruct || info.compHasMultiSlotArgs; - - bool hasStackArgs = false; - if (callerStackSize > 0 || calleeStackSize > 0) - { - hasStackArgs = true; - } - + // For Windows some struct parameters are copied on the local frame + // and then passed by reference. We cannot fast tail call in these situation + // as we need to keep our frame around. if (hasByrefParameter) { - reportFastTailCallDecision("Callee has a byref parameter", 0, 0); - return false; - } - -// If we reached here means that callee has only those argument types which can be passed in -// a register and if passed on stack will occupy exactly one stack slot in out-going arg area. -// If we are passing args on stack for the callee and it has a larger stack size than -// the caller, then fast tail call cannot be performed. -// -// Note that the GC'ness of on stack args need not match since the arg setup area is marked -// as non-interruptible for fast tail calls. - -#ifdef WINDOWS_AMD64_ABI - // x64 Windows: If we have stack args then make sure the callee's incoming - // arguments is less than the caller's - if (hasStackArgs && (calleeStackSize > callerStackSize)) - { - reportFastTailCallDecision("Will not fastTailCall hasStackArgs && (calleeStackSize > callerStackSize)", - callerStackSize, calleeStackSize); - return false; - } - -#elif (defined(_TARGET_AMD64_) && defined(UNIX_AMD64_ABI)) || defined(_TARGET_ARM64_) - - // For unix Amd64 and Arm64 check to see if all arguments for the callee - // and caller are passing in registers. If not, ensure that the outgoing argument stack size - // requirement for the callee is less than or equal to the caller's entire stack frame usage. - // - // Also, in the case that we have to pass arguments on the stack make sure - // that we are not dealing with structs that are >8 bytes. - - // Either the caller or callee has a >8 and <=16 byte struct and arguments that has to go on the stack. Do not - // fastTailCall. - // - // When either the caller or callee have multi-stlot stack arguments we cannot safely - // shuffle arguments in LowerFastTailCall. See https://github.com/dotnet/coreclr/issues/12468. - if (hasLargerThanOneStackSlotSizedStruct && calleeStackSize > 0) - { - reportFastTailCallDecision("Will not fastTailCall hasLargerThanOneStackSlotSizedStruct && calleeStackSize", - callerStackSize, calleeStackSize); + reportFastTailCallDecision("Callee has a byref parameter", callerArgStackSize, calleeArgStackSize); return false; } - if (hasNonEnregisterableStructs) + // For a fast tail call the caller will use its incoming arg stack space to place + // arguments, so if the callee requires more arg stack space than is available here + // the fast tail call cannot be performed. This is common to all platforms. + // Note that the GC'ness of on stack args need not match since the arg setup area is marked + // as non-interruptible for fast tail calls. + if (calleeArgStackSize > callerArgStackSize) { - reportFastTailCallDecision("Will not fastTailCall hasNonEnregisterableStructs", callerStackSize, - calleeStackSize); - return false; - } - - // TODO-AMD64-Unix - // TODO-ARM64 - // - // LowerFastTailCall currently assumes nCalleeArgs <= nCallerArgs. This is - // not true in many cases on x64 linux, remove this pessimization when - // LowerFastTailCall is fixed. See https://github.com/dotnet/coreclr/issues/12468 - // for more information. - if (hasStackArgs && (calleeStackSize > callerStackSize)) - { - reportFastTailCallDecision("Will not fastTailCall hasStackArgs && (calleeStackSize > callerStackSize)", - callerStackSize, calleeStackSize); + reportFastTailCallDecision("Will not fastTailCall (calleeArgStackSize > callerArgStackSize)", + callerArgStackSize, calleeArgStackSize); return false; } -#else - - NYI("fastTailCall not supported on this Architecture."); - -#endif // WINDOWS_AMD64_ABI - - reportFastTailCallDecision("Will fastTailCall", callerStackSize, calleeStackSize); + reportFastTailCallDecision("Will fastTailCall", callerArgStackSize, calleeArgStackSize); return true; #else // FEATURE_FASTTAILCALL return false;