diff --git a/src/jit/compiler.h b/src/jit/compiler.h index 674ec5394d01..9a7e06447f75 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -1327,6 +1327,14 @@ struct fgArgTabEntry // Note that on ARM, if we have a double hfa, this reflects the number // of DOUBLE registers. +#if defined(UNIX_AMD64_ABI) + // Unix amd64 will split floating point types and integer types in structs + // between floating point and general purpose registers. Keep track of that + // information so we do not need to recompute it later. + unsigned structIntRegs; + unsigned structFloatRegs; +#endif // UNIX_AMD64_ABI + // A slot is a pointer sized region in the OutArg area. unsigned slotNum; // When an argument is passed in the OutArg area this is the slot number in the OutArg area unsigned numSlots; // Count of number of slots that this argument uses @@ -1453,6 +1461,45 @@ struct fgArgTabEntry #endif } + unsigned intRegCount() + { +#if defined(UNIX_AMD64_ABI) + if (this->isStruct) + { + return this->structIntRegs; + } +#endif // defined(UNIX_AMD64_ABI) + + if (!this->isPassedInFloatRegisters()) + { + return this->numRegs; + } + + return 0; + } + + unsigned floatRegCount() + { +#if defined(UNIX_AMD64_ABI) + if (this->isStruct) + { + return this->structFloatRegs; + } +#endif // defined(UNIX_AMD64_ABI) + + if (this->isPassedInFloatRegisters()) + { + return this->numRegs; + } + + return 0; + } + + unsigned stackSize() + { + return (TARGET_POINTER_SIZE * this->numSlots); + } + __declspec(property(get = GetHfaType)) var_types hfaType; var_types GetHfaType() { @@ -1728,6 +1775,8 @@ class fgArgInfo const bool isStruct, const bool isVararg, const regNumber otherRegNum, + const unsigned structIntRegs, + const unsigned structFloatRegs, const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* const structDescPtr = nullptr); #endif // UNIX_AMD64_ABI diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp index c635c327c58a..e8f0717c1e87 100644 --- a/src/jit/gentree.cpp +++ b/src/jit/gentree.cpp 
@@ -1078,7 +1078,15 @@ bool GenTreeCall::AreArgsComplete() const assert((gtCallLateArgs != nullptr) || !fgArgInfo->HasRegArgs()); return true; } + +#if defined(FEATURE_FASTTAILCALL) +// If we have FEATURE_FASTTAILCALL, 'fgCanFastTailCall()' can call 'fgInitArgInfo()', and in that +// scenario it is valid to have 'fgArgInfo' be non-null when 'fgMorphArgs()' first queries this, +// when it hasn't yet morphed the arguments. +#else assert(gtCallArgs == nullptr); +#endif + return false; } diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index 809f48f0f876..b06591042bc3 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -2015,7 +2015,6 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) fgArgTabEntry* argTabEntry = comp->gtArgEntryByNode(call, putArgStkNode); assert(argTabEntry); unsigned callerArgNum = argTabEntry->argNum - calleeNonStandardArgCount; - noway_assert(callerArgNum < comp->info.compArgsCount); unsigned callerArgLclNum = callerArgNum; LclVarDsc* callerArgDsc = comp->lvaTable + callerArgLclNum; diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 5450b4396cf5..b91d40845942 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -1177,12 +1177,16 @@ fgArgTabEntry* fgArgInfo::AddRegArg(unsigned const bool isStruct, const bool isVararg, const regNumber otherRegNum, + const unsigned structIntRegs, + const unsigned structFloatRegs, const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* const structDescPtr) { fgArgTabEntry* curArgTabEntry = AddRegArg(argNum, node, parent, regNum, numRegs, alignment, isStruct, isVararg); assert(curArgTabEntry != nullptr); - curArgTabEntry->isStruct = isStruct; // is this a struct arg + curArgTabEntry->isStruct = isStruct; // is this a struct arg + curArgTabEntry->structIntRegs = structIntRegs; + curArgTabEntry->structFloatRegs = structFloatRegs; curArgTabEntry->checkIsStruct(); assert(numRegs <= 2); @@ -1213,12 +1217,16 @@ fgArgTabEntry* fgArgInfo::AddStkArg(unsigned argNum, nextSlotNum = roundUp(nextSlotNum, 
alignment); curArgTabEntry->setRegNum(0, REG_STK); - curArgTabEntry->argNum = argNum; - curArgTabEntry->node = node; - curArgTabEntry->argType = node->TypeGet(); - curArgTabEntry->parent = parent; - curArgTabEntry->slotNum = nextSlotNum; - curArgTabEntry->numRegs = 0; + curArgTabEntry->argNum = argNum; + curArgTabEntry->node = node; + curArgTabEntry->argType = node->TypeGet(); + curArgTabEntry->parent = parent; + curArgTabEntry->slotNum = nextSlotNum; + curArgTabEntry->numRegs = 0; +#if defined(UNIX_AMD64_ABI) + curArgTabEntry->structIntRegs = 0; + curArgTabEntry->structFloatRegs = 0; +#endif // defined(UNIX_AMD64_ABI) curArgTabEntry->numSlots = numSlots; curArgTabEntry->alignment = alignment; curArgTabEntry->lateArgInx = UINT_MAX; @@ -2958,7 +2966,8 @@ void Compiler::fgInitArgInfo(GenTreeCall* call) // This is a register argument - put it in the table. call->fgArgInfo->AddRegArg(argIndex, argx, nullptr, genMapIntRegArgNumToRegNum(intArgRegNum), 1, 1, false, - callIsVararg UNIX_AMD64_ABI_ONLY_ARG(REG_STK) UNIX_AMD64_ABI_ONLY_ARG(nullptr)); + callIsVararg UNIX_AMD64_ABI_ONLY_ARG(REG_STK) UNIX_AMD64_ABI_ONLY_ARG(0) + UNIX_AMD64_ABI_ONLY_ARG(0) UNIX_AMD64_ABI_ONLY_ARG(nullptr)); intArgRegNum++; #ifdef WINDOWS_AMD64_ABI @@ -3535,12 +3544,12 @@ void Compiler::fgInitArgInfo(GenTreeCall* call) if (structDesc.IsIntegralSlot(i)) { *nextRegNumPtrs[i] = genMapIntRegArgNumToRegNum(intArgRegNum + structIntRegs); - structIntRegs++; + ++structIntRegs; } else if (structDesc.IsSseSlot(i)) { *nextRegNumPtrs[i] = genMapFloatRegArgNumToRegNum(nextFltArgRegNum + structFloatRegs); - structFloatRegs++; + ++structFloatRegs; } } } @@ -3561,7 +3570,9 @@ void Compiler::fgInitArgInfo(GenTreeCall* call) // This is a register argument - put it in the table newArgEntry = call->fgArgInfo->AddRegArg(argIndex, argx, args, nextRegNum, size, argAlign, isStructArg, callIsVararg UNIX_AMD64_ABI_ONLY_ARG(nextOtherRegNum) - UNIX_AMD64_ABI_ONLY_ARG(&structDesc)); + UNIX_AMD64_ABI_ONLY_ARG(structIntRegs) + 
UNIX_AMD64_ABI_ONLY_ARG(structFloatRegs) + UNIX_AMD64_ABI_ONLY_ARG(&structDesc)); newArgEntry->SetIsBackFilled(isBackFilled); newArgEntry->isNonStandard = isNonStandard; @@ -5123,6 +5134,8 @@ void Compiler::fgMakeOutgoingStructArgCopy(GenTreeCall* call, // struct parameters if they are passed as arguments to a tail call. if (!call->IsTailCallViaHelper() && (varDsc->lvRefCnt(RCS_EARLY) == 1) && !fgMightHaveLoop()) { + assert(!call->IsTailCall()); + varDsc->setLvRefCnt(0, RCS_EARLY); args->gtOp.gtOp1 = lcl; argEntry->node = lcl; @@ -6892,7 +6905,7 @@ void Compiler::fgMorphCallInlineHelper(GenTreeCall* call, InlineResult* result) // caller(int, int, int, int) // callee(int, int, float, int) // -// -- Callee requires stack space that is equal to the caller -- +// -- Callee requires stack space that is equal to or less than the caller -- // caller(struct, struct, struct, struct, struct, struct) // callee(int, int, int, int, int, int) // @@ -6910,6 +6923,10 @@ void Compiler::fgMorphCallInlineHelper(GenTreeCall* call, InlineResult* result) // caller(struct, double, struct, float, struct, struct) // callee(int, int, int, int, int, double, double, double) // +// -- Callee has a byref struct argument -- +// caller(int, int, int) +// callee(struct(size 3 bytes)) +// // Unix Amd64 && Arm64: // A fastTailCall decision can be made whenever the callee's stack space is // less than or equal to the caller's stack space. There are many permutations @@ -6935,12 +6952,6 @@ void Compiler::fgMorphCallInlineHelper(GenTreeCall* call, InlineResult* result) // decision of do not fast tail call is taken. This limitations should be // removed if/when fgMorphArgs no longer depends on fgCanFastTailCall. // -// 4) Arm64 Only, if there are HFA arguments and the callee has stack -// arguments, the decision will be reported as cannot fast tail call. -// This is because before fgMorphArgs is done, the struct is unknown whether it -// will be placed on the stack or enregistered.
Therefore, the conservative -// decision of do not fast tail call is taken. -// // Can fast tail call examples (amd64 Unix): // // -- Callee will have all register arguments -- @@ -6948,14 +6959,16 @@ void Compiler::fgMorphCallInlineHelper(GenTreeCall* call, InlineResult* result) // callee(int, int, float, int) // // -- Callee requires stack space that is equal to the caller -- -// caller({ int, int }, { int, int }, { int }, { int }, { int }, { int }) -- 6 int register arguments, 16 byte stack +// caller({ long, long }, { int, int }, { int }, { int }, { int }, { int }) -- 6 int register arguments, 16 byte +// stack // space // callee(int, int, int, int, int, int, int, int) -- 6 int register arguments, 16 byte stack space // // -- Callee requires stack space that is less than the caller -- -// caller({ int, int }, int, { int, int }, int, { int, int }, { int, int }) 6 int register arguments, 32 byte stack +// caller({ long, long }, int, { long, long }, int, { long, long }, { long, long }) 6 int register arguments, 32 byte +// stack // space -// callee(int, int, int, int, int, int, { int, int } ) // 6 int register arguments, 16 byte stack space +// callee(int, int, int, int, int, int, { long, long } ) // 6 int register arguments, 16 byte stack space // // -- Callee will have all register arguments -- // caller(int) @@ -7001,6 +7014,11 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) } #endif + assert(!callee->AreArgsComplete()); + + fgInitArgInfo(callee); + fgArgInfo* argInfo = callee->fgArgInfo; + auto reportFastTailCallDecision = [this, callee](const char* msg, size_t callerStackSize, size_t calleeStackSize) { #if DEBUG if ((JitConfig.JitReportFastTailCallDecisions()) == 1) @@ -7040,273 +7058,153 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) #endif // DEBUG }; - // Note on vararg methods: - // If the caller is vararg method, we don't know the number of arguments passed by caller's caller. 
- // But we can be sure that in-coming arg area of vararg caller would be sufficient to hold its - // fixed args. Therefore, we can allow a vararg method to fast tail call other methods as long as - // out-going area required for callee is bounded by caller's fixed argument space. - // - // Note that callee being a vararg method is not a problem since we can account the params being passed. - unsigned nCallerArgs = info.compArgsCount; - - size_t callerArgRegCount = codeGen->intRegState.rsCalleeRegArgCount; - size_t callerFloatArgRegCount = codeGen->floatRegState.rsCalleeRegArgCount; - - // Count the callee args including implicit and hidden. - // Note that GenericContext and VarargCookie are added by importer while - // importing the call to gtCallArgs list along with explicit user args. - size_t calleeArgRegCount = 0; - size_t calleeFloatArgRegCount = 0; +// Note on vararg methods: +// If the caller is vararg method, we don't know the number of arguments passed by caller's caller. +// But we can be sure that in-coming arg area of vararg caller would be sufficient to hold its +// fixed args. Therefore, we can allow a vararg method to fast tail call other methods as long as +// out-going area required for callee is bounded by caller's fixed argument space. +// +// Note that callee being a vararg method is not a problem since we can account the params being passed. +// +// We will currently decide to not fast tail call on Windows armarch if the caller or callee is a vararg +// method. This is due to the ABI differences for native vararg methods for these platforms. There is +// work required to shuffle arguments to the correct locations. 
- if (callee->gtCallObjp) // thisPtr +#if (defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM_)) || (defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)) + if (info.compIsVarArgs || callee->IsVarargs()) { - ++calleeArgRegCount; + reportFastTailCallDecision("Fast tail calls with varargs not supported on Windows ARM/ARM64", 0, 0); + return false; } +#endif // (defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM_)) || (defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)) - if (callee->HasRetBufArg()) // RetBuf - { - // We don't increment calleeArgRegCount here, since it is already in callee->gtCallArgs. + // Count user args while tracking whether any of them has a larger than one + // stack slot sized requirement. This is required to support + // lowering the fast tail call, which currently only supports copying + // stack slot arguments which have only one stack slot. + // + // For each struct arg, determine whether the argument would have > 1 stack + // slot if on the stack. If it has > 1 stack slot we will not fastTailCall. + // This is an implementation limitation of LowerFastTailCall that is tracked by: + // https://github.com/dotnet/coreclr/issues/12644. - // If callee has RetBuf param, caller too must have it. - // Otherwise go the slow route. - if (info.compRetBuffArg == BAD_VAR_NUM) - { - reportFastTailCallDecision("Callee has RetBuf but caller does not.", 0, 0); - return false; - } - } + bool hasLargerThanOneStackSlotSizedStruct = false; + bool hasNonEnregisterableStructs = false; + bool hasByrefParameter = false; + size_t calleeStackSize = 0; - // Count user args while tracking whether any of them is a multi-byte params - // that cannot be passed in a register. Note that we don't need to count - // non-standard and secret params passed in registers (e.g. R10, R11) since - // these won't contribute to out-going arg size. - // For each struct arg, hasMultiByteStackArgs will track if it can be passed in registers.
- // If it cannot we will break the loop and not fastTailCall. This is an implementation limitation - // where the callee only is checked for non enregisterable structs. - // It is tracked with https://github.com/dotnet/coreclr/issues/12644. - bool hasMultiByteStackArgs = false; - bool hasTwoSlotSizedStruct = false; - bool hasHfaArg = false; - size_t nCalleeArgs = calleeArgRegCount; // Keep track of how many args we have. - size_t calleeStackSize = 0; - for (GenTree* args = callee->gtCallArgs; (args != nullptr); args = args->gtOp.gtOp2) + for (unsigned index = 0; index < argInfo->ArgCount(); ++index) { - ++nCalleeArgs; - assert(args->OperIsList()); - GenTree* argx = args->gtOp.gtOp1; + fgArgTabEntry* arg = argInfo->GetArgEntry(index, false); - if (varTypeIsStruct(argx)) + unsigned argStackSize = arg->stackSize(); + unsigned argFloatRegCount = arg->floatRegCount(); + unsigned argIntRegCount = arg->intRegCount(); + +#if !defined(FEATURE_ARG_SPLIT) + if (argStackSize > 0) { - // Actual arg may be a child of a GT_COMMA. Skip over comma opers. - argx = argx->gtEffectiveVal(true /*commaOnly*/); + assert(argFloatRegCount == 0 && argIntRegCount == 0); + } +#endif // !defined(FEATURE_ARG_SPLIT) - // Get the size of the struct and see if it is register passable. - CORINFO_CLASS_HANDLE objClass = nullptr; + unsigned countRegistersUsedForArg = argIntRegCount + argFloatRegCount; - if (argx->OperGet() == GT_OBJ) + calleeStackSize += argStackSize; + + // This exists to account for special case situations where we will not + // fast tail call. 
+ if (arg->isStruct) + { + if (argStackSize > 0) { - objClass = argx->AsObj()->gtClass; + hasNonEnregisterableStructs = true; } - else if (argx->IsLocal()) + + if (countRegistersUsedForArg > 1) { - objClass = lvaTable[argx->AsLclVarCommon()->gtLclNum].lvVerTypeInfo.GetClassHandle(); + hasLargerThanOneStackSlotSizedStruct = true; } - if (objClass != nullptr) - { -#if defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_) - - unsigned typeSize = 0; - // We should have already broken out of the loop if we've set hasMultiByteStackArgs to true. - assert(!hasMultiByteStackArgs); - hasMultiByteStackArgs = - !VarTypeIsMultiByteAndCanEnreg(argx->TypeGet(), objClass, &typeSize, false, false); - -#if defined(UNIX_AMD64_ABI) - SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; - - assert(objClass != nullptr); - eeGetSystemVAmd64PassStructInRegisterDescriptor(objClass, &structDesc); - - if (structDesc.passedInRegisters) - { - if (structDesc.eightByteCount == 2) - { - hasTwoSlotSizedStruct = true; - } - - for (unsigned int i = 0; i < structDesc.eightByteCount; i++) - { - if (structDesc.IsIntegralSlot(i)) - { - ++calleeArgRegCount; - } - else if (structDesc.IsSseSlot(i)) - { - ++calleeFloatArgRegCount; - } - else - { - assert(false && "Invalid eightbyte classification type."); - break; - } - } - } - else - { - calleeStackSize += roundUp(typeSize, TARGET_POINTER_SIZE); - hasMultiByteStackArgs = true; - } - -#elif defined(_TARGET_ARM64_) // ARM64 - var_types hfaType = GetHfaType(argx); - bool isHfaArg = varTypeIsValidHfaType(hfaType); - size_t size = 1; - - if (isHfaArg) - { - hasHfaArg = true; - - calleeFloatArgRegCount += GetHfaCount(argx); - } - else - { - // Structs are either passed in 1 or 2 (64-bit) slots - size_t roundupSize = roundUp(typeSize, TARGET_POINTER_SIZE); - size = roundupSize / TARGET_POINTER_SIZE; - - if (size > 2) - { - size = 1; - } - - else if (size == 2) - { - hasTwoSlotSizedStruct = true; - } - - calleeArgRegCount += size; - } - -#elif 
defined(WINDOWS_AMD64_ABI) - - ++calleeArgRegCount; -#endif // UNIX_AMD64_ABI - -#else - assert(!"Target platform ABI rules regarding passing struct type args in registers"); - unreached(); -#endif //_TARGET_AMD64_ || _TARGET_ARM64_ - } - else + // Byref struct arguments are not allowed to fast tail call as the information + // of the caller's stack is lost when the callee is compiled. + if (arg->passedByRef) { - hasMultiByteStackArgs = true; + hasByrefParameter = true; + break; } } - else - { - varTypeIsFloating(argx) ? ++calleeFloatArgRegCount : ++calleeArgRegCount; - } + } - // We can break early on multiByte cases. - if (hasMultiByteStackArgs) + if (callee->HasRetBufArg()) // RetBuf + { + // If callee has RetBuf param, caller too must have it. + // Otherwise go the slow route. + if (info.compRetBuffArg == BAD_VAR_NUM) { - break; + reportFastTailCallDecision("Callee has RetBuf but caller does not.", 0, 0); + return false; } } - const unsigned maxRegArgs = MAX_REG_ARG; - hasTwoSlotSizedStruct = hasTwoSlotSizedStruct || info.compHasMultiSlotArgs; - -// If we reached here means that callee has only those argument types which can be passed in -// a register and if passed on stack will occupy exactly one stack slot in out-going arg area. -// If we are passing args on stack for the callee and it has more args passed on stack than -// the caller, then fast tail call cannot be performed. -// -// Note that the GC'ness of on stack args need not match since the arg setup area is marked -// as non-interruptible for fast tail calls. - -#ifdef WINDOWS_AMD64_ABI - assert(calleeStackSize == 0); - size_t calleeStackSlots = ((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) - ? 
(calleeArgRegCount + calleeFloatArgRegCount) - maxRegArgs - : 0; - calleeStackSize = calleeStackSlots * TARGET_POINTER_SIZE; - size_t callerStackSize = info.compArgStackSize; + const unsigned maxRegArgs = MAX_REG_ARG; + size_t callerStackSize = info.compArgStackSize; + hasLargerThanOneStackSlotSizedStruct = hasLargerThanOneStackSlotSizedStruct || info.compHasMultiSlotArgs; bool hasStackArgs = false; - if (callerStackSize > 0 || calleeStackSize > 0) { hasStackArgs = true; } - // Go the slow route, if it has multi-byte params. This is an implementation - // limitatio see https://github.com/dotnet/coreclr/issues/12644. - if (hasMultiByteStackArgs) + if (hasByrefParameter) { - reportFastTailCallDecision("Will not fastTailCall hasMultiByteStackArgs", callerStackSize, calleeStackSize); + reportFastTailCallDecision("Callee has a byref parameter", 0, 0); return false; } - // x64 Windows: If we have more callee registers used than MAX_REG_ARG, then - // make sure the callee's incoming arguments is less than the caller's - if (hasStackArgs && (nCalleeArgs > nCallerArgs)) +// Reaching here means that the callee has only those argument types which can be passed in +// a register and if passed on stack will occupy exactly one stack slot in out-going arg area. +// If we are passing args on stack for the callee and it has a larger stack size than +// the caller, then fast tail call cannot be performed. +// +// Note that the GC'ness of on stack args need not match since the arg setup area is marked +// as non-interruptible for fast tail calls.
+ +#ifdef WINDOWS_AMD64_ABI + // x64 Windows: If we have stack args then make sure the size of the callee's incoming + // arguments is less than or equal to the caller's + if (hasStackArgs && (calleeStackSize > callerStackSize)) { - reportFastTailCallDecision("Will not fastTailCall hasStackArgs && (nCalleeArgs > nCallerArgs)", callerStackSize, - calleeStackSize); + reportFastTailCallDecision("Will not fastTailCall hasStackArgs && (calleeStackSize > callerStackSize)", + callerStackSize, calleeStackSize); return false; } #elif (defined(_TARGET_AMD64_) && defined(UNIX_AMD64_ABI)) || defined(_TARGET_ARM64_) - // For *nix Amd64 and Arm64 check to see if all arguments for the callee + // For Unix Amd64 and Arm64 check to see if all arguments for the callee // and caller are passing in registers. If not, ensure that the outgoing argument stack size // requirement for the callee is less than or equal to the caller's entire stack frame usage. // // Also, in the case that we have to pass arguments on the stack make sure // that we are not dealing with structs that are >8 bytes. - bool hasStackArgs = false; - size_t maxFloatRegArgs = MAX_FLOAT_REG_ARG; - - size_t calleeIntStackArgCount = calleeArgRegCount > maxRegArgs ? calleeArgRegCount - maxRegArgs : 0; - size_t calleeFloatStackArgCount = - calleeFloatArgRegCount > maxFloatRegArgs ? calleeFloatArgRegCount - maxFloatRegArgs : 0; - - size_t calleeStackArgCount = calleeIntStackArgCount + calleeFloatStackArgCount; - size_t callerStackSize = info.compArgStackSize; - calleeStackSize += calleeStackArgCount * TARGET_POINTER_SIZE; - - if (callerStackSize > 0 || calleeStackSize > 0) - { - hasStackArgs = true; - } - - // Go the slow route, if it has multi-byte params. This is an implementation - // limitation see https://github.com/dotnet/coreclr/issues/12644.
- if (hasMultiByteStackArgs) - { - reportFastTailCallDecision("Will not fastTailCall hasMultiByteStackArgs", callerStackSize, calleeStackSize); - return false; - } - // Either the caller or callee has a >8 and <=16 byte struct and arguments that has to go on the stack. Do not // fastTailCall. // // When either the caller or callee have multi-stlot stack arguments we cannot safely // shuffle arguments in LowerFastTailCall. See https://github.com/dotnet/coreclr/issues/12468. - if (hasStackArgs && hasTwoSlotSizedStruct) + if (hasLargerThanOneStackSlotSizedStruct && calleeStackSize > 0) { - reportFastTailCallDecision("Will not fastTailCall calleeStackSize > 0 && hasTwoSlotSizedStruct", + reportFastTailCallDecision("Will not fastTailCall hasLargerThanOneStackSlotSizedStruct && calleeStackSize", callerStackSize, calleeStackSize); return false; } - // Callee has an HFA struct and arguments that has to go on the stack. Do not fastTailCall. - if (calleeStackSize > 0 && hasHfaArg) + if (hasNonEnregisterableStructs) { - reportFastTailCallDecision("Will not fastTailCall calleeStackSize > 0 && hasHfaArg", callerStackSize, + reportFastTailCallDecision("Will not fastTailCall hasNonEnregisterableStructs", callerStackSize, calleeStackSize); return false; } @@ -7318,17 +7216,10 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // not true in many cases on x64 linux, remove this pessimization when // LowerFastTailCall is fixed. See https://github.com/dotnet/coreclr/issues/12468 // for more information. 
- if (hasStackArgs && (nCalleeArgs > nCallerArgs)) + if (hasStackArgs && (calleeStackSize > callerStackSize)) { - reportFastTailCallDecision("Will not fastTailCall hasStackArgs && (nCalleeArgs > nCallerArgs)", callerStackSize, - calleeStackSize); - return false; - } - - if (calleeStackSize > callerStackSize) - { - reportFastTailCallDecision("Will not fastTailCall calleeStackSize > callerStackSize", callerStackSize, - calleeStackSize); + reportFastTailCallDecision("Will not fastTailCall hasStackArgs && (calleeStackSize > callerStackSize)", + callerStackSize, calleeStackSize); return false; } @@ -8251,7 +8142,7 @@ GenTree* Compiler::fgMorphCall(GenTreeCall* call) szFailReason = "Method with non-standard args passed in callee trash register cannot be tail " "called via helper"; } -#ifdef _TARGET_ARM64_ +#if defined(_TARGET_ARM64_) || defined(_TARGET_UNIX_) else { // NYI - TAILCALL_RECURSIVE/TAILCALL_HELPER. @@ -8327,6 +8218,9 @@ GenTree* Compiler::fgMorphCall(GenTreeCall* call) // temp = call // ret temp + // Force re-evaluating the argInfo as the return argument has changed. + call->fgArgInfo = nullptr; + // Create a new temp. unsigned tmpNum = lvaGrabTemp(false DEBUGARG("Return value temp for multi-reg return (rejected tail call).")); @@ -8415,6 +8309,10 @@ GenTree* Compiler::fgMorphCall(GenTreeCall* call) if (!canFastTailCall) { fgMorphTailCall(call, pfnCopyArgs); + + // Force re-evaluating the argInfo. fgMorphTailCall will modify the + // argument list, invalidating the argInfo. + call->fgArgInfo = nullptr; } // Implementation note : If we optimize tailcall to do a direct jump @@ -8696,7 +8594,9 @@ GenTree* Compiler::fgMorphCall(GenTreeCall* call) if (info.compCompHnd->isStructRequiringStackAllocRetBuf(structHnd) && !(dest->OperGet() == GT_LCL_VAR && dest->gtLclVar.gtLclNum == info.compRetBuffArg)) { - origDest = dest; + // Force re-evaluating the argInfo as the return argument has changed. 
+ call->fgArgInfo = nullptr; + origDest = dest; retValTmpNum = lvaGrabTemp(true DEBUGARG("substitute local for ret buff arg")); lvaSetStruct(retValTmpNum, structHnd, true); diff --git a/tests/src/JIT/opt/FastTailCall/FastTailCallCandidates.cs b/tests/src/JIT/opt/FastTailCall/FastTailCallCandidates.cs index dd2e33198b8f..190714909973 100644 --- a/tests/src/JIT/opt/FastTailCall/FastTailCallCandidates.cs +++ b/tests/src/JIT/opt/FastTailCall/FastTailCallCandidates.cs @@ -65,6 +65,12 @@ public static int Tester(int a) CheckOutput(DoubleCountRetBuffCaller(1)); CheckOutput(Struct32CallerWrapper()); CheckOutput(Struct32CallerWrapperCalleeHasStack(2)); + CheckOutput(CallerEnregisterableAmd64WindowsStructs8Bytes(1, 2)); + CheckOutput(CallerAmd64WindowsStructs7Bytes(1, 2)); + CheckOutput(CallerAmd64WindowsStructs6Bytes(1, 2)); + CheckOutput(CallerAmd64WindowsStructs5Bytes(1, 2)); + CheckOutput(CallerAmd64WindowsStructs4Bytes(1, 2)); + CheckOutput(CallerAmd64WindowsStructs3Bytes(1, 2)); return s_ret_value; @@ -844,6 +850,94 @@ public static int CallerHFaCaseCalleeStackArgs(double one, // Stack Based args. 
//////////////////////////////////////////////////////////////////////////// + public struct StructSizeOneNotExplicit + { + public byte a; + + public StructSizeOneNotExplicit(byte a) + { + this.a = a; + } + } + + public struct StructSizeTwoNotExplicit + { + public byte a; + public byte b; + + public StructSizeTwoNotExplicit(byte a, byte b) + { + this.a = a; + this.b = b; + } + } + + public struct StructSizeThreeNotExplicit + { + public byte a; + public byte b; + public byte c; + + public StructSizeThreeNotExplicit(byte a, byte b, byte c) + { + this.a = a; + this.b = b; + this.c = c; + } + } + + public struct StructSizeFourNotExplicit + { + public int a; + + public StructSizeFourNotExplicit(int a) + { + this.a = a; + } + } + + public struct StructSizeFiveNotExplicit // NOTE(review): int and byte fields; default sequential layout presumably pads this to 8 bytes, not 5; TODO confirm + { + public int a; + public byte b; + + public StructSizeFiveNotExplicit(int a, byte b) + { + this.a = a; + this.b = b; + } + } + + public struct StructSizeSixNotExplicit // NOTE(review): presumably also padded to 8 bytes by the int field's alignment; TODO confirm + { + public int a; + public byte b; + public byte c; + + public StructSizeSixNotExplicit(int a, byte b, byte c) + { + this.a = a; + this.b = b; + this.c = c; + } + } + + public struct StructSizeSevenNotExplicit // NOTE(review): presumably also padded to 8 bytes by the int field's alignment; TODO confirm + { + public int a; + public byte b; + public byte c; + public byte d; + + public StructSizeSevenNotExplicit(int a, byte b, byte c, byte d) + { + this.a = a; + this.b = b; + this.c = c; + this.d = d; + } + } + public struct StructSizeEightNotExplicit { public long a; @@ -876,7 +970,32 @@ public StructSizeSixteenNotExplicit(long a, long b) this.a = a; this.b = b; } + } + + public struct StructSize24NotExplicit + { + public long a; + public long b; + public long c; + + public StructSize24NotExplicit(long a, long b, long c) + { + this.a = a; + this.b = b; + this.c = c; + } + } + public struct StructSize48Nested + { + public StructSize24NotExplicit a; + public StructSize24NotExplicit b; + + public StructSize48Nested(long a, long b, long c, long d, long e, long f) + { + this.a = new
StructSize24NotExplicit(d, e, f); + } } /// @@ -974,12 +1093,12 @@ public static int CallerGithubIssue12468(int one, [StructLayout(LayoutKind.Explicit, Size=8, CharSet=CharSet.Ansi)] public struct StructSizeThirtyTwo { - [FieldOffset(0)] public int a; - [FieldOffset(8)] public int b; - [FieldOffset(16)] public int c; - [FieldOffset(24)] public int d; + [FieldOffset(0)] public long a; + [FieldOffset(8)] public long b; + [FieldOffset(16)] public long c; + [FieldOffset(24)] public long d; - public StructSizeThirtyTwo(int a, int b, int c, int d) + public StructSizeThirtyTwo(long a, long b, long c, long d) { this.a = a; this.b = b; @@ -991,9 +1110,9 @@ public StructSizeThirtyTwo(int a, int b, int c, int d) [StructLayout(LayoutKind.Explicit, Size=8, CharSet=CharSet.Ansi)] public struct StructSizeTwentyFour { - [FieldOffset(0)] public int a; - [FieldOffset(8)] public int b; - [FieldOffset(16)] public int c; + [FieldOffset(0)] public long a; + [FieldOffset(8)] public long b; + [FieldOffset(16)] public long c; public StructSizeTwentyFour(int a, int b, int c) { @@ -1098,17 +1217,21 @@ public static StructSizeThirtyTwo DoubleCountRetBuffCallerWrapper(int a, int b) { if (a % 2 == 0) { - StructSizeEightIntNotExplicit eightBytes = new StructSizeEightIntNotExplicit(a, a); a = 1; - b = b + 2; - return DoubleCountRetBuffCallee(eightBytes, eightBytes, eightBytes, eightBytes, eightBytes); + return DoubleCountRetBuffCallee(new StructSizeEightIntNotExplicit(a, a), + new StructSizeEightIntNotExplicit(a, a), + new StructSizeEightIntNotExplicit(a, a), + new StructSizeEightIntNotExplicit(a, a), + new StructSizeEightIntNotExplicit(a, a)); } else { - StructSizeEightIntNotExplicit eightBytes = new StructSizeEightIntNotExplicit(b, b); - a = 4; b = b + 1; - return DoubleCountRetBuffCallee(eightBytes, eightBytes, eightBytes, eightBytes, eightBytes); + return DoubleCountRetBuffCallee(new StructSizeEightIntNotExplicit(b, b), + new StructSizeEightIntNotExplicit(b, b), + new
StructSizeEightIntNotExplicit(b, b), + new StructSizeEightIntNotExplicit(b, b), + new StructSizeEightIntNotExplicit(b, b)); } } @@ -1132,7 +1255,7 @@ public static int DoubleCountRetBuffCaller(int i) { StructSizeThirtyTwo retVal = DoubleCountRetBuffCallerWrapper(4, 2); - if (retVal.b == 4.0) + if (retVal.b == 6.0) { return 100; } @@ -1145,7 +1268,7 @@ public static int DoubleCountRetBuffCaller(int i) { StructSizeThirtyTwo retVal = DoubleCountRetBuffCallerWrapper(3, 1); - if (retVal.b == 1.0) + if (retVal.b == 2.0) { return 100; } @@ -1354,7 +1477,7 @@ public static int Struct32CalleeWithStack(long one, /// The callee uses 6 integer registers, 32 bytes of stack (3 args) /// /// Return 100 is a pass. - /// Return 113 is a failure. + /// Return 114 is a failure. /// /// public static int Struct32CallerWrapperCalleeHasStack(int two) @@ -1378,6 +1501,373 @@ public static int Struct32CallerWrapperCalleeHasStack(int two) return 100; } + /// + /// Decision to fast tail call. See CallerEnregisterableAmd64WindowsStructs8Bytes for more + /// information. + /// + public static int CalleeEnregisterableAmd64WindowsStructs8Bytes(StructSizeEightNotExplicit eightByteStruct) + { + long a = eightByteStruct.a; + + // Force this to not be inlined; count becomes the number of even i in [0, a) + int count = 0; + for (int i = 0; i < a; ++i) + { + if (i % 2 == 0) + { + ++count; + } + } + + if (count == 1000000) // the value stored here is overwritten by the if/else below + { + a = count; + } + + if (count == 1) // holds exactly when a is 1 or 2 + { + a = 100; + } + else + { + a = 115; + } + + return (int)a; + } + + /// + /// Windows x64 tail call tests + /// + /// + /// + /// All targets will fast tail call + /// + /// The caller uses 2 integer registers (2 args) + /// The callee uses 1 integer register (1 arg) + /// + /// Return 100 is a pass. + /// Return 115 is a failure.
+ /// + /// + public static int CallerEnregisterableAmd64WindowsStructs8Bytes(int a, int b) + { + if (a % 2 == 0) // even a forwards a to the callee, odd a forwards b + { + return CalleeEnregisterableAmd64WindowsStructs8Bytes(new StructSizeEightNotExplicit(a)); + } + else + { + return CalleeEnregisterableAmd64WindowsStructs8Bytes(new StructSizeEightNotExplicit(b)); + } + } + + /// + /// Decision to fast tail call. See CallerAmd64WindowsStructs7Bytes for more + /// information. + /// + public static int CalleeAmd64WindowsStructs7Bytes(StructSizeSevenNotExplicit sevenByteStruct) + { + int a = sevenByteStruct.a; + + // Force this to not be inlined; count becomes the number of even i in [0, a) + int count = 0; + for (int i = 0; i < a; ++i) + { + if (i % 2 == 0) + { + ++count; + } + } + + if (count == 1000000) // the value stored here is overwritten by the if/else below + { + a = count; + } + + if (count == 1) // holds exactly when a is 1 or 2 + { + a = 100; + } + else + { + a = 116; + } + + return (int)a; + } + + /// + /// Windows x64 tail call tests + /// + /// + /// + /// All targets will fast tail call + /// + /// The caller uses 2 integer registers (2 args) + /// The callee uses 1 integer register (1 arg) + /// + /// Return 100 is a pass. + /// Return 116 is a failure. + /// + /// + public static int CallerAmd64WindowsStructs7Bytes(int a, int b) + { + if (a % 2 == 0) // even a forwards a to the callee, odd a forwards b + { + return CalleeAmd64WindowsStructs7Bytes(new StructSizeSevenNotExplicit(a, 1, 2, 3)); + } + else + { + return CalleeAmd64WindowsStructs7Bytes(new StructSizeSevenNotExplicit(b, 1, 2, 3)); + } + } + + /// + /// Decision to fast tail call. See CallerAmd64WindowsStructs6Bytes for more + /// information.
+ /// + public static int CalleeAmd64WindowsStructs6Bytes(StructSizeSixNotExplicit sixByteStruct) + { + int a = sixByteStruct.a; + + // Force this to not be inlined; count becomes the number of even i in [0, a) + int count = 0; + for (int i = 0; i < a; ++i) + { + if (i % 2 == 0) + { + ++count; + } + } + + if (count == 1000000) // the value stored here is overwritten by the if/else below + { + a = count; + } + + if (count == 1) // holds exactly when a is 1 or 2 + { + a = 100; + } + else + { + a = 117; + } + + return (int)a; + } + + /// + /// Windows x64 tail call tests + /// + /// + /// + /// All targets will fast tail call + /// + /// The caller uses 2 integer registers (2 args) + /// The callee uses 1 integer register (1 arg) + /// + /// Return 100 is a pass. + /// Return 117 is a failure. + /// + /// + public static int CallerAmd64WindowsStructs6Bytes(int a, int b) + { + if (a % 2 == 0) // even a forwards a to the callee, odd a forwards b + { + return CalleeAmd64WindowsStructs6Bytes(new StructSizeSixNotExplicit(a, 1, 2)); + } + else + { + return CalleeAmd64WindowsStructs6Bytes(new StructSizeSixNotExplicit(b, 1, 2)); + } + } + + /// + /// Decision to fast tail call. See CallerAmd64WindowsStructs5Bytes for more + /// information. + /// + public static int CalleeAmd64WindowsStructs5Bytes(StructSizeFiveNotExplicit fiveByteStruct) + { + int a = fiveByteStruct.a; + + // Force this to not be inlined; count becomes the number of even i in [0, a) + int count = 0; + for (int i = 0; i < a; ++i) + { + if (i % 2 == 0) + { + ++count; + } + } + + if (count == 1000000) // the value stored here is overwritten by the if/else below + { + a = count; + } + + if (count == 1) // holds exactly when a is 1 or 2 + { + a = 100; + } + else + { + a = 118; + } + + return (int)a; + } + + /// + /// Windows x64 tail call tests + /// + /// + /// + /// All targets will fast tail call + /// + /// The caller uses 2 integer registers (2 args) + /// The callee uses 1 integer register (1 arg) + /// + /// Return 100 is a pass. + /// Return 118 is a failure.
+ /// + /// + public static int CallerAmd64WindowsStructs5Bytes(int a, int b) + { + if (a % 2 == 0) // even a forwards a to the callee, odd a forwards b + { + return CalleeAmd64WindowsStructs5Bytes(new StructSizeFiveNotExplicit(a, 1)); + } + else + { + return CalleeAmd64WindowsStructs5Bytes(new StructSizeFiveNotExplicit(b, 1)); + } + } + + /// + /// Decision to fast tail call. See CallerAmd64WindowsStructs4Bytes for more + /// information. + /// + public static int CalleeAmd64WindowsStructs4Bytes(StructSizeFourNotExplicit fourByteStruct) + { + int a = fourByteStruct.a; + + // Force this to not be inlined; count becomes the number of even i in [0, a) + int count = 0; + for (int i = 0; i < a; ++i) + { + if (i % 2 == 0) + { + ++count; + } + } + + if (count == 1000000) // the value stored here is overwritten by the if/else below + { + a = count; + } + + if (count == 1) // holds exactly when a is 1 or 2 + { + a = 100; + } + else + { + a = 119; + } + + return (int)a; + } + + /// + /// Windows x64 tail call tests + /// + /// + /// + /// All targets will fast tail call + /// + /// The caller uses 2 integer registers (2 args) + /// The callee uses 1 integer register (1 arg) + /// + /// Return 100 is a pass. + /// Return 119 is a failure. + /// + /// + public static int CallerAmd64WindowsStructs4Bytes(int a, int b) + { + if (a % 2 == 0) // even a forwards a to the callee, odd a forwards b + { + return CalleeAmd64WindowsStructs4Bytes(new StructSizeFourNotExplicit(a)); + } + else // forwards b like the other Caller* tests; this branch previously passed a again, leaving b unused + { + return CalleeAmd64WindowsStructs4Bytes(new StructSizeFourNotExplicit(b)); + } + } + + /// + /// Decision to fast tail call. See CallerAmd64WindowsStructs3Bytes for more + /// information. + /// + public static int CalleeAmd64WindowsStructs3Bytes(StructSizeThreeNotExplicit threeByteStruct) + { + int a = threeByteStruct.a; + + // Force this to not be inlined; count becomes the number of even i in [0, a) + int count = 0; + for (int i = 0; i < a; ++i) + { + if (i % 2 == 0) + { + ++count; + } + } + + if (count == 1000000) // the value stored here is overwritten by the if/else below + { + a = count; + } + + if (count == 1) // holds exactly when a is 1 or 2 + { + a = 100; + } + else + { + a = 120; + } + + return (int)a; + } + + /// + /// Windows x64 tail call tests + /// + /// + /// + /// x64 windows will not fast tail call because the struct is passed + /// byref.
+ /// + /// The caller uses 2 integer registers (2 args) + /// The callee uses 1 integer register (1 arg) + /// + /// Return 100 is a pass. + /// Return 120 is a failure. + /// + /// + public static int CallerAmd64WindowsStructs3Bytes(byte a, byte b) + { + if (a % 2 == 0) // even a builds the struct from a, odd a builds it from b + { + return CalleeAmd64WindowsStructs3Bytes(new StructSizeThreeNotExplicit(a, a, a)); + } + else + { + return CalleeAmd64WindowsStructs3Bytes(new StructSizeThreeNotExplicit(b, b, b)); + } + } + //////////////////////////////////////////////////////////////////////////// // Main //////////////////////////////////////////////////////////////////////////// diff --git a/tests/src/JIT/opt/FastTailCall/FastTailCallCandidates.csproj b/tests/src/JIT/opt/FastTailCall/FastTailCallCandidates.csproj index e7e45d848566..63669431584b 100644 --- a/tests/src/JIT/opt/FastTailCall/FastTailCallCandidates.csproj +++ b/tests/src/JIT/opt/FastTailCall/FastTailCallCandidates.csproj @@ -24,7 +24,7 @@ - None + PdbOnly True True True