From bbc548f45999ffd321b9dd08f79342a88c681c59 Mon Sep 17 00:00:00 2001
From: Filip Navara
Date: Thu, 5 Jun 2025 09:25:21 +0200
Subject: [PATCH 1/2] Enable FEATURE_INSTANTIATINGSTUB_AS_IL on win-x86

---
 src/coreclr/clrdefinitions.cmake | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/coreclr/clrdefinitions.cmake b/src/coreclr/clrdefinitions.cmake
index 133d30d41e615d..8d819950d66fb9 100644
--- a/src/coreclr/clrdefinitions.cmake
+++ b/src/coreclr/clrdefinitions.cmake
@@ -64,9 +64,7 @@ if(NOT CLR_CMAKE_TARGET_ARCH_I386)
   add_definitions(-DFEATURE_PORTABLE_SHUFFLE_THUNKS)
 endif()

-if(CLR_CMAKE_TARGET_UNIX OR NOT CLR_CMAKE_TARGET_ARCH_I386)
-  add_definitions(-DFEATURE_INSTANTIATINGSTUB_AS_IL)
-endif()
+add_definitions(-DFEATURE_INSTANTIATINGSTUB_AS_IL)

 add_compile_definitions(FEATURE_CODE_VERSIONING)
 add_definitions(-DFEATURE_COLLECTIBLE_TYPES)

From 5deed43c05fe2ef23176678f512019aceffa09b4 Mon Sep 17 00:00:00 2001
From: Filip Navara
Date: Fri, 6 Jun 2025 10:00:16 +0200
Subject: [PATCH 2/2] WIP: Enable fast tailcall on x86

Co-authored-by: Jakob Botsch Nielsen
---
 src/coreclr/jit/codegen.h                     | 12 +--
 src/coreclr/jit/codegencommon.cpp             | 22 +++--
 src/coreclr/jit/codegenlinear.cpp             | 15 ++-
 src/coreclr/jit/codegenxarch.cpp              | 78 ++++++++++++----
 src/coreclr/jit/compiler.h                    | 14 ++-
 src/coreclr/jit/emitxarch.cpp                 |  4 +
 src/coreclr/jit/lower.cpp                     | 20 +++-
 src/coreclr/jit/lowerxarch.cpp                | 91 ++++++++++---------
 src/coreclr/jit/lsraxarch.cpp                 | 11 ++-
 src/coreclr/jit/morph.cpp                     | 69 +++++++++++++-
 src/coreclr/jit/targetx86.h                   |  4 +-
 .../ReadyToRun/Target_X86/ImportThunk.cs      | 14 +++
 .../JitInterface/CorInfoImpl.ReadyToRun.cs    |  5 +-
 13 files changed, 257 insertions(+), 102 deletions(-)

diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index cda843455067c4..6ca3d84e4f3419 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -1172,14 +1172,12 @@ class CodeGen final : public CodeGenInterface

 #endif

-#ifndef TARGET_X86
    void genPutArgStkFieldList(GenTreePutArgStk* putArgStk, unsigned outArgVarNum);
-#endif // !TARGET_X86

 #ifdef TARGET_X86
    bool genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk);
    void genPushReg(var_types type, regNumber srcReg);
-    void genPutArgStkFieldList(GenTreePutArgStk* putArgStk);
+    void genPutArgStkPushFieldList(GenTreePutArgStk* putArgStk);
 #endif // TARGET_X86

    void genPutStructArgStk(GenTreePutArgStk* treeNode);
@@ -1194,9 +1192,8 @@ class CodeGen final : public CodeGenInterface
    void genStructPutArgUnroll(GenTreePutArgStk* putArgStkNode);
 #ifdef TARGET_X86
    void genStructPutArgPush(GenTreePutArgStk* putArgStkNode);
-#else
-    void genStructPutArgPartialRepMovs(GenTreePutArgStk* putArgStkNode);
 #endif
+    void genStructPutArgPartialRepMovs(GenTreePutArgStk* putArgStkNode);

    void genCodeForStoreBlk(GenTreeBlk* storeBlkNode);
    void genCodeForInitBlkLoop(GenTreeBlk* initBlkNode);
@@ -1294,10 +1291,9 @@ class CodeGen final : public CodeGenInterface

 #ifdef TARGET_X86
    bool m_pushStkArg;
-#else // !TARGET_X86
-    unsigned m_stkArgVarNum;
-    unsigned m_stkArgOffset;
 #endif // !TARGET_X86
+    unsigned m_stkArgVarNum = BAD_VAR_NUM;
+    unsigned m_stkArgOffset;

 #if defined(DEBUG) && defined(TARGET_XARCH)
    void genStackPointerCheck(bool doStackPointerCheck,
diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index 8fc6ccce7adf7f..dd750c43718bcb 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -201,9 +201,7 @@ void CodeGenInterface::CopyRegisterInfo()
 CodeGen::CodeGen(Compiler* theCompiler)
    : CodeGenInterface(theCompiler)
 {
-#if !defined(TARGET_X86)
    m_stkArgVarNum = BAD_VAR_NUM;
-#endif

 #if defined(UNIX_X86_ABI)
    curNestedAlignment = 0;
@@ -4456,7 +4454,7 @@ void CodeGen::genFinalizeFrame()

 #if defined(TARGET_X86)

-    if (compiler->compTailCallUsed)
+    if (compiler->compTailCallViaJitHelperUsed)
    {
        // If we are generating a helper-based tailcall, we've set the tailcall helper "flags"
        // argument to "1", indicating to the tailcall helper that we've saved the callee-saved
@@ -5895,10 +5893,22 @@ unsigned CodeGen::getFirstArgWithStackSlot()
    return BAD_VAR_NUM;
 #elif defined(TARGET_AMD64)
    return 0;
-#else // TARGET_X86
-    // Not implemented for x86.
-    NYI_X86("getFirstArgWithStackSlot not yet implemented for x86.");
+#elif defined(TARGET_X86)
+    // On x86 args are passed on the stack in reverse, so the base is the last stack argument.
+    for (unsigned i = compiler->info.compArgsCount; i != 0; i--)
+    {
+        const ABIPassingInformation& abiInfo = compiler->lvaGetParameterABIInfo(i - 1);
+        assert(abiInfo.NumSegments == 1);
+        if (abiInfo.Segment(0).IsPassedOnStack())
+        {
+            return i - 1;
+        }
+    }
+
+    assert(!"Expected to find stack parameter");
    return BAD_VAR_NUM;
+#else
+#error Not implemented
 #endif // TARGET_X86
 }
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 67467b5c71ee85..eb7104528a98a2 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -1793,7 +1793,14 @@ void CodeGen::genConsumePutStructArgStk(GenTreePutArgStk* putArgNode,

 #ifdef TARGET_X86
    assert(dstReg != REG_SPBASE);
-    inst_Mov(TYP_I_IMPL, dstReg, REG_SPBASE, /* canSkip */ false);
+    if (m_stkArgVarNum == BAD_VAR_NUM)
+    {
+        inst_Mov(TYP_I_IMPL, dstReg, REG_SPBASE, /* canSkip */ false);
+    }
+    else
+    {
+        GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, dstReg, m_stkArgVarNum, putArgNode->getArgOffset());
+    }
 #else // !TARGET_X86
    GenTree* dstAddr = putArgNode;
    if (dstAddr->GetRegNum() != dstReg)
@@ -1834,10 +1841,9 @@ void CodeGen::genConsumePutStructArgStk(GenTreePutArgStk* putArgNode,
 //    outArgVarNum - The lclVar num for the argument
 //
 // Notes:
-//    The x86 version of this is in codegenxarch.cpp, and doesn't take an
-//    outArgVarNum, as it pushes its args onto the stack.
+//    The primary x86 version of this is genPutArgStkPushFieldList in
+//    codegenxarch.cpp. This version is only used for fast tailcalls on x86.
 //
-#ifndef TARGET_X86
 void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk, unsigned outArgVarNum)
 {
    assert(putArgStk->gtOp1->OperIs(GT_FIELD_LIST));
@@ -1884,7 +1890,6 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk, unsigned outArg
 #endif
    }
 }
-#endif // !TARGET_X86

 //------------------------------------------------------------------------
 // genSetBlockSize: Ensure that the block size is in the given register
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 8716d2eb2256dc..8c5733409930f5 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -4126,7 +4126,6 @@ void CodeGen::genStructPutArgPush(GenTreePutArgStk* putArgNode)
 }
 #endif // TARGET_X86

-#ifndef TARGET_X86
 //------------------------------------------------------------------------
 // genStructPutArgPartialRepMovs: Generates code for passing a struct arg by value on stack using
 //                                a mix of pointer-sized stores, "movsq" and "rep movsd".
@@ -4217,7 +4216,6 @@ void CodeGen::genStructPutArgPartialRepMovs(GenTreePutArgStk* putArgNode)

    assert(numGCSlotsCopied == layout->GetGCPtrCount());
 }
-#endif // !TARGET_X86

 //------------------------------------------------------------------------
 // If any Vector3 args are on stack and they are not pass-by-ref, the upper 32bits
@@ -8107,7 +8105,7 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
 }

 //---------------------------------------------------------------------
-// genPutArgStkFieldList - generate code for passing a GT_FIELD_LIST arg on the stack.
+// genPutArgStkPushFieldList - generate code for passing a GT_FIELD_LIST arg on the stack.
 //
 // Arguments
 //    treeNode      - the GT_PUTARG_STK node whose op1 is a GT_FIELD_LIST
@@ -8115,7 +8113,7 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
 // Return value:
 //    None
 //
-void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
+void CodeGen::genPutArgStkPushFieldList(GenTreePutArgStk* putArgStk)
 {
    GenTreeFieldList* const fieldList = putArgStk->gtOp1->AsFieldList();
    assert(fieldList != nullptr);
@@ -8332,28 +8330,63 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk)

    if (data->OperIs(GT_FIELD_LIST))
    {
-        genPutArgStkFieldList(putArgStk);
+        if (putArgStk->putInIncomingArgArea())
+        {
+            genPutArgStkFieldList(putArgStk, getBaseVarForPutArgStk(putArgStk));
+        }
+        else
+        {
+            genPutArgStkPushFieldList(putArgStk);
+        }
        return;
    }

    if (varTypeIsStruct(targetType))
    {
-        genAdjustStackForPutArgStk(putArgStk);
-        genPutStructArgStk(putArgStk);
+        if (putArgStk->putInIncomingArgArea())
+        {
+            m_pushStkArg = false;
+            m_stkArgVarNum = getBaseVarForPutArgStk(putArgStk);
+            m_stkArgOffset = putArgStk->getArgOffset();
+            genPutStructArgStk(putArgStk);
+            m_stkArgVarNum = BAD_VAR_NUM;
+        }
+        else
+        {
+            genAdjustStackForPutArgStk(putArgStk);
+            genPutStructArgStk(putArgStk);
+        }
        return;
    }

    genConsumeRegs(data);

-    if (data->isUsedFromReg())
+    if (putArgStk->putInIncomingArgArea())
    {
-        genPushReg(targetType, data->GetRegNum());
+        unsigned baseVarNum = getBaseVarForPutArgStk(putArgStk);
+        if (data->isContainedIntOrIImmed())
+        {
+            GetEmitter()->emitIns_S_I(ins_Store(targetType), emitTypeSize(targetType), baseVarNum,
+                                      putArgStk->getArgOffset(), (int)data->AsIntConCommon()->IconValue());
+        }
+        else
+        {
+            GetEmitter()->emitIns_S_R(ins_Store(targetType), emitTypeSize(targetType), data->GetRegNum(), baseVarNum,
+                                      putArgStk->getArgOffset());
+        }
    }
    else
    {
-        assert(genTypeSize(data) == TARGET_POINTER_SIZE);
-        inst_TT(INS_push, emitTypeSize(data), data);
-        AddStackLevel(TARGET_POINTER_SIZE);
+        if (data->isUsedFromReg())
+        {
+            genPushReg(targetType, data->GetRegNum());
+        }
+        else
+        {
+            assert(genTypeSize(data) == TARGET_POINTER_SIZE);
+            inst_TT(INS_push, emitTypeSize(data), data);
+            AddStackLevel(TARGET_POINTER_SIZE);
+        }
    }
 #else // !TARGET_X86
    {
@@ -8543,7 +8576,14 @@ void CodeGen::genStoreRegToStackArg(var_types type, regNumber srcReg, int offset
    }
    else
    {
-        GetEmitter()->emitIns_AR_R(ins, attr, srcReg, REG_SPBASE, offset);
+        if (m_stkArgVarNum != BAD_VAR_NUM)
+        {
+            GetEmitter()->emitIns_S_R(ins, attr, srcReg, m_stkArgVarNum, m_stkArgOffset + offset);
+        }
+        else
+        {
+            GetEmitter()->emitIns_AR_R(ins, attr, srcReg, REG_SPBASE, offset);
+        }
    }
 #else // !TARGET_X86
    assert(m_stkArgVarNum != BAD_VAR_NUM);
@@ -8571,7 +8611,7 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
    var_types targetType = source->TypeGet();

 #if defined(TARGET_X86) && defined(FEATURE_SIMD)
-    if (putArgStk->isSIMD12())
+    if (putArgStk->isSIMD12() && !putArgStk->putInIncomingArgArea())
    {
        genPutArgStkSimd12(putArgStk);
        return;
@@ -8593,11 +8633,10 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
        case GenTreePutArgStk::Kind::RepInstr:
            genStructPutArgRepMovs(putArgStk);
            break;
-#ifndef TARGET_X86
+
        case GenTreePutArgStk::Kind::PartialRepInstr:
            genStructPutArgPartialRepMovs(putArgStk);
            break;
-#endif // !TARGET_X86

        case GenTreePutArgStk::Kind::Unroll:
            genStructPutArgUnroll(putArgStk);
@@ -10567,7 +10606,8 @@ void CodeGen::genFnEpilog(BasicBlock* block)

            // Add 'compiler->compLclFrameSize' to ESP. Use "pop ECX" for that, except in cases
            // where ECX may contain some state.
-            if ((frameSize == TARGET_POINTER_SIZE) && !compiler->compJmpOpUsed && !compiler->compIsAsync())
+            if ((frameSize == TARGET_POINTER_SIZE) && !compiler->compJmpOpUsed && !compiler->compIsAsync() &&
+                !compiler->compFastTailCallUsed)
            {
                inst_RV(INS_pop, REG_ECX, TYP_I_IMPL);
                regSet.verifyRegUsed(REG_ECX);
@@ -10655,7 +10695,7 @@ void CodeGen::genFnEpilog(BasicBlock* block)
        }
 #ifdef TARGET_X86
        else if ((compiler->compLclFrameSize == REGSIZE_BYTES) && !compiler->compJmpOpUsed &&
-                 !compiler->compIsAsync())
+                 !compiler->compIsAsync() && !compiler->compFastTailCallUsed)
        {
            // "pop ecx" will make ESP point to the callee-saved registers
            inst_RV(INS_pop, REG_ECX, TYP_I_IMPL);
@@ -10809,7 +10849,7 @@ void CodeGen::genFnEpilog(BasicBlock* block)
 #if FEATURE_FASTTAILCALL
        else
        {
-            genCallInstruction(jmpNode->AsCall());
+            genCallInstruction(jmpNode->AsCall() X86_ARG(0));
        }
 #endif // FEATURE_FASTTAILCALL
    }
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 847a19eb7c96c6..4fec03fb0545a7 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9877,11 +9877,15 @@ class Compiler

    InlineResult* compInlineResult; // The result of importing the inlinee method.

-    bool compDoAggressiveInlining = false; // If true, mark every method as CORINFO_FLG_FORCEINLINE
-    bool compJmpOpUsed            = false; // Does the method do a JMP
-    bool compLongUsed             = false; // Does the method use TYP_LONG
-    bool compFloatingPointUsed    = false; // Does the method use TYP_FLOAT or TYP_DOUBLE
-    bool compTailCallUsed         = false; // Does the method do a tailcall
+    bool compDoAggressiveInlining = false;     // If true, mark every method as CORINFO_FLG_FORCEINLINE
+    bool compJmpOpUsed            = false;     // Does the method do a JMP
+    bool compLongUsed             = false;     // Does the method use TYP_LONG
+    bool compFloatingPointUsed    = false;     // Does the method use TYP_FLOAT or TYP_DOUBLE
+    bool compTailCallUsed         = false;     // Does the method do a tailcall
+#ifdef TARGET_X86
+    bool compTailCallViaJitHelperUsed = false;
+    bool compFastTailCallUsed         = false;
+#endif
    bool compTailPrefixSeen = false; // Does the method IL have tail. prefix
    bool compLocallocSeen   = false; // Does the method IL have localloc opcode
    bool compLocallocUsed   = false; // Does the method use localloc.
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index f819f7fe7fc5a4..ab1133072822ca 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -11266,12 +11266,14 @@ void emitter::emitIns_Call(const EmitCallParams& params)
        id->idAddr()->iiaAddrMode.amScale = params.xmul ? emitEncodeScale(params.xmul) : emitter::OPSZ1;

        code_t code = insCodeMR(ins);
+#ifndef TARGET_X86
        if (ins == INS_tail_i_jmp)
        {
            // Tailcall with addressing mode/register needs to be rex.w
            // prefixed to be recognized as part of epilog by unwinder.
            code = AddRexWPrefix(id, code);
        }
+#endif

        sz = emitInsSizeAM(id, code);

@@ -14386,12 +14388,14 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
    if ((ins == INS_call) || (ins == INS_tail_i_jmp))
    {
        code = AddX86PrefixIfNeeded(id, code, size);
+#ifndef TARGET_X86
        if (ins == INS_tail_i_jmp)
        {
            // tail call with addressing mode (or through register) needs rex.w
            // prefix to be recognized by unwinder as part of epilog.
            code = AddRexWPrefix(id, code);
        }
+#endif

        // Special case: call via a register
        if (id->idIsCallRegPtr())
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 01c139d37f8177..b114ea0267817b 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -1729,9 +1729,21 @@ void Lowering::LowerArg(GenTreeCall* call, CallArg* callArg)
        assert(abiInfo.NumSegments == 1);
        const ABIPassingSegment& stackSeg = abiInfo.Segment(0);
        const bool putInIncomingArgArea = call->IsFastTailCall();
+        unsigned argOffset = stackSeg.GetStackOffset();
+
+#ifdef TARGET_X86
+        if (putInIncomingArgArea)
+        {
+            // On x86 GetStackOffset returns the offset to subtract from
+            // the top of the stack to get the argument's address. Since
+            // we reuse the x86 code paths for fast tailcall, transform
+            // the stack offset into the format that they expect.
+            argOffset = call->gtArgs.OutgoingArgsStackSize() - argOffset;
+        }
+#endif

        GenTree* putArg =
-            new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, TYP_VOID, arg, stackSeg.GetStackOffset(),
+            new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, TYP_VOID, arg, argOffset,
                                                       stackSeg.GetStackSize(), call, putInIncomingArgArea);

        BlockRange().InsertAfter(arg, putArg);
@@ -2291,7 +2303,7 @@ void Lowering::LegalizeArgPlacement(GenTreeCall* call)
        cur->gtLIRFlags &= ~LIR::Flags::Mark;

 #if defined(DEBUG) && !FEATURE_FIXED_OUT_ARGS
-        if (cur->OperIs(GT_PUTARG_STK))
+        if (cur->OperIs(GT_PUTARG_STK) && !cur->AsPutArgStk()->putInIncomingArgArea())
        {
            // For !FEATURE_FIXED_OUT_ARGS (only x86) byte offsets are
            // subtracted from the top of the stack frame; so last pushed
@@ -3503,7 +3515,7 @@ void Lowering::RehomeArgForFastTailCall(unsigned int lclNum,
 #endif // DEBUG

            GenTree* value;
-#ifdef TARGET_ARM
+#ifndef TARGET_64BIT
            if (tmpTyp == TYP_LONG)
            {
                GenTree* loResult = comp->gtNewLclFldNode(lclNum, TYP_INT, 0);
@@ -3511,7 +3523,7 @@ void Lowering::RehomeArgForFastTailCall(unsigned int lclNum,
                value = new (comp, GT_LONG) GenTreeOp(GT_LONG, TYP_LONG, loResult, hiResult);
            }
            else
-#endif // TARGET_ARM
+#endif // TARGET_64BIT
            {
                value = comp->gtNewLclvNode(lclNum, tmpTyp);
            }
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 0ff71753e89899..313cce636b7cbd 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -638,53 +638,56 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
    if (src->OperIs(GT_FIELD_LIST))
    {
 #ifdef TARGET_X86
-        GenTreeFieldList* fieldList = src->AsFieldList();
-
-        // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order
-        // of uses is visible to LSRA.
-        assert(fieldList->Uses().IsSorted());
-        fieldList->Uses().Reverse();
-
-        // Containment checks.
-        for (GenTreeFieldList::Use& use : fieldList->Uses())
+        if (!putArgStk->putInIncomingArgArea())
        {
-            GenTree* const  fieldNode = use.GetNode();
-            const var_types fieldType = use.GetType();
-            assert(!fieldNode->TypeIs(TYP_LONG));
+            GenTreeFieldList* fieldList = src->AsFieldList();
+
+            // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the
+            // order of uses is visible to LSRA.
+            assert(fieldList->Uses().IsSorted());
+            fieldList->Uses().Reverse();

-            // For x86 we must mark all integral fields as contained or reg-optional, and handle them
-            // accordingly in code generation, since we may have up to 8 fields, which cannot all be in
-            // registers to be consumed atomically by the call.
-            if (varTypeIsIntegralOrI(fieldNode))
+            // Containment checks.
+            for (GenTreeFieldList::Use& use : fieldList->Uses())
            {
-                if (IsContainableImmed(putArgStk, fieldNode))
-                {
-                    MakeSrcContained(putArgStk, fieldNode);
-                }
-                else if (IsContainableMemoryOp(fieldNode) && IsSafeToContainMem(putArgStk, fieldNode))
-                {
-                    MakeSrcContained(putArgStk, fieldNode);
-                }
-                else
+                GenTree* const  fieldNode = use.GetNode();
+                const var_types fieldType = use.GetType();
+                assert(!fieldNode->TypeIs(TYP_LONG));
+
+                // For x86 we must mark all integral fields as contained or reg-optional, and handle them
+                // accordingly in code generation, since we may have up to 8 fields, which cannot all be in
+                // registers to be consumed atomically by the call.
+                if (varTypeIsIntegralOrI(fieldNode))
                {
-                    // For the case where we cannot directly push the value, if we run out of registers,
-                    // it would be better to defer computation until we are pushing the arguments rather
-                    // than spilling, but this situation is not all that common, as most cases of FIELD_LIST
-                    // are promoted structs, which do not not have a large number of fields, and of those
-                    // most are lclVars or copy-propagated constants.
+                    if (IsContainableImmed(putArgStk, fieldNode))
+                    {
+                        MakeSrcContained(putArgStk, fieldNode);
+                    }
+                    else if (IsContainableMemoryOp(fieldNode) && IsSafeToContainMem(putArgStk, fieldNode))
+                    {
+                        MakeSrcContained(putArgStk, fieldNode);
+                    }
+                    else
+                    {
+                        // For the case where we cannot directly push the value, if we run out of registers,
+                        // it would be better to defer computation until we are pushing the arguments rather
+                        // than spilling, but this situation is not all that common, as most cases of FIELD_LIST
+                        // are promoted structs, which do not not have a large number of fields, and of those
+                        // most are lclVars or copy-propagated constants.

-                    fieldNode->SetRegOptional();
+                        fieldNode->SetRegOptional();
+                    }
                }
            }
-        }

-        // Set the copy kind.
-        // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should
-        // adjust the stack once for those fields. The latter is really best done in code generation, but
-        // this tuning should probably be undertaken as a whole.
-        // Also, if there are floating point fields, it may be better to use the "Unroll" mode
-        // of copying the struct as a whole, if the fields are not register candidates.
-        putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
+            // Set the copy kind.
+            // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should
+            // adjust the stack once for those fields. The latter is really best done in code generation, but
+            // this tuning should probably be undertaken as a whole.
+            // Also, if there are floating point fields, it may be better to use the "Unroll" mode
+            // of copying the struct as a whole, if the fields are not register candidates.
+            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
+        }
 #endif // TARGET_X86
        return;
    }
@@ -717,7 +720,8 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
        // chunks. As such, we'll only use this path for correctly-sized sources.
        if ((loadSize < XMM_REGSIZE_BYTES) && ((loadSize % TARGET_POINTER_SIZE) == 0))
        {
-            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
+            putArgStk->gtPutArgStkKind = putArgStk->putInIncomingArgArea() ? GenTreePutArgStk::Kind::Unroll
+                                                                           : GenTreePutArgStk::Kind::Push;
        }
        else
 #endif // TARGET_X86
@@ -736,7 +740,8 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
            // On x86, we must use `push` to store GC references to the stack in order for the emitter to
            // properly update the function's GC info. These `putargstk` nodes will generate a sequence of
            // `push` instructions.
-            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
+            putArgStk->gtPutArgStkKind = putArgStk->putInIncomingArgArea() ? GenTreePutArgStk::Kind::PartialRepInstr
+                                                                           : GenTreePutArgStk::Kind::Push;
 #else // !TARGET_X86
            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PartialRepInstr;
 #endif // !TARGET_X86
@@ -803,7 +808,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
        MakeSrcContained(putArgStk, src);
    }
 #ifdef TARGET_X86
-    else if (genTypeSize(src) == TARGET_POINTER_SIZE)
+    else if ((genTypeSize(src) == TARGET_POINTER_SIZE) && !putArgStk->putInIncomingArgArea())
    {
        // We can use "src" directly from memory with "push [mem]".
        TryMakeSrcContainedOrRegOptional(putArgStk, src);
@@ -7565,7 +7570,7 @@ void Lowering::ContainCheckCallOperands(GenTreeCall* call)
        }
        else
 #endif // TARGET_X86
-            if (ctrlExpr->isIndir())
+            if (ctrlExpr->isIndir() && IsSafeToContainMem(call, ctrlExpr))
        {
            // We may have cases where we have set a register target on the ctrlExpr, but if it
            // contained we must clear it.
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 344585d192eb6a..e1e928fdcdda79 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1743,7 +1743,14 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)

        for (GenTreeFieldList::Use& use : putArgStk->gtOp1->AsFieldList()->Uses())
        {
-            srcCount += BuildOperandUses(use.GetNode());
+            SingleTypeRegSet candidates = RBM_NONE;
+#ifdef TARGET_X86
+            if (varTypeIsByte(use.GetType()) && putArgStk->putInIncomingArgArea())
+            {
+                candidates = allByteRegs();
+            }
+#endif
+            srcCount += BuildOperandUses(use.GetNode(), candidates);
        }

        buildInternalRegisterUses();
@@ -1801,9 +1808,7 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
            break;

        case GenTreePutArgStk::Kind::RepInstr:
-#ifndef TARGET_X86
        case GenTreePutArgStk::Kind::PartialRepInstr:
-#endif
            buildInternalIntRegisterDefForNode(putArgStk, SRBM_RDI);
            buildInternalIntRegisterDefForNode(putArgStk, SRBM_RCX);
            buildInternalIntRegisterDefForNode(putArgStk, SRBM_RSI);
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index fc9dd838c24161..cd62da0e20a86e 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -4229,7 +4229,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee, const char** failReason)
        return false;
    }

-#ifdef TARGET_AMD64
+#ifdef TARGET_XARCH
    // Needed for Jit64 compat.
    // In future, enabling fast tail calls from methods that need GS cookie
    // check would require codegen side work to emit GS cookie check before a
@@ -4259,6 +4259,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee, const char** failReason)
        }
    }

+#ifndef TARGET_X86
    // For a fast tail call the caller will use its incoming arg stack space to place
    // arguments, so if the callee requires more arg stack space than is available here
    // the fast tail call cannot be performed. This is common to all platforms.
@@ -4269,6 +4270,47 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee, const char** failReason)
        reportFastTailCallDecision("Not enough incoming arg space");
        return false;
    }
+#else
+    // Managed calling convention on x86 uses a callee cleanup convention with
+    // "ret n", so bail out on any stack size mismatch.
+    if (calleeArgStackSize != callerArgStackSize)
+    {
+        reportFastTailCallDecision("Mismatch in incoming arg space");
+        return false;
+    }
+
+    // Allow only one fast tailcall per method since we have a limited number of
+    // epilogs that we can encode into GC info. This is conservative but we
+    // don't currently track the number of used epilogs in this phase.
+    if (compFastTailCallUsed)
+    {
+        reportFastTailCallDecision("Fast tailcall already used");
+        return false;
+    }
+
+    if (fgReturnCount >= SET_EPILOGCNT_MAX)
+    {
+        reportFastTailCallDecision("Too many epilogs");
+        return false;
+    }
+
+    // VM uses special convention for VSD that looks at the call site.
+    if (callee->IsVirtualStubRelativeIndir())
+    {
+        reportFastTailCallDecision("Indirect VSD call");
+        return false;
+    }
+
+    // We need to avoid the call address expression order problem on x86
+    // and it's just easier to bail out. See fgMorphTailCallViaJitHelper,
+    // Lowering::LowerDelegateInvoke and Lowering::LowerVirtualVtableCall
+    // for details. Covered by GitHub_20625 and Runtime_70259 CLR tests.
+    if (callee->IsDelegateInvoke() || callee->IsVirtualVtable())
+    {
+        reportFastTailCallDecision("Delegate or virtual invocation");
+        return false;
+    }
+#endif

    // For Windows some struct parameters are copied on the local frame
    // and then passed by reference. We cannot fast tail call in these situation
@@ -4751,8 +4793,17 @@ GenTree* Compiler::fgMorphPotentialTailCall(GenTreeCall* call)
    call->gtCallMoreFlags |= GTF_CALL_M_TAILCALL;
    if (tailCallViaJitHelper)
    {
+#ifdef TARGET_X86
+        compTailCallViaJitHelperUsed = true;
+#endif
        call->gtCallMoreFlags |= GTF_CALL_M_TAILCALL_VIA_JIT_HELPER;
    }
+#ifdef TARGET_X86
+    else
+    {
+        compFastTailCallUsed = true;
+    }
+#endif

 #if FEATURE_TAILCALL_OPT
    if (fastTailCallToLoop)
@@ -4944,11 +4995,11 @@ GenTree* Compiler::fgMorphPotentialTailCall(GenTreeCall* call)
    // etc.) for the JIT helper case.
    if (tailCallViaJitHelper)
    {
-        fgMorphTailCallViaJitHelper(call);
-
        // Force re-evaluating the argInfo. fgMorphTailCallViaJitHelper will modify the
        // argument list, invalidating the argInfo.
        call->gtArgs.ResetFinalArgsAndABIInfo();
+
+        fgMorphTailCallViaJitHelper(call);
    }

    // Tail call via JIT helper: The VM can't use return address hijacking
@@ -6394,7 +6445,13 @@ GenTree* Compiler::fgMorphCall(GenTreeCall* call)
    if (call->gtCallType == CT_INDIRECT)
    {
        optCallCount++;
-        optIndirectCallCount++;
+        // Do not count the tailcall morphed from fgMorphCall ->
+        // fgMorphPotentialTailCall -> fgMorphCall as an indirect call
+        // to prevent forcing an EBP frame later.
+        if ((call->gtCallMoreFlags & GTF_CALL_M_TAILCALL) == 0)
+        {
+            optIndirectCallCount++;
+        }
    }
    else if (call->gtCallType == CT_USER_FUNC)
    {
@@ -13569,8 +13626,10 @@ void Compiler::fgSetOptions()

 #ifdef TARGET_X86

-    if (compTailCallUsed)
+    if (compTailCallViaJitHelperUsed)
+    {
        codeGen->setFramePointerRequired(true);
+    }

 #endif // TARGET_X86

diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h
index ac0130f5d274ce..8858414172c12a 100644
--- a/src/coreclr/jit/targetx86.h
+++ b/src/coreclr/jit/targetx86.h
@@ -22,8 +22,8 @@
  #define FEATURE_FIXED_OUT_ARGS          0 // X86 uses push instructions to pass args
  #define FEATURE_STRUCTPROMOTE           1 // JIT Optimization to promote fields of structs into registers
  #define FEATURE_MULTIREG_STRUCT_PROMOTE 0 // True when we want to promote fields of a multireg struct into registers
-  #define FEATURE_FASTTAILCALL            0 // Tail calls made as epilog+jmp
-  #define FEATURE_TAILCALL_OPT            0 // opportunistic Tail calls (without ".tail" prefix) made as fast tail calls.
+  #define FEATURE_FASTTAILCALL            1 // Tail calls made as epilog+jmp
+  #define FEATURE_TAILCALL_OPT            1 // opportunistic Tail calls (without ".tail" prefix) made as fast tail calls.
                                             // the flags need to be set
  #define FEATURE_IMPLICIT_BYREFS         0 // Support for struct parameters passed via pointers to shadow copies
  #define FEATURE_MULTIREG_ARGS_OR_RET    1 // Support for passing and/or returning single values in more than one register
diff --git a/src/coreclr/tools/aot/ILCompiler.ReadyToRun/Compiler/DependencyAnalysis/ReadyToRun/Target_X86/ImportThunk.cs b/src/coreclr/tools/aot/ILCompiler.ReadyToRun/Compiler/DependencyAnalysis/ReadyToRun/Target_X86/ImportThunk.cs
index 5ae8df415f54fd..36698a295e8e64 100644
--- a/src/coreclr/tools/aot/ILCompiler.ReadyToRun/Compiler/DependencyAnalysis/ReadyToRun/Target_X86/ImportThunk.cs
+++ b/src/coreclr/tools/aot/ILCompiler.ReadyToRun/Compiler/DependencyAnalysis/ReadyToRun/Target_X86/ImportThunk.cs
@@ -37,6 +37,20 @@ protected override void EmitCode(NodeFactory factory, ref X86Emitter instruction

                    break;

+                case Kind.DelayLoadHelperWithExistingIndirectionCell:
+                    // Indirection cell is already in eax, which will be the first arg. Used for fast tailcalls.
+
+                    if (!relocsOnly)
+                    {
+                        // push table index
+                        instructionEncoder.EmitPUSH((sbyte)_containingImportSection.IndexFromBeginningOfArray);
+                    }
+
+                    // push [module]
+                    instructionEncoder.EmitPUSH(factory.ModuleImport);
+
+                    break;
+
                case Kind.Lazy:
                    // mov edx, [module]
                    instructionEncoder.EmitMOV(Register.EDX, factory.ModuleImport);
diff --git a/src/coreclr/tools/aot/ILCompiler.ReadyToRun/JitInterface/CorInfoImpl.ReadyToRun.cs b/src/coreclr/tools/aot/ILCompiler.ReadyToRun/JitInterface/CorInfoImpl.ReadyToRun.cs
index c39971017ce7ec..1d312ece17ea90 100644
--- a/src/coreclr/tools/aot/ILCompiler.ReadyToRun/JitInterface/CorInfoImpl.ReadyToRun.cs
+++ b/src/coreclr/tools/aot/ILCompiler.ReadyToRun/JitInterface/CorInfoImpl.ReadyToRun.cs
@@ -3255,9 +3255,10 @@ private void reportInliningDecision(CORINFO_METHOD_STRUCT_* inlinerHnd, CORINFO_

        private void updateEntryPointForTailCall(ref CORINFO_CONST_LOOKUP entryPoint)
        {
-            // In x64 we normally use a return address to find the indirection cell for delay load.
+            // In x86/x64 we normally use a return address to find the indirection cell for delay load.
            // For tailcalls we instead expect the JIT to leave the indirection in rax.
-            if (_compilation.TypeSystemContext.Target.Architecture != TargetArchitecture.X64)
+            if (_compilation.TypeSystemContext.Target.Architecture != TargetArchitecture.X64 &&
+                _compilation.TypeSystemContext.Target.Architecture != TargetArchitecture.X86)
                return;

            object node = HandleToObject(entryPoint.addr);