From c0974772035a716ae9e54558096e6431e0f65ab9 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Fri, 14 Feb 2025 13:14:28 +0100 Subject: [PATCH 1/4] JIT: Clean up and optimize `StackLevelSetter` The stack levels computed are not actually used for anything outside x86, so avoid computing them in that case. Also, the max stack level does not actually get used by the backend, even for x86, so no need to save it in `Compiler`. --- src/coreclr/jit/codegenxarch.cpp | 6 ----- src/coreclr/jit/compiler.h | 27 --------------------- src/coreclr/jit/compiler.hpp | 14 ----------- src/coreclr/jit/flowgraph.cpp | 2 +- src/coreclr/jit/stacklevelsetter.cpp | 35 +++++++++++++++++----------- 5 files changed, 22 insertions(+), 62 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e0da7904ecc688..92aa15e1723eb6 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -9557,9 +9557,6 @@ void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed) 0, // argSize. Again, we have to lie about it EA_UNKNOWN); // retSize - // Check that we have place for the push. - assert(compiler->fgGetPtrArgCntMax() >= 1); - #if defined(UNIX_X86_ABI) // Restoring alignment manually. This is similar to CodeGen::genRemoveAlignmentAfterCall GetEmitter()->emitIns_R_I(INS_add, EA_4BYTE, REG_SPBASE, 0x10); @@ -9638,9 +9635,6 @@ void CodeGen::genProfilingLeaveCallback(unsigned helper) #endif genEmitHelperCall(helper, argSize, EA_UNKNOWN /* retSize */); - // Check that we have place for the push. - assert(compiler->fgGetPtrArgCntMax() >= 1); - #if defined(UNIX_X86_ABI) // Restoring alignment manually. This is similar to CodeGen::genRemoveAlignmentAfterCall GetEmitter()->emitIns_R_I(INS_add, EA_4BYTE, REG_SPBASE, 0x10); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index f8855bfc41ab2d..c88d6d3aed3f69 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6698,33 +6698,6 @@ class Compiler //------------------------- Morphing -------------------------------------- - unsigned fgPtrArgCntMax = 0; - -public: - //------------------------------------------------------------------------ - // fgGetPtrArgCntMax: Return the maximum number of pointer-sized stack arguments that calls inside this method - // can push on the stack. This value is calculated during morph. - // - // Return Value: - // Returns fgPtrArgCntMax, that is a private field. - // - unsigned fgGetPtrArgCntMax() const - { - return fgPtrArgCntMax; - } - - //------------------------------------------------------------------------ - // fgSetPtrArgCntMax: Set the maximum number of pointer-sized stack arguments that calls inside this method - // can push on the stack. This function is used during StackLevelSetter to fix incorrect morph calculations. - // - void fgSetPtrArgCntMax(unsigned argCntMax) - { - fgPtrArgCntMax = argCntMax; - } - - bool compCanEncodePtrArgCntMax(); - -private: hashBv* fgAvailableOutgoingArgTemps; ArrayStack* fgUsedSharedTemps = nullptr; diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp index 1738810e149b4b..6272380a4aaf50 100644 --- a/src/coreclr/jit/compiler.hpp +++ b/src/coreclr/jit/compiler.hpp @@ -3024,20 +3024,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -inline bool Compiler::compCanEncodePtrArgCntMax() -{ -#ifdef JIT32_GCENCODER - // DDB 204533: - // The GC encoding for fully interruptible methods does not - // support more than 1023 pushed arguments, so we have to - // use a partially interruptible GC info/encoding. - // - return (fgPtrArgCntMax < MAX_PTRARG_OFS); -#else // JIT32_GCENCODER - return true; -#endif -} - /***************************************************************************** * * Call the given function pointer for all nodes in the tree. The 'visitor' diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp index 70b0e7c5667486..e06de9b8b41e61 100644 --- a/src/coreclr/jit/flowgraph.cpp +++ b/src/coreclr/jit/flowgraph.cpp @@ -4019,7 +4019,7 @@ PhaseStatus Compiler::fgSetBlockOrder() BasicBlock::s_nMaxTrees = 0; #endif - if (compCanEncodePtrArgCntMax() && fgHasCycleWithoutGCSafePoint()) + if (fgHasCycleWithoutGCSafePoint()) { JITDUMP("Marking method as fully interruptible\n"); SetInterruptible(true); diff --git a/src/coreclr/jit/stacklevelsetter.cpp b/src/coreclr/jit/stacklevelsetter.cpp index eab295b9bc0bc6..f8b1b94dfd691f 100644 --- a/src/coreclr/jit/stacklevelsetter.cpp +++ b/src/coreclr/jit/stacklevelsetter.cpp @@ -24,22 +24,12 @@ StackLevelSetter::StackLevelSetter(Compiler* compiler) } //------------------------------------------------------------------------ -// DoPhase: Calculate stack slots numbers for outgoing args. +// DoPhase: Calculate stack slots numbers for outgoing args and compute +// requirements of throw helper blocks. // // Returns: // PhaseStatus indicating what, if anything, was changed. // -// Notes: -// For non-x86 platforms it calculates the max number of slots -// that calls inside this method can push on the stack. -// This value is used for sanity checks in the emitter. -// -// Stack slots are pointer-sized: 4 bytes for 32-bit platforms, 8 bytes for 64-bit platforms. -// -// For x86 it also sets throw-helper blocks incoming stack depth and set -// framePointerRequired when it is necessary. These values are used to pop -// pushed args when an exception occurs. -// PhaseStatus StackLevelSetter::DoPhase() { for (BasicBlock* const block : comp->Blocks()) @@ -56,7 +46,6 @@ PhaseStatus StackLevelSetter::DoPhase() CheckAdditionalArgs(); - comp->fgSetPtrArgCntMax(maxStackLevel); CheckArgCnt(); // When optimizing, check if there are any unused throw helper blocks, @@ -125,10 +114,21 @@ PhaseStatus StackLevelSetter::DoPhase() void StackLevelSetter::ProcessBlock(BasicBlock* block) { assert(currentStackLevel == 0); +#ifndef TARGET_X86 + // Outside x86 we do not need to compute pushed/popped stack slots. + // However, we do optimize throw-helpers away here. + if (!throwHelperBlocksUsed || comp->opts.OptimizationDisabled()) + { + return; + } +#endif + LIR::ReadOnlyRange& range = LIR::AsRange(block); for (auto i = range.rbegin(); i != range.rend(); ++i) { GenTree* node = *i; + +#ifdef TARGET_X86 if (node->OperIsPutArgStkOrSplit()) { GenTreePutArgStk* putArg = node->AsPutArgStk(); @@ -145,6 +145,7 @@ void StackLevelSetter::ProcessBlock(BasicBlock* block) call->gtArgs.SetStkSizeBytes(usedStackSlotsCount * TARGET_POINTER_SIZE); #endif // UNIX_X86_ABI } +#endif if (!throwHelperBlocksUsed) { @@ -410,7 +411,12 @@ void StackLevelSetter::SubStackLevel(unsigned value) // void StackLevelSetter::CheckArgCnt() { - if (!comp->compCanEncodePtrArgCntMax()) +#ifdef JIT32_GCENCODER + // The GC encoding for fully interruptible methods does not + // support more than 1023 pushed arguments, so we have to + // use a partially interruptible GC info/encoding. + // + if (maxStackLevel >= MAX_PTRARG_OFS) { #ifdef DEBUG if (comp->verbose) @@ -431,6 +437,7 @@ void StackLevelSetter::CheckArgCnt() #endif comp->codeGen->setFramePointerRequired(true); } +#endif } //------------------------------------------------------------------------ From ab794f75fe8868d0506b9dcef56a1de878d0926d Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Fri, 14 Feb 2025 14:08:53 +0100 Subject: [PATCH 2/4] Quirk away the diffs --- src/coreclr/jit/block.cpp | 14 +++++------ src/coreclr/jit/block.h | 10 ++++---- src/coreclr/jit/lower.cpp | 35 ++++++++++++++++++++++++++++ src/coreclr/jit/lower.h | 1 + src/coreclr/jit/morph.cpp | 4 ++-- src/coreclr/jit/stacklevelsetter.cpp | 1 + 6 files changed, 51 insertions(+), 14 deletions(-) diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp index eb4d5666fcf4e8..e8b11450b9af2f 100644 --- a/src/coreclr/jit/block.cpp +++ b/src/coreclr/jit/block.cpp @@ -1432,8 +1432,8 @@ bool BasicBlock::endsWithJmpMethod(Compiler* comp) const // bool BasicBlock::endsWithTailCallOrJmp(Compiler* comp, bool fastTailCallsOnly /*=false*/) const { - GenTree* tailCall = nullptr; - bool tailCallsConvertibleToLoopOnly = false; + GenTreeCall* tailCall = nullptr; + bool tailCallsConvertibleToLoopOnly = false; return endsWithJmpMethod(comp) || endsWithTailCall(comp, fastTailCallsOnly, tailCallsConvertibleToLoopOnly, &tailCall); } @@ -1454,10 +1454,10 @@ bool BasicBlock::endsWithTailCallOrJmp(Compiler* comp, bool fastTailCallsOnly /* // Notes: // At most one of fastTailCallsOnly and tailCallsConvertibleToLoopOnly flags can be true. // -bool BasicBlock::endsWithTailCall(Compiler* comp, - bool fastTailCallsOnly, - bool tailCallsConvertibleToLoopOnly, - GenTree** tailCall) const +bool BasicBlock::endsWithTailCall(Compiler* comp, + bool fastTailCallsOnly, + bool tailCallsConvertibleToLoopOnly, + GenTreeCall** tailCall) const { assert(!fastTailCallsOnly || !tailCallsConvertibleToLoopOnly); *tailCall = nullptr; @@ -1524,7 +1524,7 @@ bool BasicBlock::endsWithTailCall(Compiler* comp, // Return Value: // true if the block ends with a tail call convertible to loop. // -bool BasicBlock::endsWithTailCallConvertibleToLoop(Compiler* comp, GenTree** tailCall) const +bool BasicBlock::endsWithTailCallConvertibleToLoop(Compiler* comp, GenTreeCall** tailCall) const { bool fastTailCallsOnly = false; bool tailCallsConvertibleToLoopOnly = true; diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index bb746593842a7c..4e838d3eb48c46 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -1765,14 +1765,14 @@ struct BasicBlock : private LIR::Range bool endsWithJmpMethod(Compiler* comp) const; - bool endsWithTailCall(Compiler* comp, - bool fastTailCallsOnly, - bool tailCallsConvertibleToLoopOnly, - GenTree** tailCall) const; + bool endsWithTailCall(Compiler* comp, + bool fastTailCallsOnly, + bool tailCallsConvertibleToLoopOnly, + GenTreeCall** tailCall) const; bool endsWithTailCallOrJmp(Compiler* comp, bool fastTailCallsOnly = false) const; - bool endsWithTailCallConvertibleToLoop(Compiler* comp, GenTree** tailCall) const; + bool endsWithTailCallConvertibleToLoop(Compiler* comp, GenTreeCall** tailCall) const; // Returns the first statement in the statement list of "this" that is // not an SSA definition (a lcl = phi(...) store). diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index eb4738d4db37fc..cfdeea7771600c 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10942,5 +10942,40 @@ void Lowering::FinalizeOutgoingArgSpace() comp->lvaOutgoingArgSpaceSize = m_outgoingArgSpaceSize; comp->lvaOutgoingArgSpaceSize.MarkAsReadOnly(); comp->lvaGetDesc(comp->lvaOutgoingArgSpaceVar)->GrowBlockLayout(comp->typGetBlkLayout(m_outgoingArgSpaceSize)); + + SetFramePointerFromArgSpaceSize(); +#endif +} + +//---------------------------------------------------------------------------------------------- +// Lowering::SetFramePointerFromArgSpaceSize: +// Set the frame pointer from the arg space size. This is a quirk because +// StackLevelSetter used to do this even outside x86. +// +void Lowering::SetFramePointerFromArgSpaceSize() +{ +#ifdef TARGET_X64 + unsigned stackLevelSpace = m_outgoingArgSpaceSize; + + if (comp->compTailCallUsed) + { + // StackLevelSetter also used to count tailcalls. + for (BasicBlock* block : comp->Blocks()) + { + GenTreeCall* tailCall; + if (block->endsWithTailCall(comp, true, false, &tailCall)) + { + stackLevelSpace = max(stackLevelSpace, tailCall->gtArgs.OutgoingArgsStackSize()); + } + } + } + + unsigned stackLevel = + (max(stackLevelSpace, (unsigned)MIN_ARG_AREA_FOR_CALL) - MIN_ARG_AREA_FOR_CALL) / TARGET_POINTER_SIZE; + + if (stackLevel >= 4) + { + comp->codeGen->setFramePointerRequired(true); + } #endif } diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 86b336c517f3c1..44e57dc7b83807 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -45,6 +45,7 @@ class Lowering final : public Phase } void FinalizeOutgoingArgSpace(); + void SetFramePointerFromArgSpaceSize(); private: // LowerRange handles new code that is introduced by or after Lowering. diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 836ad85d53e221..a74066a85ef9dd 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -13148,10 +13148,10 @@ void Compiler::fgMorphStmts(BasicBlock* block) } #if FEATURE_FASTTAILCALL - GenTree* recursiveTailCall = nullptr; + GenTreeCall* recursiveTailCall = nullptr; if (block->endsWithTailCallConvertibleToLoop(this, &recursiveTailCall)) { - fgMorphRecursiveFastTailCallIntoLoop(block, recursiveTailCall->AsCall()); + fgMorphRecursiveFastTailCallIntoLoop(block, recursiveTailCall); } #endif diff --git a/src/coreclr/jit/stacklevelsetter.cpp b/src/coreclr/jit/stacklevelsetter.cpp index f8b1b94dfd691f..484aeaf0cc8094 100644 --- a/src/coreclr/jit/stacklevelsetter.cpp +++ b/src/coreclr/jit/stacklevelsetter.cpp @@ -427,6 +427,7 @@ void StackLevelSetter::CheckArgCnt() #endif comp->SetInterruptible(false); } + if (maxStackLevel >= sizeof(unsigned)) { #ifdef DEBUG From 7add4c5166653447d79a08a8fac56626710feffa Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Fri, 14 Feb 2025 15:12:21 +0100 Subject: [PATCH 3/4] Fix ifdef --- src/coreclr/jit/lower.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index cfdeea7771600c..a721a70b64a6f5 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10954,7 +10954,6 @@ void Lowering::FinalizeOutgoingArgSpace() // void Lowering::SetFramePointerFromArgSpaceSize() { -#ifdef TARGET_X64 unsigned stackLevelSpace = m_outgoingArgSpaceSize; if (comp->compTailCallUsed) @@ -10977,5 +10976,4 @@ void Lowering::SetFramePointerFromArgSpaceSize() { comp->codeGen->setFramePointerRequired(true); } -#endif } From 62dad6ba0ec2a3c7a72f08e5be45b195a718fb1a Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Wed, 19 Feb 2025 20:43:01 +0100 Subject: [PATCH 4/4] Skip iterating blocks entirely --- src/coreclr/jit/stacklevelsetter.cpp | 36 ++++++++++++++++++---------- src/coreclr/jit/stacklevelsetter.h | 1 + 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/stacklevelsetter.cpp b/src/coreclr/jit/stacklevelsetter.cpp index 484aeaf0cc8094..6cccf3053dd9cd 100644 --- a/src/coreclr/jit/stacklevelsetter.cpp +++ b/src/coreclr/jit/stacklevelsetter.cpp @@ -32,10 +32,7 @@ StackLevelSetter::StackLevelSetter(Compiler* compiler) // PhaseStatus StackLevelSetter::DoPhase() { - for (BasicBlock* const block : comp->Blocks()) - { - ProcessBlock(block); - } + ProcessBlocks(); #if !FEATURE_FIXED_OUT_ARGS if (framePointerRequired) @@ -98,7 +95,28 @@ PhaseStatus StackLevelSetter::DoPhase() } //------------------------------------------------------------------------ -// ProcessBlock: Do stack level calculations for one block. +// ProcessBlocks: Process all the blocks if necessary. +// +void StackLevelSetter::ProcessBlocks() +{ +#ifndef TARGET_X86 + // Outside x86 we do not need to compute pushed/popped stack slots. + // However, we do optimize throw-helpers and need to process the blocks for + // that, but only when optimizing. + if (!throwHelperBlocksUsed || comp->opts.OptimizationDisabled()) + { + return; + } +#endif + + for (BasicBlock* const block : comp->Blocks()) + { + ProcessBlock(block); + } +} + +//------------------------------------------------------------------------ +// ProcessBlock: Do stack level and throw helper determinations for one block. // // Notes: // Block starts and ends with an empty outgoing stack. @@ -114,14 +132,6 @@ PhaseStatus StackLevelSetter::DoPhase() void StackLevelSetter::ProcessBlock(BasicBlock* block) { assert(currentStackLevel == 0); -#ifndef TARGET_X86 - // Outside x86 we do not need to compute pushed/popped stack slots. - // However, we do optimize throw-helpers away here. - if (!throwHelperBlocksUsed || comp->opts.OptimizationDisabled()) - { - return; - } -#endif LIR::ReadOnlyRange& range = LIR::AsRange(block); for (auto i = range.rbegin(); i != range.rend(); ++i) diff --git a/src/coreclr/jit/stacklevelsetter.h b/src/coreclr/jit/stacklevelsetter.h index 7813e0e645e078..55fdeb8428a274 100644 --- a/src/coreclr/jit/stacklevelsetter.h +++ b/src/coreclr/jit/stacklevelsetter.h @@ -14,6 +14,7 @@ class StackLevelSetter final : public Phase virtual PhaseStatus DoPhase() override; private: + void ProcessBlocks(); void ProcessBlock(BasicBlock* block); void SetThrowHelperBlocks(GenTree* node, BasicBlock* block);