From eb6c689650dfc70344c0212ab74261ec9a90cb99 Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Mon, 27 Oct 2025 00:11:52 +0200 Subject: [PATCH 01/30] Bring throw helpers to PUSH_COOP_PINVOKE_FRAME plan --- src/coreclr/vm/amd64/AsmHelpers.asm | 44 +++++ src/coreclr/vm/amd64/asmhelpers.S | 45 +++++ src/coreclr/vm/arm/asmhelpers.S | 46 +++++ src/coreclr/vm/arm64/asmhelpers.S | 45 +++++ src/coreclr/vm/arm64/asmhelpers.asm | 44 +++++ src/coreclr/vm/excep.cpp | 246 ++++++++++++++++++++++++ src/coreclr/vm/frames.h | 2 - src/coreclr/vm/i386/asmhelpers.asm | 6 +- src/coreclr/vm/jithelpers.cpp | 30 +-- src/coreclr/vm/loongarch64/asmhelpers.S | 66 +++++++ src/coreclr/vm/riscv64/asmhelpers.S | 71 +++++++ 11 files changed, 613 insertions(+), 32 deletions(-) diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index 83e53a633ded54..5b42128222ed42 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -1200,4 +1200,48 @@ NESTED_END CallJittedMethodRetI8, _TEXT endif ; FEATURE_INTERPRETER +;========================================================================== +; Capture a transition block with register values and call the IL_Throw_Impl +; implementation written in C. +; +; Input state: +; RCX = Pointer to exception object +;========================================================================== +NESTED_ENTRY IL_Throw, _TEXT + PUSH_COOP_PINVOKE_FRAME rdx + ; RCX already contains exception object + ; RDX contains pointer to TransitionBlock + call IL_Throw_Impl + ; Should never return + int 3 +NESTED_END IL_Throw, _TEXT + +;========================================================================== +; Capture a transition block with register values and call the IL_ThrowExact_Impl +; implementation written in C. 
+; +; Input state: +; RCX = Pointer to exception object +;========================================================================== +NESTED_ENTRY IL_ThrowExact, _TEXT + PUSH_COOP_PINVOKE_FRAME rdx + ; RCX already contains exception object + ; RDX contains pointer to TransitionBlock + call IL_ThrowExact_Impl + ; Should never return + int 3 +NESTED_END IL_ThrowExact, _TEXT + +;========================================================================== +; Capture a transition block with register values and call the IL_Rethrow_Impl +; implementation written in C. +;========================================================================== +NESTED_ENTRY IL_Rethrow, _TEXT + PUSH_COOP_PINVOKE_FRAME rcx + ; RCX contains pointer to TransitionBlock + call IL_Rethrow_Impl + ; Should never return + int 3 +NESTED_END IL_Rethrow, _TEXT + end diff --git a/src/coreclr/vm/amd64/asmhelpers.S b/src/coreclr/vm/amd64/asmhelpers.S index 9f5fd30792dcf4..148af0ac0e013f 100644 --- a/src/coreclr/vm/amd64/asmhelpers.S +++ b/src/coreclr/vm/amd64/asmhelpers.S @@ -1913,3 +1913,48 @@ END_PROLOGUE NESTED_END CallJittedMethodRetDoubleDouble, _TEXT #endif // FEATURE_INTERPRETER + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_Throw_Impl +// implementation written in C. +// +// Input state: +// rdi = Pointer to exception object +// ------------------------------------------------------------------ +NESTED_ENTRY IL_Throw, _TEXT, NoHandler + PUSH_COOP_PINVOKE_FRAME rsi + // rdi already contains exception object + // rsi contains pointer to TransitionBlock + call C_FUNC(IL_Throw_Impl) + // Should never return + int3 +NESTED_END IL_Throw, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_ThrowExact_Impl +// implementation written in C. 
+// +// Input state: +// rdi = Pointer to exception object +// ------------------------------------------------------------------ +NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler + PUSH_COOP_PINVOKE_FRAME rsi + // rdi already contains exception object + // rsi contains pointer to TransitionBlock + call C_FUNC(IL_ThrowExact_Impl) + // Should never return + int3 +NESTED_END IL_ThrowExact, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_Rethrow_Impl +// implementation written in C. +// ------------------------------------------------------------------ +NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler + PUSH_COOP_PINVOKE_FRAME rdi + // rdi contains pointer to TransitionBlock + call C_FUNC(IL_Rethrow_Impl) + // Should never return + int3 +NESTED_END IL_Rethrow, _TEXT + diff --git a/src/coreclr/vm/arm/asmhelpers.S b/src/coreclr/vm/arm/asmhelpers.S index 74cf86fd5f31b1..50281235a3b74b 100644 --- a/src/coreclr/vm/arm/asmhelpers.S +++ b/src/coreclr/vm/arm/asmhelpers.S @@ -897,3 +897,49 @@ LEAF_ENTRY ThisPtrRetBufPrecodeWorker, _TEXT eor r0, r0, r1 EPILOG_BRANCH_REG r12 LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_Throw_Impl +// implementation written in C. +// +// Input state: +// r0 = Pointer to exception object +// ------------------------------------------------------------------ +NESTED_ENTRY IL_Throw, _TEXT, NoHandler + PROLOG_PUSH "{r0-r3,r11,lr}" + PROLOG_VPUSH "{d0-d7}" + mov r1, sp // r1 = TransitionBlock* + bl C_FUNC(IL_Throw_Impl) + // Should never return + bkpt #0 +NESTED_END IL_Throw, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_ThrowExact_Impl +// implementation written in C. 
+// +// Input state: +// r0 = Pointer to exception object +// ------------------------------------------------------------------ +NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler + PROLOG_PUSH "{r0-r3,r11,lr}" + PROLOG_VPUSH "{d0-d7}" + mov r1, sp // r1 = TransitionBlock* + bl C_FUNC(IL_ThrowExact_Impl) + // Should never return + bkpt #0 +NESTED_END IL_ThrowExact, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_Rethrow_Impl +// implementation written in C. +// ------------------------------------------------------------------ +NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler + PROLOG_PUSH "{r0-r3,r11,lr}" + PROLOG_VPUSH "{d0-d7}" + mov r0, sp // r0 = TransitionBlock* + bl C_FUNC(IL_Rethrow_Impl) + // Should never return + bkpt #0 +NESTED_END IL_Rethrow, _TEXT + diff --git a/src/coreclr/vm/arm64/asmhelpers.S b/src/coreclr/vm/arm64/asmhelpers.S index 046482ea6e996c..cf323d61b12255 100644 --- a/src/coreclr/vm/arm64/asmhelpers.S +++ b/src/coreclr/vm/arm64/asmhelpers.S @@ -2752,3 +2752,48 @@ NESTED_END CallJittedMethodRet4Vector128, _TEXT #endif // FEATURE_INTERPRETER + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_Throw_Impl +// implementation written in C. +// +// Input state: +// x0 = Pointer to exception object +// ------------------------------------------------------------------ +NESTED_ENTRY IL_Throw, _TEXT, NoHandler + PUSH_COOP_PINVOKE_FRAME x1 + // x0 already contains exception object + // x1 contains pointer to TransitionBlock + bl C_FUNC(IL_Throw_Impl) + // Should never return + brk #0 +NESTED_END IL_Throw, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_ThrowExact_Impl +// implementation written in C. 
+// +// Input state: +// x0 = Pointer to exception object +// ------------------------------------------------------------------ +NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler + PUSH_COOP_PINVOKE_FRAME x1 + // x0 already contains exception object + // x1 contains pointer to TransitionBlock + bl C_FUNC(IL_ThrowExact_Impl) + // Should never return + brk #0 +NESTED_END IL_ThrowExact, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_Rethrow_Impl +// implementation written in C. +// ------------------------------------------------------------------ +NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler + PUSH_COOP_PINVOKE_FRAME x0 + // x0 contains pointer to TransitionBlock + bl C_FUNC(IL_Rethrow_Impl) + // Should never return + brk #0 +NESTED_END IL_Rethrow, _TEXT + diff --git a/src/coreclr/vm/arm64/asmhelpers.asm b/src/coreclr/vm/arm64/asmhelpers.asm index 281ca3bd0e85bc..d8caf2fc23fc32 100644 --- a/src/coreclr/vm/arm64/asmhelpers.asm +++ b/src/coreclr/vm/arm64/asmhelpers.asm @@ -2981,5 +2981,49 @@ CopyLoop #endif // FEATURE_INTERPRETER +; ------------------------------------------------------------------ +; Capture a transition block with register values and call the IL_Throw_Impl +; implementation written in C. +; +; Input state: +; x0 = Pointer to exception object +; ------------------------------------------------------------------ + NESTED_ENTRY IL_Throw, _TEXT, NoHandler + PUSH_COOP_PINVOKE_FRAME x1 + ; x0 already contains exception object + ; x1 contains pointer to TransitionBlock + bl IL_Throw_Impl + ; Should never return + brk #0 + NESTED_END IL_Throw, _TEXT + +; ------------------------------------------------------------------ +; Capture a transition block with register values and call the IL_ThrowExact_Impl +; implementation written in C. 
+; +; Input state: +; x0 = Pointer to exception object +; ------------------------------------------------------------------ + NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler + PUSH_COOP_PINVOKE_FRAME x1 + ; x0 already contains exception object + ; x1 contains pointer to TransitionBlock + bl IL_ThrowExact_Impl + ; Should never return + brk #0 + NESTED_END IL_ThrowExact, _TEXT + +; ------------------------------------------------------------------ +; Capture a transition block with register values and call the IL_Rethrow_Impl +; implementation written in C. +; ------------------------------------------------------------------ + NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler + PUSH_COOP_PINVOKE_FRAME x0 + ; x0 contains pointer to TransitionBlock + bl IL_Rethrow_Impl + ; Should never return + brk #0 + NESTED_END IL_Rethrow, _TEXT + ; Must be at very end of file END diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index fcc8835a49f1e4..ce3d02b9fab1b0 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10871,6 +10871,252 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_ReturnAddress = pTransitionBlock->m_ReturnAddress; } +#elif defined(TARGET_AMD64) + +void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *pTransitionBlock) +{ + LIMITED_METHOD_CONTRACT; + + m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + m_Context.SegCs = 0; + m_Context.SegSs = 0; + m_Context.EFlags = 0; + +#ifdef UNIX_AMD64_ABI + // On Unix AMD64, argument registers are saved in the transition block + m_Context.Rax = 0; + m_Context.Rdi = pTransitionBlock->m_argumentRegisters.rdi; + m_Context.Rsi = pTransitionBlock->m_argumentRegisters.rsi; + m_Context.Rdx = pTransitionBlock->m_argumentRegisters.rdx; + m_Context.Rcx = pTransitionBlock->m_argumentRegisters.rcx; + m_Context.R8 = pTransitionBlock->m_argumentRegisters.r8; + m_Context.R9 = pTransitionBlock->m_argumentRegisters.r9; + + m_ContextPointers.Rdi 
= &m_Context.Rdi; + m_ContextPointers.Rsi = &m_Context.Rsi; + m_ContextPointers.Rdx = &m_Context.Rdx; + m_ContextPointers.Rcx = &m_Context.Rcx; + m_ContextPointers.R8 = &m_Context.R8; + m_ContextPointers.R9 = &m_Context.R9; +#else + // On Windows AMD64, argument registers are not saved in the transition block + m_Context.Rax = 0; + m_Context.Rcx = 0; + m_Context.Rdx = 0; + m_Context.R8 = 0; + m_Context.R9 = 0; +#endif + +#define CALLEE_SAVED_REGISTER(reg) \ + m_Context.reg = pTransitionBlock->m_calleeSavedRegisters.reg; \ + m_ContextPointers.reg = &m_Context.reg; + ENUM_CALLEE_SAVED_REGISTERS(); +#undef CALLEE_SAVED_REGISTER + + m_Context.Rsp = (UINT_PTR)(pTransitionBlock + 1); + m_Context.Rip = pTransitionBlock->m_ReturnAddress; + m_ReturnAddress = pTransitionBlock->m_ReturnAddress; +} + +#elif defined(TARGET_ARM) + +void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *pTransitionBlock) +{ + LIMITED_METHOD_CONTRACT; + + m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + + // Copy argument registers (R0-R3) + m_Context.R0 = pTransitionBlock->m_argumentRegisters.r[0]; + m_Context.R1 = pTransitionBlock->m_argumentRegisters.r[1]; + m_Context.R2 = pTransitionBlock->m_argumentRegisters.r[2]; + m_Context.R3 = pTransitionBlock->m_argumentRegisters.r[3]; + + // Copy callee-saved registers (R4-R11, Lr) + m_Context.R4 = pTransitionBlock->m_calleeSavedRegisters.r4; + m_Context.R5 = pTransitionBlock->m_calleeSavedRegisters.r5; + m_Context.R6 = pTransitionBlock->m_calleeSavedRegisters.r6; + m_Context.R7 = pTransitionBlock->m_calleeSavedRegisters.r7; + m_Context.R8 = pTransitionBlock->m_calleeSavedRegisters.r8; + m_Context.R9 = pTransitionBlock->m_calleeSavedRegisters.r9; + m_Context.R10 = pTransitionBlock->m_calleeSavedRegisters.r10; + m_Context.R11 = pTransitionBlock->m_calleeSavedRegisters.r11; + m_Context.Lr = pTransitionBlock->m_calleeSavedRegisters.r14; // r14 is link register + + // Set up context pointers for callee-saved registers + 
m_ContextPointers.R4 = &m_Context.R4; + m_ContextPointers.R5 = &m_Context.R5; + m_ContextPointers.R6 = &m_Context.R6; + m_ContextPointers.R7 = &m_Context.R7; + m_ContextPointers.R8 = &m_Context.R8; + m_ContextPointers.R9 = &m_Context.R9; + m_ContextPointers.R10 = &m_Context.R10; + m_ContextPointers.R11 = &m_Context.R11; + m_ContextPointers.Lr = &m_Context.Lr; + + m_Context.Sp = (UINT_PTR)(pTransitionBlock + 1); + m_Context.Pc = pTransitionBlock->m_ReturnAddress; + m_ReturnAddress = pTransitionBlock->m_ReturnAddress; +} + +#elif defined(TARGET_ARM64) + +void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *pTransitionBlock) +{ + LIMITED_METHOD_CONTRACT; + + m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + + // Copy argument registers (X0-X7) + for (int i = 0; i < 8; i++) + { + m_Context.X[i] = pTransitionBlock->m_argumentRegisters.x[i]; + } + + // Copy return buffer register (X8) + m_Context.X8 = pTransitionBlock->m_x8RetBuffReg; + + // Copy callee-saved registers (X19-X28) + m_Context.X19 = pTransitionBlock->m_calleeSavedRegisters.x19; + m_Context.X20 = pTransitionBlock->m_calleeSavedRegisters.x20; + m_Context.X21 = pTransitionBlock->m_calleeSavedRegisters.x21; + m_Context.X22 = pTransitionBlock->m_calleeSavedRegisters.x22; + m_Context.X23 = pTransitionBlock->m_calleeSavedRegisters.x23; + m_Context.X24 = pTransitionBlock->m_calleeSavedRegisters.x24; + m_Context.X25 = pTransitionBlock->m_calleeSavedRegisters.x25; + m_Context.X26 = pTransitionBlock->m_calleeSavedRegisters.x26; + m_Context.X27 = pTransitionBlock->m_calleeSavedRegisters.x27; + m_Context.X28 = pTransitionBlock->m_calleeSavedRegisters.x28; + + // Copy frame pointer and link register + m_Context.Fp = pTransitionBlock->m_calleeSavedRegisters.x29; + m_Context.Lr = pTransitionBlock->m_calleeSavedRegisters.x30; + + // Set up context pointers for callee-saved registers + m_ContextPointers.X19 = &m_Context.X19; + m_ContextPointers.X20 = &m_Context.X20; + 
m_ContextPointers.X21 = &m_Context.X21; + m_ContextPointers.X22 = &m_Context.X22; + m_ContextPointers.X23 = &m_Context.X23; + m_ContextPointers.X24 = &m_Context.X24; + m_ContextPointers.X25 = &m_Context.X25; + m_ContextPointers.X26 = &m_Context.X26; + m_ContextPointers.X27 = &m_Context.X27; + m_ContextPointers.X28 = &m_Context.X28; + m_ContextPointers.Fp = &m_Context.Fp; + m_ContextPointers.Lr = &m_Context.Lr; + + m_Context.Sp = (UINT_PTR)(pTransitionBlock + 1); + m_Context.Pc = pTransitionBlock->m_ReturnAddress; + m_ReturnAddress = pTransitionBlock->m_ReturnAddress; +} + +#elif defined(TARGET_LOONGARCH64) + +void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *pTransitionBlock) +{ + LIMITED_METHOD_CONTRACT; + + m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + + // Copy argument registers (A0-A7) + m_Context.A0 = pTransitionBlock->m_argumentRegisters.a[0]; + m_Context.A1 = pTransitionBlock->m_argumentRegisters.a[1]; + m_Context.A2 = pTransitionBlock->m_argumentRegisters.a[2]; + m_Context.A3 = pTransitionBlock->m_argumentRegisters.a[3]; + m_Context.A4 = pTransitionBlock->m_argumentRegisters.a[4]; + m_Context.A5 = pTransitionBlock->m_argumentRegisters.a[5]; + m_Context.A6 = pTransitionBlock->m_argumentRegisters.a[6]; + m_Context.A7 = pTransitionBlock->m_argumentRegisters.a[7]; + + // Copy callee-saved registers (Fp, Ra, S0-S8) + m_Context.Fp = pTransitionBlock->m_calleeSavedRegisters.fp; + m_Context.Ra = pTransitionBlock->m_calleeSavedRegisters.ra; + m_Context.S0 = pTransitionBlock->m_calleeSavedRegisters.s0; + m_Context.S1 = pTransitionBlock->m_calleeSavedRegisters.s1; + m_Context.S2 = pTransitionBlock->m_calleeSavedRegisters.s2; + m_Context.S3 = pTransitionBlock->m_calleeSavedRegisters.s3; + m_Context.S4 = pTransitionBlock->m_calleeSavedRegisters.s4; + m_Context.S5 = pTransitionBlock->m_calleeSavedRegisters.s5; + m_Context.S6 = pTransitionBlock->m_calleeSavedRegisters.s6; + m_Context.S7 = 
pTransitionBlock->m_calleeSavedRegisters.s7; + m_Context.S8 = pTransitionBlock->m_calleeSavedRegisters.s8; + + // Set up context pointers for callee-saved registers + m_ContextPointers.S0 = &m_Context.S0; + m_ContextPointers.S1 = &m_Context.S1; + m_ContextPointers.S2 = &m_Context.S2; + m_ContextPointers.S3 = &m_Context.S3; + m_ContextPointers.S4 = &m_Context.S4; + m_ContextPointers.S5 = &m_Context.S5; + m_ContextPointers.S6 = &m_Context.S6; + m_ContextPointers.S7 = &m_Context.S7; + m_ContextPointers.S8 = &m_Context.S8; + m_ContextPointers.Fp = &m_Context.Fp; + m_ContextPointers.Ra = &m_Context.Ra; + + m_Context.Sp = (UINT_PTR)(pTransitionBlock + 1); + m_Context.Pc = pTransitionBlock->m_ReturnAddress; + m_ReturnAddress = pTransitionBlock->m_ReturnAddress; +} + +#elif defined(TARGET_RISCV64) + +void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *pTransitionBlock) +{ + LIMITED_METHOD_CONTRACT; + + m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + + // Copy argument registers (A0-A7) + m_Context.A0 = pTransitionBlock->m_argumentRegisters.a[0]; + m_Context.A1 = pTransitionBlock->m_argumentRegisters.a[1]; + m_Context.A2 = pTransitionBlock->m_argumentRegisters.a[2]; + m_Context.A3 = pTransitionBlock->m_argumentRegisters.a[3]; + m_Context.A4 = pTransitionBlock->m_argumentRegisters.a[4]; + m_Context.A5 = pTransitionBlock->m_argumentRegisters.a[5]; + m_Context.A6 = pTransitionBlock->m_argumentRegisters.a[6]; + m_Context.A7 = pTransitionBlock->m_argumentRegisters.a[7]; + + // Copy callee-saved registers (Fp, Ra, S1-S11, Tp, Gp) + m_Context.Fp = pTransitionBlock->m_calleeSavedRegisters.fp; + m_Context.Ra = pTransitionBlock->m_calleeSavedRegisters.ra; + m_Context.S1 = pTransitionBlock->m_calleeSavedRegisters.s1; + m_Context.S2 = pTransitionBlock->m_calleeSavedRegisters.s2; + m_Context.S3 = pTransitionBlock->m_calleeSavedRegisters.s3; + m_Context.S4 = pTransitionBlock->m_calleeSavedRegisters.s4; + m_Context.S5 = 
pTransitionBlock->m_calleeSavedRegisters.s5; + m_Context.S6 = pTransitionBlock->m_calleeSavedRegisters.s6; + m_Context.S7 = pTransitionBlock->m_calleeSavedRegisters.s7; + m_Context.S8 = pTransitionBlock->m_calleeSavedRegisters.s8; + m_Context.S9 = pTransitionBlock->m_calleeSavedRegisters.s9; + m_Context.S10 = pTransitionBlock->m_calleeSavedRegisters.s10; + m_Context.S11 = pTransitionBlock->m_calleeSavedRegisters.s11; + m_Context.Tp = pTransitionBlock->m_calleeSavedRegisters.tp; + m_Context.Gp = pTransitionBlock->m_calleeSavedRegisters.gp; + + // Set up context pointers for callee-saved registers + m_ContextPointers.S1 = &m_Context.S1; + m_ContextPointers.S2 = &m_Context.S2; + m_ContextPointers.S3 = &m_Context.S3; + m_ContextPointers.S4 = &m_Context.S4; + m_ContextPointers.S5 = &m_Context.S5; + m_ContextPointers.S6 = &m_Context.S6; + m_ContextPointers.S7 = &m_Context.S7; + m_ContextPointers.S8 = &m_Context.S8; + m_ContextPointers.S9 = &m_Context.S9; + m_ContextPointers.S10 = &m_Context.S10; + m_ContextPointers.S11 = &m_Context.S11; + m_ContextPointers.Fp = &m_Context.Fp; + m_ContextPointers.Gp = &m_Context.Gp; + m_ContextPointers.Tp = &m_Context.Tp; + m_ContextPointers.Ra = &m_Context.Ra; + + m_Context.Sp = (UINT_PTR)(pTransitionBlock + 1); + m_Context.Pc = pTransitionBlock->m_ReturnAddress; + m_ReturnAddress = pTransitionBlock->m_ReturnAddress; +} + #endif // TARGET_X86 // diff --git a/src/coreclr/vm/frames.h b/src/coreclr/vm/frames.h index 47e1175a7d157a..cac6e19c63e531 100644 --- a/src/coreclr/vm/frames.h +++ b/src/coreclr/vm/frames.h @@ -989,9 +989,7 @@ class SoftwareExceptionFrame : public Frame LIMITED_METHOD_CONTRACT; } -#ifdef TARGET_X86 void UpdateContextFromTransitionBlock(TransitionBlock *pTransitionBlock); -#endif #endif TADDR GetReturnAddressPtr_Impl() diff --git a/src/coreclr/vm/i386/asmhelpers.asm b/src/coreclr/vm/i386/asmhelpers.asm index 2b74e0ac6d9b51..f0b53455505d95 100644 --- a/src/coreclr/vm/i386/asmhelpers.asm +++ 
b/src/coreclr/vm/i386/asmhelpers.asm @@ -1635,7 +1635,7 @@ FASTCALL_FUNC IL_Throw, 4 STUB_PROLOG mov edx, esp - call @IL_Throw_x86@8 + call @IL_Throw_Impl@8 STUB_EPILOG ret 4 @@ -1652,7 +1652,7 @@ FASTCALL_FUNC IL_ThrowExact, 4 STUB_PROLOG mov edx, esp - call @IL_ThrowExact_x86@8 + call @IL_ThrowExact_Impl@8 STUB_EPILOG ret 4 @@ -1666,7 +1666,7 @@ FASTCALL_FUNC IL_Rethrow, 0 STUB_PROLOG mov ecx, esp - call @IL_Rethrow_x86@4 + call @IL_Rethrow_Impl@4 STUB_EPILOG ret 4 diff --git a/src/coreclr/vm/jithelpers.cpp b/src/coreclr/vm/jithelpers.cpp index cd97c0fe3d26df..471b5a13517b1e 100644 --- a/src/coreclr/vm/jithelpers.cpp +++ b/src/coreclr/vm/jithelpers.cpp @@ -765,12 +765,8 @@ HCIMPLEND /*************************************************************/ -#if defined(TARGET_X86) EXTERN_C FCDECL1(void, IL_Throw, Object* obj); -EXTERN_C HCIMPL2(void, IL_Throw_x86, Object* obj, TransitionBlock* transitionBlock) -#else -HCIMPL1(void, IL_Throw, Object* obj) -#endif +EXTERN_C HCIMPL2(void, IL_Throw_Impl, Object* obj, TransitionBlock* transitionBlock) { FCALL_CONTRACT; @@ -782,11 +778,7 @@ HCIMPL1(void, IL_Throw, Object* obj) Thread *pThread = GetThread(); SoftwareExceptionFrame exceptionFrame; -#ifdef TARGET_X86 exceptionFrame.UpdateContextFromTransitionBlock(transitionBlock); -#else - RtlCaptureContext(exceptionFrame.GetContext()); -#endif exceptionFrame.InitAndLink(pThread); FC_CAN_TRIGGER_GC(); @@ -804,23 +796,15 @@ HCIMPLEND /*************************************************************/ -#if defined(TARGET_X86) EXTERN_C FCDECL0(void, IL_Rethrow); -EXTERN_C HCIMPL1(void, IL_Rethrow_x86, TransitionBlock* transitionBlock) -#else -HCIMPL0(void, IL_Rethrow) -#endif +EXTERN_C HCIMPL1(void, IL_Rethrow_Impl, TransitionBlock* transitionBlock) { FCALL_CONTRACT; Thread *pThread = GetThread(); SoftwareExceptionFrame exceptionFrame; -#ifdef TARGET_X86 exceptionFrame.UpdateContextFromTransitionBlock(transitionBlock); -#else - RtlCaptureContext(exceptionFrame.GetContext()); -#endif 
exceptionFrame.InitAndLink(pThread); FC_CAN_TRIGGER_GC(); @@ -832,12 +816,8 @@ HCIMPL0(void, IL_Rethrow) } HCIMPLEND -#if defined(TARGET_X86) EXTERN_C FCDECL1(void, IL_ThrowExact, Object* obj); -EXTERN_C HCIMPL2(void, IL_ThrowExact_x86, Object* obj, TransitionBlock* transitionBlock) -#else -HCIMPL1(void, IL_ThrowExact, Object* obj) -#endif +EXTERN_C HCIMPL2(void, IL_ThrowExact_Impl, Object* obj, TransitionBlock* transitionBlock) { FCALL_CONTRACT; @@ -850,11 +830,7 @@ HCIMPL1(void, IL_ThrowExact, Object* obj) Thread *pThread = GetThread(); SoftwareExceptionFrame exceptionFrame; -#ifdef TARGET_X86 exceptionFrame.UpdateContextFromTransitionBlock(transitionBlock); -#else - RtlCaptureContext(exceptionFrame.GetContext()); -#endif exceptionFrame.InitAndLink(pThread); FC_CAN_TRIGGER_GC(); diff --git a/src/coreclr/vm/loongarch64/asmhelpers.S b/src/coreclr/vm/loongarch64/asmhelpers.S index 457f21b34ef022..292846d7a2b6e7 100644 --- a/src/coreclr/vm/loongarch64/asmhelpers.S +++ b/src/coreclr/vm/loongarch64/asmhelpers.S @@ -1020,3 +1020,69 @@ LEAF_ENTRY ThisPtrRetBufPrecodeWorker, _TEXT move $a1, $t0 // Move temp register to first arg register for static method with return buffer EPILOG_BRANCH_REG $METHODDESC_REGISTER LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_Throw_Impl +// implementation written in C. 
+// +// Input state: +// $a0 = Pointer to exception object +// ------------------------------------------------------------------ +NESTED_ENTRY IL_Throw, _TEXT, NoHandler + PROLOG_SAVE_REG_PAIR_INDEXED 22, 1, 0x80 + // Spill callee saved registers + PROLOG_SAVE_REG_PAIR 23, 24, 16 + PROLOG_SAVE_REG_PAIR 25, 26, 32 + PROLOG_SAVE_REG_PAIR 27, 28, 48 + PROLOG_SAVE_REG_PAIR 29, 30, 64 + PROLOG_SAVE_REG 31, 80 + + // $a0 already contains exception object + ori $a1, $sp, 0 // $a1 = TransitionBlock* + bl C_FUNC(IL_Throw_Impl) + // Should never return + break 0 +NESTED_END IL_Throw, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_ThrowExact_Impl +// implementation written in C. +// +// Input state: +// $a0 = Pointer to exception object +// ------------------------------------------------------------------ +NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler + PROLOG_SAVE_REG_PAIR_INDEXED 22, 1, 0x80 + // Spill callee saved registers + PROLOG_SAVE_REG_PAIR 23, 24, 16 + PROLOG_SAVE_REG_PAIR 25, 26, 32 + PROLOG_SAVE_REG_PAIR 27, 28, 48 + PROLOG_SAVE_REG_PAIR 29, 30, 64 + PROLOG_SAVE_REG 31, 80 + + // $a0 already contains exception object + ori $a1, $sp, 0 // $a1 = TransitionBlock* + bl C_FUNC(IL_ThrowExact_Impl) + // Should never return + break 0 +NESTED_END IL_ThrowExact, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_Rethrow_Impl +// implementation written in C. 
+// ------------------------------------------------------------------ +NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler + PROLOG_SAVE_REG_PAIR_INDEXED 22, 1, 0x80 + // Spill callee saved registers + PROLOG_SAVE_REG_PAIR 23, 24, 16 + PROLOG_SAVE_REG_PAIR 25, 26, 32 + PROLOG_SAVE_REG_PAIR 27, 28, 48 + PROLOG_SAVE_REG_PAIR 29, 30, 64 + PROLOG_SAVE_REG 31, 80 + + ori $a0, $sp, 0 // $a0 = TransitionBlock* + bl C_FUNC(IL_Rethrow_Impl) + // Should never return + break 0 +NESTED_END IL_Rethrow, _TEXT + diff --git a/src/coreclr/vm/riscv64/asmhelpers.S b/src/coreclr/vm/riscv64/asmhelpers.S index 64b24957211ec8..57ed0083c9aad7 100644 --- a/src/coreclr/vm/riscv64/asmhelpers.S +++ b/src/coreclr/vm/riscv64/asmhelpers.S @@ -878,6 +878,77 @@ LEAF_ENTRY ThisPtrRetBufPrecodeWorker, _TEXT EPILOG_BRANCH_REG t2 LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_Throw_Impl +// implementation written in C. +// +// Input state: +// a0 = Pointer to exception object +// ------------------------------------------------------------------ +NESTED_ENTRY IL_Throw, _TEXT, NoHandler + PROLOG_SAVE_REG_PAIR_INDEXED fp, ra, 0xa0 + // Spill callee saved registers + PROLOG_SAVE_REG_PAIR s1, s2, 16 + PROLOG_SAVE_REG_PAIR s3, s4, 32 + PROLOG_SAVE_REG_PAIR s5, s6, 48 + PROLOG_SAVE_REG_PAIR s7, s8, 64 + PROLOG_SAVE_REG_PAIR s9, s10, 80 + PROLOG_SAVE_REG_PAIR s11, gp, 96 + PROLOG_SAVE_REG tp, 112 + + // a0 already contains exception object + addi a1, sp, 0 // a1 = TransitionBlock* + call C_FUNC(IL_Throw_Impl) + // Should never return + ebreak +NESTED_END IL_Throw, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_ThrowExact_Impl +// implementation written in C. 
+// +// Input state: +// a0 = Pointer to exception object +// ------------------------------------------------------------------ +NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler + PROLOG_SAVE_REG_PAIR_INDEXED fp, ra, 0xa0 + // Spill callee saved registers + PROLOG_SAVE_REG_PAIR s1, s2, 16 + PROLOG_SAVE_REG_PAIR s3, s4, 32 + PROLOG_SAVE_REG_PAIR s5, s6, 48 + PROLOG_SAVE_REG_PAIR s7, s8, 64 + PROLOG_SAVE_REG_PAIR s9, s10, 80 + PROLOG_SAVE_REG_PAIR s11, gp, 96 + PROLOG_SAVE_REG tp, 112 + + // a0 already contains exception object + addi a1, sp, 0 // a1 = TransitionBlock* + call C_FUNC(IL_ThrowExact_Impl) + // Should never return + ebreak +NESTED_END IL_ThrowExact, _TEXT + +// ------------------------------------------------------------------ +// Capture a transition block with register values and call the IL_Rethrow_Impl +// implementation written in C. +// ------------------------------------------------------------------ +NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler + PROLOG_SAVE_REG_PAIR_INDEXED fp, ra, 0xa0 + // Spill callee saved registers + PROLOG_SAVE_REG_PAIR s1, s2, 16 + PROLOG_SAVE_REG_PAIR s3, s4, 32 + PROLOG_SAVE_REG_PAIR s5, s6, 48 + PROLOG_SAVE_REG_PAIR s7, s8, 64 + PROLOG_SAVE_REG_PAIR s9, s10, 80 + PROLOG_SAVE_REG_PAIR s11, gp, 96 + PROLOG_SAVE_REG tp, 112 + + addi a0, sp, 0 // a0 = TransitionBlock* + call C_FUNC(IL_Rethrow_Impl) + // Should never return + ebreak +NESTED_END IL_Rethrow, _TEXT + #ifdef FEATURE_INTERPRETER // Align interpreter stack by adjusting it by 8 bytes From 5359b4cd9b0e4515f7d66e80effd83408e2e3a08 Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Fri, 9 Jan 2026 00:42:34 +0200 Subject: [PATCH 02/30] Handle FP regs --- src/coreclr/pal/inc/unixasmmacrosamd64.inc | 28 ++++ src/coreclr/pal/inc/unixasmmacrosarm64.inc | 38 ++++- .../pal/inc/unixasmmacrosloongarch64.inc | 29 ++++ src/coreclr/pal/inc/unixasmmacrosriscv64.inc | 30 ++++ src/coreclr/vm/amd64/AsmHelpers.asm | 9 +- src/coreclr/vm/amd64/AsmMacros.inc | 
32 ++++ src/coreclr/vm/amd64/asmhelpers.S | 6 +- src/coreclr/vm/arm64/asmhelpers.S | 14 +- src/coreclr/vm/arm64/asmhelpers.asm | 18 +-- src/coreclr/vm/arm64/asmmacros.h | 38 +++++ src/coreclr/vm/excep.cpp | 140 +++++++++++++++--- src/coreclr/vm/frames.h | 2 +- src/coreclr/vm/i386/asmhelpers.asm | 3 + src/coreclr/vm/loongarch64/asmhelpers.S | 33 +---- src/coreclr/vm/riscv64/asmhelpers.S | 39 +---- 15 files changed, 355 insertions(+), 104 deletions(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index 90c8947e754297..b443b3eefb8f63 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -428,6 +428,34 @@ C_FUNC(\Name\()_End): POP_CALLEE_SAVED_REGISTERS .endm +// Pushes a full TransitionBlock on the stack including argument registers and +// floating point argument registers. Used for exception throw helpers where we +// need to capture the complete register state. +// +// Stack layout (from high to low address after prologue): +// Return address +// CalleeSavedRegisters (rbp, rbx, r15, r14, r13, r12 - 48 bytes) +// ArgumentRegisters (r9, r8, rcx, rdx, rsi, rdi - 48 bytes) <- TransitionBlock +// Padding (8 bytes for 16-byte alignment) +// FloatArgumentRegisters (xmm0-xmm7, 128 bytes) +// sp points here +// +// On exit, \target contains the TransitionBlock pointer (after float args area). 
+.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target + set_cfa_register rsp, 8 + + PUSH_CALLEE_SAVED_REGISTERS + PUSH_ARGUMENT_REGISTERS + + // Allocate space for float argument registers (128 bytes) + alignment (8 bytes) + alloc_stack 136 + SAVE_FLOAT_ARGUMENT_REGISTERS 0 + + END_PROLOGUE + + lea \target, [rsp + 136] +.endm + .macro INLINE_GETTHREAD // Inlined version of call C_FUNC(RhpGetThread) INLINE_GET_TLS_VAR t_CurrentThreadInfo diff --git a/src/coreclr/pal/inc/unixasmmacrosarm64.inc b/src/coreclr/pal/inc/unixasmmacrosarm64.inc index fa4265ab3fc9d2..6f17f6f5f1e1c9 100644 --- a/src/coreclr/pal/inc/unixasmmacrosarm64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosarm64.inc @@ -419,7 +419,43 @@ C_FUNC(\Name\()_End): EPILOG_RESTORE_REG_PAIR x25, x26, 64 EPILOG_RESTORE_REG_PAIR x27, x28, 80 EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 176 -.endm +.endm + +// Pushes a full TransitionBlock on the stack including argument registers and +// floating point argument registers. Used for exception throw helpers where we +// need to capture the complete register state. +// +// Stack layout (from low to high address): +// sp+0: FloatArgumentRegisters (q0-q7, 128 bytes) +// sp+128: TransitionBlock start (176 bytes) +// - CalleeSavedRegisters (fp, lr, x19-x28 - 96 bytes) +// - padding (8 bytes) +// - x8 (8 bytes) +// - ArgumentRegisters (x0-x7, 64 bytes) +// +// On exit, \target contains the TransitionBlock pointer (sp+128). 
+.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -176 + + // Spill callee saved registers + PROLOG_SAVE_REG_PAIR x19, x20, 16 + PROLOG_SAVE_REG_PAIR x21, x22, 32 + PROLOG_SAVE_REG_PAIR x23, x24, 48 + PROLOG_SAVE_REG_PAIR x25, x26, 64 + PROLOG_SAVE_REG_PAIR x27, x28, 80 + + // Allocate space for FloatArgumentRegisters + PROLOG_STACK_ALLOC 128 + + // Save argument registers (x8, x0-x7) at offset 232 from sp (128 + 104) + SAVE_ARGUMENT_REGISTERS sp, 232 + + // Save floating point argument registers (q0-q7) at sp+0 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 0 + + // Set target to TransitionBlock pointer + add \target, sp, #128 +.endm // ------------------------------------------------------------------ // Macro to generate Redirection Stubs diff --git a/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc b/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc index 66ebdd147535e9..964bf5449ebdb8 100644 --- a/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc @@ -404,6 +404,35 @@ C_FUNC(\Name\()_End): EPILOG_STACK_FREE 160 .endm +// Pushes a full TransitionBlock on the stack including argument registers and +// floating point argument registers. Used for exception throw helpers where we +// need to capture the complete register state. +// +// Stack layout (from low to high address): +// sp+0: FloatArgumentRegisters (fa0-fa7, 64 bytes) +// sp+64: TransitionBlock start +// - CalleeSavedRegisters (fp, ra, s0-s8 - 96 bytes) +// - ArgumentRegisters (a0-a7, 64 bytes) +// +// On exit, \target contains the TransitionBlock pointer (sp+64). 
+.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target + // Stack: FloatArgs(64) + CalleeSaved(96) + Args(64) = 224 bytes + PROLOG_STACK_ALLOC 224 + PROLOG_SAVE_REG_PAIR 22, 1, 64, 1 + + // Save callee-saved registers at offset 64 (after FloatArgumentRegisters) + SAVE_CALLEESAVED_REGISTERS $sp, 64 + + // Save argument registers (a0-a7) at offset 160 + SAVE_ARGUMENT_REGISTERS $sp, 160 + + // Save floating-point argument registers (fa0-fa7) at offset 0 + SAVE_FLOAT_ARGUMENT_REGISTERS $sp, 0 + + // Set target to TransitionBlock pointer + addi.d \target, $sp, 64 +.endm + // ------------------------------------------------------------------ // Macro to generate Redirection Stubs // diff --git a/src/coreclr/pal/inc/unixasmmacrosriscv64.inc b/src/coreclr/pal/inc/unixasmmacrosriscv64.inc index ead0d6b550d232..c72374af2b359e 100644 --- a/src/coreclr/pal/inc/unixasmmacrosriscv64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosriscv64.inc @@ -349,6 +349,36 @@ C_FUNC(\Name): EPILOG_STACK_FREE 192 .endm +// Pushes a full TransitionBlock on the stack including argument registers and +// floating point argument registers. Used for exception throw helpers where we +// need to capture the complete register state. +// +// Stack layout (from low to high address): +// sp+0: FloatArgumentRegisters (fa0-fa7, 64 bytes) +// sp+64: TransitionBlock start +// - CalleeSavedRegisters (fp, ra, s1-s11, tp, gp - 120 bytes) +// - padding (8 bytes) +// - ArgumentRegisters (a0-a7, 64 bytes) +// +// On exit, \target contains the TransitionBlock pointer (sp+64). 
+.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target + // Stack: FloatArgs(64) + CalleeSaved(120) + pad(8) + Args(64) = 256 bytes + PROLOG_STACK_ALLOC 256 + PROLOG_SAVE_REG_PAIR fp, ra, 64, 1 + + // Save callee-saved registers at offset 64 (after FloatArgumentRegisters) + SAVE_CALLEESAVED_REGISTERS sp, 64 + + // Save argument registers (a0-a7) at offset 192 + SAVE_ARGUMENT_REGISTERS sp, 192 + + // Save floating-point argument registers (fa0-fa7) at offset 0 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 0 + + // Set target to TransitionBlock pointer + addi \target, sp, 64 +.endm + // ------------------------------------------------------------------ // Macro to generate Redirection Stubs // diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index 5b42128222ed42..3e095766786aa6 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -1208,7 +1208,8 @@ endif ; FEATURE_INTERPRETER ; RCX = Pointer to exception object ;========================================================================== NESTED_ENTRY IL_Throw, _TEXT - PUSH_COOP_PINVOKE_FRAME rdx + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdx + ; RCX already contains exception object ; RDX contains pointer to TransitionBlock call IL_Throw_Impl @@ -1224,7 +1225,8 @@ NESTED_END IL_Throw, _TEXT ; RCX = Pointer to exception object ;========================================================================== NESTED_ENTRY IL_ThrowExact, _TEXT - PUSH_COOP_PINVOKE_FRAME rdx + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdx + ; RCX already contains exception object ; RDX contains pointer to TransitionBlock call IL_ThrowExact_Impl @@ -1237,7 +1239,8 @@ NESTED_END IL_ThrowExact, _TEXT ; implementation written in C. 
;========================================================================== NESTED_ENTRY IL_Rethrow, _TEXT - PUSH_COOP_PINVOKE_FRAME rcx + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rcx + ; RCX contains pointer to TransitionBlock call IL_Rethrow_Impl ; Should never return diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index c6966135fc7ef4..ead844ea8a5d34 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -485,5 +485,37 @@ POP_COOP_PINVOKE_FRAME macro endm +; Pushes a full TransitionBlock on the stack including argument registers and +; floating point argument registers. Used for exception throw helpers where we +; need to capture the complete register state. +; +; Stack layout (from high to low address after prologue): +; Return address +; CalleeSavedRegisters (r15, r14, r13, r12, rbp, rbx, rsi, rdi - 64 bytes) +; Outgoing argument homes (32 bytes) + alignment (8 bytes) <- TransitionBlock +; FloatArgumentRegisters (xmm0-xmm3, 64 bytes) +; sp points here +; +; On exit, target contains the TransitionBlock pointer (after float args area). 
+PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target + + PUSH_CALLEE_SAVED_REGISTERS + + ; Allocate space for: outgoing args (32) + align (8) + float args (64) = 104 bytes + alloc_stack 104 + + ; Save argument registers to shadow space area + ; Shadow space is at offset 64 (after float args) from sp + SAVE_ARGUMENT_REGISTERS 64 + + ; Save float argument registers at offset 0 + SAVE_FLOAT_ARGUMENT_REGISTERS 0 + + END_PROLOGUE + + lea target, [rsp + 64] + + endm + ;; GC type flags GC_ALLOC_FINALIZE equ 1 diff --git a/src/coreclr/vm/amd64/asmhelpers.S b/src/coreclr/vm/amd64/asmhelpers.S index 148af0ac0e013f..38b3c87166d0be 100644 --- a/src/coreclr/vm/amd64/asmhelpers.S +++ b/src/coreclr/vm/amd64/asmhelpers.S @@ -1922,7 +1922,7 @@ NESTED_END CallJittedMethodRetDoubleDouble, _TEXT // rdi = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME rsi + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rsi // rdi already contains exception object // rsi contains pointer to TransitionBlock call C_FUNC(IL_Throw_Impl) @@ -1938,7 +1938,7 @@ NESTED_END IL_Throw, _TEXT // rdi = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME rsi + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rsi // rdi already contains exception object // rsi contains pointer to TransitionBlock call C_FUNC(IL_ThrowExact_Impl) @@ -1951,7 +1951,7 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. 
// ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME rdi + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdi // rdi contains pointer to TransitionBlock call C_FUNC(IL_Rethrow_Impl) // Should never return diff --git a/src/coreclr/vm/arm64/asmhelpers.S b/src/coreclr/vm/arm64/asmhelpers.S index cf323d61b12255..127a5c0a118245 100644 --- a/src/coreclr/vm/arm64/asmhelpers.S +++ b/src/coreclr/vm/arm64/asmhelpers.S @@ -2757,11 +2757,19 @@ NESTED_END CallJittedMethodRet4Vector128, _TEXT // Capture a transition block with register values and call the IL_Throw_Impl // implementation written in C. // +// Stack layout (from low to high address): +// sp+0: FloatArgumentRegisters (q0-q7, 128 bytes) +// sp+128: TransitionBlock start (176 bytes) +// - CalleeSavedRegisters (fp, lr, x19-x28 - 96 bytes) +// - padding (8 bytes) +// - x8 (8 bytes) +// - ArgumentRegisters (x0-x7, 64 bytes) +// // Input state: // x0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME x1 + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 // x0 already contains exception object // x1 contains pointer to TransitionBlock bl C_FUNC(IL_Throw_Impl) @@ -2777,7 +2785,7 @@ NESTED_END IL_Throw, _TEXT // x0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME x1 + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 // x0 already contains exception object // x1 contains pointer to TransitionBlock bl C_FUNC(IL_ThrowExact_Impl) @@ -2790,7 +2798,7 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. 
// ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME x0 + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x0 // x0 contains pointer to TransitionBlock bl C_FUNC(IL_Rethrow_Impl) // Should never return diff --git a/src/coreclr/vm/arm64/asmhelpers.asm b/src/coreclr/vm/arm64/asmhelpers.asm index d8caf2fc23fc32..048dd72ad6755b 100644 --- a/src/coreclr/vm/arm64/asmhelpers.asm +++ b/src/coreclr/vm/arm64/asmhelpers.asm @@ -2988,14 +2988,14 @@ CopyLoop ; Input state: ; x0 = Pointer to exception object ; ------------------------------------------------------------------ - NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME x1 + NESTED_ENTRY IL_Throw + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 ; x0 already contains exception object ; x1 contains pointer to TransitionBlock bl IL_Throw_Impl ; Should never return brk #0 - NESTED_END IL_Throw, _TEXT + NESTED_END IL_Throw ; ------------------------------------------------------------------ ; Capture a transition block with register values and call the IL_ThrowExact_Impl @@ -3004,26 +3004,26 @@ CopyLoop ; Input state: ; x0 = Pointer to exception object ; ------------------------------------------------------------------ - NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME x1 + NESTED_ENTRY IL_ThrowExact + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 ; x0 already contains exception object ; x1 contains pointer to TransitionBlock bl IL_ThrowExact_Impl ; Should never return brk #0 - NESTED_END IL_ThrowExact, _TEXT + NESTED_END IL_ThrowExact ; ------------------------------------------------------------------ ; Capture a transition block with register values and call the IL_Rethrow_Impl ; implementation written in C. 
; ------------------------------------------------------------------ - NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME x0 + NESTED_ENTRY IL_Rethrow + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x0 ; x0 contains pointer to TransitionBlock bl IL_Rethrow_Impl ; Should never return brk #0 - NESTED_END IL_Rethrow, _TEXT + NESTED_END IL_Rethrow ; Must be at very end of file END diff --git a/src/coreclr/vm/arm64/asmmacros.h b/src/coreclr/vm/arm64/asmmacros.h index a11067633ab82e..8e4eb786df8f70 100644 --- a/src/coreclr/vm/arm64/asmmacros.h +++ b/src/coreclr/vm/arm64/asmmacros.h @@ -204,6 +204,44 @@ OFFSETOF__ee_alloc_context EQU OFFSETOF__RuntimeThreadLocals__ee_alloc_context EPILOG_RESTORE_REG_PAIR fp, lr, #176! MEND +; Pushes a full TransitionBlock on the stack including argument registers and +; floating point argument registers. Used for exception throw helpers where we +; need to capture the complete register state. +; +; Stack layout (from low to high address): +; sp+0: FloatArgumentRegisters (q0-q7, 128 bytes) +; sp+128: TransitionBlock start (176 bytes) +; - CalleeSavedRegisters (fp, lr, x19-x28 - 96 bytes) +; - padding (8 bytes) +; - x8 (8 bytes) +; - ArgumentRegisters (x0-x7, 64 bytes) +; +; On exit, $Target contains the TransitionBlock pointer (sp+128). + MACRO + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $Target + + PROLOG_SAVE_REG_PAIR fp, lr, #-176! 
+ + ; Spill callee saved registers + PROLOG_SAVE_REG_PAIR x19, x20, #16 + PROLOG_SAVE_REG_PAIR x21, x22, #32 + PROLOG_SAVE_REG_PAIR x23, x24, #48 + PROLOG_SAVE_REG_PAIR x25, x26, #64 + PROLOG_SAVE_REG_PAIR x27, x28, #80 + + ; Allocate space for FloatArgumentRegisters + PROLOG_STACK_ALLOC 128 + + ; Save argument registers (x8, x0-x7) at offset 232 from sp (128 + 104) + SAVE_ARGUMENT_REGISTERS sp, 232 + + ; Save floating point argument registers (q0-q7) at sp+0 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 0 + + ; Set target to TransitionBlock pointer + add $Target, sp, #128 + MEND + #define GC_ALLOC_FINALIZE 1 ;----------------------------------------------------------------------------- diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index ce3d02b9fab1b0..ba4fc4c5ae9ab5 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10877,7 +10877,7 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p { LIMITED_METHOD_CONTRACT; - m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_FLOATING_POINT; m_Context.SegCs = 0; m_Context.SegSs = 0; m_Context.EFlags = 0; @@ -10885,12 +10885,12 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p #ifdef UNIX_AMD64_ABI // On Unix AMD64, argument registers are saved in the transition block m_Context.Rax = 0; - m_Context.Rdi = pTransitionBlock->m_argumentRegisters.rdi; - m_Context.Rsi = pTransitionBlock->m_argumentRegisters.rsi; - m_Context.Rdx = pTransitionBlock->m_argumentRegisters.rdx; - m_Context.Rcx = pTransitionBlock->m_argumentRegisters.rcx; - m_Context.R8 = pTransitionBlock->m_argumentRegisters.r8; - m_Context.R9 = pTransitionBlock->m_argumentRegisters.r9; + m_Context.Rdi = pTransitionBlock->m_argumentRegisters.RDI; + m_Context.Rsi = pTransitionBlock->m_argumentRegisters.RSI; + m_Context.Rdx = pTransitionBlock->m_argumentRegisters.RDX; + m_Context.Rcx = 
pTransitionBlock->m_argumentRegisters.RCX; + m_Context.R8 = pTransitionBlock->m_argumentRegisters.R8; + m_Context.R9 = pTransitionBlock->m_argumentRegisters.R9; m_ContextPointers.Rdi = &m_Context.Rdi; m_ContextPointers.Rsi = &m_Context.Rsi; @@ -10898,6 +10898,28 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_ContextPointers.Rcx = &m_Context.Rcx; m_ContextPointers.R8 = &m_Context.R8; m_ContextPointers.R9 = &m_Context.R9; + + // Copy floating point argument registers (xmm0-xmm7) + FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); + m_Context.Xmm0 = pFloatArgs->d[0]; + m_Context.Xmm1 = pFloatArgs->d[1]; + m_Context.Xmm2 = pFloatArgs->d[2]; + m_Context.Xmm3 = pFloatArgs->d[3]; + m_Context.Xmm4 = pFloatArgs->d[4]; + m_Context.Xmm5 = pFloatArgs->d[5]; + m_Context.Xmm6 = pFloatArgs->d[6]; + m_Context.Xmm7 = pFloatArgs->d[7]; + // Initialize remaining XMM registers to zero + memset(&m_Context.Xmm8, 0, sizeof(m_Context.Xmm8)); + memset(&m_Context.Xmm9, 0, sizeof(m_Context.Xmm9)); + memset(&m_Context.Xmm10, 0, sizeof(m_Context.Xmm10)); + memset(&m_Context.Xmm11, 0, sizeof(m_Context.Xmm11)); + memset(&m_Context.Xmm12, 0, sizeof(m_Context.Xmm12)); + memset(&m_Context.Xmm13, 0, sizeof(m_Context.Xmm13)); + memset(&m_Context.Xmm14, 0, sizeof(m_Context.Xmm14)); + memset(&m_Context.Xmm15, 0, sizeof(m_Context.Xmm15)); + // Initialize FP control/status + m_Context.MxCsr = 0x1F80; // Default MXCSR value (all exceptions masked) #else // On Windows AMD64, argument registers are not saved in the transition block m_Context.Rax = 0; @@ -10905,6 +10927,10 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.Rdx = 0; m_Context.R8 = 0; m_Context.R9 = 0; + + // Note: On Windows AMD64, floating point argument registers (xmm0-xmm3) are not currently saved + // in the transition block for IL_Throw helpers. 
If needed, the assembly stubs would need to be + // updated to save them. #endif #define CALLEE_SAVED_REGISTER(reg) \ @@ -10924,7 +10950,7 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p { LIMITED_METHOD_CONTRACT; - m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_FLOATING_POINT; // Copy argument registers (R0-R3) m_Context.R0 = pTransitionBlock->m_argumentRegisters.r[0]; @@ -10943,6 +10969,21 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.R11 = pTransitionBlock->m_calleeSavedRegisters.r11; m_Context.Lr = pTransitionBlock->m_calleeSavedRegisters.r14; // r14 is link register + // Copy floating point argument registers (d0-d7 / s0-s15) + FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); + for (int i = 0; i < 8; i++) + { + m_Context.D[i] = pFloatArgs->d[i]; + } + // Initialize remaining D registers (D8-D31) to zero + // D8-D15 are callee-saved, D16-D31 are caller-saved + for (int i = 8; i < 32; i++) + { + m_Context.D[i] = 0; + } + // Initialize FP status/control register + m_Context.Fpscr = 0; + // Set up context pointers for callee-saved registers m_ContextPointers.R4 = &m_Context.R4; m_ContextPointers.R5 = &m_Context.R5; @@ -10965,7 +11006,7 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p { LIMITED_METHOD_CONTRACT; - m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_FLOATING_POINT; // Copy argument registers (X0-X7) for (int i = 0; i < 8; i++) @@ -10992,6 +11033,23 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.Fp = pTransitionBlock->m_calleeSavedRegisters.x29; m_Context.Lr = pTransitionBlock->m_calleeSavedRegisters.x30; + // Copy floating point 
argument registers (V0-V7) + FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); + for (int i = 0; i < 8; i++) + { + m_Context.V[i] = pFloatArgs->q[i]; + } + // Initialize remaining V registers (V8-V31) to zero + // V8-V15 are callee-saved (only lower 64 bits), V16-V31 are caller-saved + for (int i = 8; i < 32; i++) + { + m_Context.V[i].Low = 0; + m_Context.V[i].High = 0; + } + // Initialize FP control/status registers + m_Context.Fpcr = 0; + m_Context.Fpsr = 0; + // Set up context pointers for callee-saved registers m_ContextPointers.X19 = &m_Context.X19; m_ContextPointers.X20 = &m_Context.X20; @@ -11017,7 +11075,7 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p { LIMITED_METHOD_CONTRACT; - m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_FLOATING_POINT; // Copy argument registers (A0-A7) m_Context.A0 = pTransitionBlock->m_argumentRegisters.a[0]; @@ -11042,6 +11100,23 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.S7 = pTransitionBlock->m_calleeSavedRegisters.s7; m_Context.S8 = pTransitionBlock->m_calleeSavedRegisters.s8; + // Copy floating point argument registers (fa0-fa7) + // F[] array in CONTEXT is 4*32 elements for LSX/LASX support. + // Each FP register takes 4 slots (for 256-bit LASX vectors). + // For 64-bit doubles, we only use the first slot of each register. 
+ FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); + for (int i = 0; i < 8; i++) + { + memcpy(&m_Context.F[i * 4], &pFloatArgs->f[i], sizeof(double)); + } + // Initialize remaining F registers to zero + for (int i = 8; i < 32; i++) + { + memset(&m_Context.F[i * 4], 0, sizeof(double) * 4); + } + // Initialize FP control/status register + m_Context.Fcsr = 0; + // Set up context pointers for callee-saved registers m_ContextPointers.S0 = &m_Context.S0; m_ContextPointers.S1 = &m_Context.S1; @@ -11066,7 +11141,7 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p { LIMITED_METHOD_CONTRACT; - m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_FLOATING_POINT; // Copy argument registers (A0-A7) m_Context.A0 = pTransitionBlock->m_argumentRegisters.a[0]; @@ -11095,6 +11170,18 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.Tp = pTransitionBlock->m_calleeSavedRegisters.tp; m_Context.Gp = pTransitionBlock->m_calleeSavedRegisters.gp; + // Initialize all F registers to zero first + memset(m_Context.F, 0, sizeof(m_Context.F)); + // Copy floating point argument registers (fa0-fa7) + FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); + for (int i = 0; i < 8; i++) + { + // F[10-17] are fa0-fa7 in RISC-V register naming + memcpy(&m_Context.F[10 + i], &pFloatArgs->f[i], sizeof(double)); + } + // Initialize FP control/status register + m_Context.Fcsr = 0; + // Set up context pointers for callee-saved registers m_ContextPointers.S1 = &m_Context.S1; m_ContextPointers.S2 = &m_Context.S2; @@ -11126,31 +11213,36 @@ void SoftwareExceptionFrame::Init() { WRAPPER_NO_CONTRACT; - // On x86 we initialize the context state from transition block in - // 
UpdateContextFromTransitionBlock method. + // On x86 and when using TransitionBlock path (indicated by m_ReturnAddress being set), + // we initialize the context state from transition block in UpdateContextFromTransitionBlock method. #ifndef TARGET_X86 + // If m_ReturnAddress is already set, the context was populated from TransitionBlock + // and we should skip VirtualUnwind. + if (m_ReturnAddress == 0) + { #define CALLEE_SAVED_REGISTER(regname) m_ContextPointers.regname = NULL; - ENUM_CALLEE_SAVED_REGISTERS(); + ENUM_CALLEE_SAVED_REGISTERS(); #undef CALLEE_SAVED_REGISTER #ifndef TARGET_UNIX - Thread::VirtualUnwindCallFrame(&m_Context, &m_ContextPointers); + Thread::VirtualUnwindCallFrame(&m_Context, &m_ContextPointers); #else // !TARGET_UNIX - BOOL success = PAL_VirtualUnwind(&m_Context, &m_ContextPointers); - if (!success) - { - _ASSERTE(!"SoftwareExceptionFrame::Init failed"); - EEPOLICY_HANDLE_FATAL_ERROR(COR_E_EXECUTIONENGINE); - } + BOOL success = PAL_VirtualUnwind(&m_Context, &m_ContextPointers); + if (!success) + { + _ASSERTE(!"SoftwareExceptionFrame::Init failed"); + EEPOLICY_HANDLE_FATAL_ERROR(COR_E_EXECUTIONENGINE); + } #endif // !TARGET_UNIX #define CALLEE_SAVED_REGISTER(regname) if (m_ContextPointers.regname == NULL) m_ContextPointers.regname = &m_Context.regname; - ENUM_CALLEE_SAVED_REGISTERS(); + ENUM_CALLEE_SAVED_REGISTERS(); #undef CALLEE_SAVED_REGISTER - _ASSERTE(ExecutionManager::IsManagedCode(::GetIP(&m_Context))); + m_ReturnAddress = ::GetIP(&m_Context); + } - m_ReturnAddress = ::GetIP(&m_Context); + _ASSERTE(ExecutionManager::IsManagedCode(::GetIP(&m_Context))); #endif // !TARGET_X86 } diff --git a/src/coreclr/vm/frames.h b/src/coreclr/vm/frames.h index cac6e19c63e531..a656717b2d5158 100644 --- a/src/coreclr/vm/frames.h +++ b/src/coreclr/vm/frames.h @@ -985,7 +985,7 @@ class SoftwareExceptionFrame : public Frame public: #ifndef DACCESS_COMPILE - SoftwareExceptionFrame() : Frame(FrameIdentifier::SoftwareExceptionFrame) { + 
SoftwareExceptionFrame() : Frame(FrameIdentifier::SoftwareExceptionFrame), m_ReturnAddress(0) { LIMITED_METHOD_CONTRACT; } diff --git a/src/coreclr/vm/i386/asmhelpers.asm b/src/coreclr/vm/i386/asmhelpers.asm index f0b53455505d95..632f7dccf4b003 100644 --- a/src/coreclr/vm/i386/asmhelpers.asm +++ b/src/coreclr/vm/i386/asmhelpers.asm @@ -85,6 +85,9 @@ endif EXTERN @IL_Throw_x86@8:PROC EXTERN @IL_ThrowExact_x86@8:PROC EXTERN @IL_Rethrow_x86@4:PROC +EXTERN @IL_Throw_Impl@8:PROC +EXTERN @IL_ThrowExact_Impl@8:PROC +EXTERN @IL_Rethrow_Impl@4:PROC UNREFERENCED macro arg local unref diff --git a/src/coreclr/vm/loongarch64/asmhelpers.S b/src/coreclr/vm/loongarch64/asmhelpers.S index 292846d7a2b6e7..9f424c39dd30f5 100644 --- a/src/coreclr/vm/loongarch64/asmhelpers.S +++ b/src/coreclr/vm/loongarch64/asmhelpers.S @@ -1029,16 +1029,9 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT // $a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PROLOG_SAVE_REG_PAIR_INDEXED 22, 1, 0x80 - // Spill callee saved registers - PROLOG_SAVE_REG_PAIR 23, 24, 16 - PROLOG_SAVE_REG_PAIR 25, 26, 32 - PROLOG_SAVE_REG_PAIR 27, 28, 48 - PROLOG_SAVE_REG_PAIR 29, 30, 64 - PROLOG_SAVE_REG 31, 80 - + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $a1 // $a0 already contains exception object - ori $a1, $sp, 0 // $a1 = TransitionBlock* + // $a1 contains pointer to TransitionBlock bl C_FUNC(IL_Throw_Impl) // Should never return break 0 @@ -1052,16 +1045,9 @@ NESTED_END IL_Throw, _TEXT // $a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PROLOG_SAVE_REG_PAIR_INDEXED 22, 1, 0x80 - // Spill callee saved registers - PROLOG_SAVE_REG_PAIR 23, 24, 16 - PROLOG_SAVE_REG_PAIR 25, 26, 32 - PROLOG_SAVE_REG_PAIR 27, 28, 48 - PROLOG_SAVE_REG_PAIR 29, 30, 64 - PROLOG_SAVE_REG 31, 80 - + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $a1 // $a0 already 
contains exception object - ori $a1, $sp, 0 // $a1 = TransitionBlock* + // $a1 contains pointer to TransitionBlock bl C_FUNC(IL_ThrowExact_Impl) // Should never return break 0 @@ -1072,15 +1058,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. // ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PROLOG_SAVE_REG_PAIR_INDEXED 22, 1, 0x80 - // Spill callee saved registers - PROLOG_SAVE_REG_PAIR 23, 24, 16 - PROLOG_SAVE_REG_PAIR 25, 26, 32 - PROLOG_SAVE_REG_PAIR 27, 28, 48 - PROLOG_SAVE_REG_PAIR 29, 30, 64 - PROLOG_SAVE_REG 31, 80 - - ori $a0, $sp, 0 // $a0 = TransitionBlock* + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $a0 + // $a0 contains pointer to TransitionBlock bl C_FUNC(IL_Rethrow_Impl) // Should never return break 0 diff --git a/src/coreclr/vm/riscv64/asmhelpers.S b/src/coreclr/vm/riscv64/asmhelpers.S index 57ed0083c9aad7..0d26ef514d4f58 100644 --- a/src/coreclr/vm/riscv64/asmhelpers.S +++ b/src/coreclr/vm/riscv64/asmhelpers.S @@ -886,18 +886,9 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT // a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PROLOG_SAVE_REG_PAIR_INDEXED fp, ra, 0xa0 - // Spill callee saved registers - PROLOG_SAVE_REG_PAIR s1, s2, 16 - PROLOG_SAVE_REG_PAIR s3, s4, 32 - PROLOG_SAVE_REG_PAIR s5, s6, 48 - PROLOG_SAVE_REG_PAIR s7, s8, 64 - PROLOG_SAVE_REG_PAIR s9, s10, 80 - PROLOG_SAVE_REG_PAIR s11, gp, 96 - PROLOG_SAVE_REG tp, 112 - + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS a1 // a0 already contains exception object - addi a1, sp, 0 // a1 = TransitionBlock* + // a1 contains pointer to TransitionBlock call C_FUNC(IL_Throw_Impl) // Should never return ebreak @@ -911,18 +902,9 @@ NESTED_END IL_Throw, _TEXT // a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PROLOG_SAVE_REG_PAIR_INDEXED fp, 
ra, 0xa0 - // Spill callee saved registers - PROLOG_SAVE_REG_PAIR s1, s2, 16 - PROLOG_SAVE_REG_PAIR s3, s4, 32 - PROLOG_SAVE_REG_PAIR s5, s6, 48 - PROLOG_SAVE_REG_PAIR s7, s8, 64 - PROLOG_SAVE_REG_PAIR s9, s10, 80 - PROLOG_SAVE_REG_PAIR s11, gp, 96 - PROLOG_SAVE_REG tp, 112 - + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS a1 // a0 already contains exception object - addi a1, sp, 0 // a1 = TransitionBlock* + // a1 contains pointer to TransitionBlock call C_FUNC(IL_ThrowExact_Impl) // Should never return ebreak @@ -933,17 +915,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. // ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PROLOG_SAVE_REG_PAIR_INDEXED fp, ra, 0xa0 - // Spill callee saved registers - PROLOG_SAVE_REG_PAIR s1, s2, 16 - PROLOG_SAVE_REG_PAIR s3, s4, 32 - PROLOG_SAVE_REG_PAIR s5, s6, 48 - PROLOG_SAVE_REG_PAIR s7, s8, 64 - PROLOG_SAVE_REG_PAIR s9, s10, 80 - PROLOG_SAVE_REG_PAIR s11, gp, 96 - PROLOG_SAVE_REG tp, 112 - - addi a0, sp, 0 // a0 = TransitionBlock* + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS a0 + // a0 contains pointer to TransitionBlock call C_FUNC(IL_Rethrow_Impl) // Should never return ebreak From cbb923848f7f078caedff0bfde7d5a73dd1567c2 Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Fri, 9 Jan 2026 01:11:55 +0200 Subject: [PATCH 03/30] . 
--- src/coreclr/vm/amd64/AsmHelpers.asm | 3 +++ src/coreclr/vm/arm64/asmhelpers.asm | 3 +++ src/coreclr/vm/i386/asmhelpers.S | 6 +++--- src/coreclr/vm/i386/asmhelpers.asm | 3 --- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index 3e095766786aa6..4725a0cf8eefbb 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -11,6 +11,9 @@ extern ProfileLeave:proc extern ProfileTailcall:proc extern OnHijackWorker:proc extern JIT_RareDisableHelperWorker:proc +extern IL_Throw_Impl:proc +extern IL_ThrowExact_Impl:proc +extern IL_Rethrow_Impl:proc ifdef FEATURE_INTERPRETER extern ExecuteInterpretedMethod:proc extern GetInterpThreadContextWithPossiblyMissingThreadOrCallStub:proc diff --git a/src/coreclr/vm/arm64/asmhelpers.asm b/src/coreclr/vm/arm64/asmhelpers.asm index 048dd72ad6755b..7f510cd8ed62a5 100644 --- a/src/coreclr/vm/arm64/asmhelpers.asm +++ b/src/coreclr/vm/arm64/asmhelpers.asm @@ -23,6 +23,9 @@ #endif IMPORT HijackHandler IMPORT ThrowControlForThread + IMPORT IL_Throw_Impl + IMPORT IL_ThrowExact_Impl + IMPORT IL_Rethrow_Impl #ifdef FEATURE_INTERPRETER IMPORT GetInterpThreadContextWithPossiblyMissingThreadOrCallStub IMPORT ExecuteInterpretedMethod diff --git a/src/coreclr/vm/i386/asmhelpers.S b/src/coreclr/vm/i386/asmhelpers.S index 62578e542c87c2..29b76b1f220ffb 100644 --- a/src/coreclr/vm/i386/asmhelpers.S +++ b/src/coreclr/vm/i386/asmhelpers.S @@ -1032,7 +1032,7 @@ LEAF_ENTRY IL_Throw, _TEXT CHECK_STACK_ALIGNMENT - call C_FUNC(IL_Throw_x86) + call C_FUNC(IL_Throw_Impl) add esp, STACK_ALIGN_PADDING #undef STACK_ALIGN_PADDING @@ -1058,7 +1058,7 @@ LEAF_ENTRY IL_ThrowExact, _TEXT CHECK_STACK_ALIGNMENT - call C_FUNC(IL_ThrowExact_x86) + call C_FUNC(IL_ThrowExact_Impl) add esp, STACK_ALIGN_PADDING #undef STACK_ALIGN_PADDING @@ -1081,7 +1081,7 @@ LEAF_ENTRY IL_Rethrow, _TEXT CHECK_STACK_ALIGNMENT - call C_FUNC(IL_Rethrow_x86) + call 
C_FUNC(IL_Rethrow_Impl) add esp, STACK_ALIGN_PADDING #undef STACK_ALIGN_PADDING diff --git a/src/coreclr/vm/i386/asmhelpers.asm b/src/coreclr/vm/i386/asmhelpers.asm index 632f7dccf4b003..71e6edfeb3a443 100644 --- a/src/coreclr/vm/i386/asmhelpers.asm +++ b/src/coreclr/vm/i386/asmhelpers.asm @@ -82,9 +82,6 @@ EXTERN g_chained_lookup_miss_counter:DWORD EXTERN g_dispatch_cache_chain_success_counter:DWORD endif -EXTERN @IL_Throw_x86@8:PROC -EXTERN @IL_ThrowExact_x86@8:PROC -EXTERN @IL_Rethrow_x86@4:PROC EXTERN @IL_Throw_Impl@8:PROC EXTERN @IL_ThrowExact_Impl@8:PROC EXTERN @IL_Rethrow_Impl@4:PROC From b4a3ab27717cafb5df593e6c5be34d4fa2e58195 Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Fri, 9 Jan 2026 02:03:36 +0200 Subject: [PATCH 04/30] wasm --- src/coreclr/vm/excep.cpp | 12 ++++++++++++ src/coreclr/vm/jithelpers.cpp | 25 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index ba4fc4c5ae9ab5..47596394c81d09 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -11204,6 +11204,18 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_ReturnAddress = pTransitionBlock->m_ReturnAddress; } +#elif defined(TARGET_WASM) + +void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *pTransitionBlock) +{ + LIMITED_METHOD_CONTRACT; + + // WASM cannot capture execution context, so just zero everything + memset(&m_Context, 0, sizeof(m_Context)); + memset(&m_ContextPointers, 0, sizeof(m_ContextPointers)); + m_ReturnAddress = 1; // Non-zero to skip VirtualUnwind in Init() +} + #endif // TARGET_X86 // diff --git a/src/coreclr/vm/jithelpers.cpp b/src/coreclr/vm/jithelpers.cpp index 471b5a13517b1e..e65b9e7cabbfa5 100644 --- a/src/coreclr/vm/jithelpers.cpp +++ b/src/coreclr/vm/jithelpers.cpp @@ -842,6 +842,31 @@ EXTERN_C HCIMPL2(void, IL_ThrowExact_Impl, Object* obj, TransitionBlock* transi } HCIMPLEND +#ifdef 
TARGET_WASM +// WASM doesn't have assembly stubs, so provide thin wrapper entry points +// that call the _Impl functions with NULL (which zeros the context) +HCIMPL1(void, IL_Throw, Object* obj) +{ + FCALL_CONTRACT; + IL_Throw_Impl(obj, NULL); +} +HCIMPLEND + +HCIMPL0(void, IL_Rethrow) +{ + FCALL_CONTRACT; + IL_Rethrow_Impl(NULL); +} +HCIMPLEND + +HCIMPL1(void, IL_ThrowExact, Object* obj) +{ + FCALL_CONTRACT; + IL_ThrowExact_Impl(obj, NULL); +} +HCIMPLEND +#endif // TARGET_WASM + #ifndef STATUS_STACK_BUFFER_OVERRUN // Not defined yet in CESDK includes # define STATUS_STACK_BUFFER_OVERRUN ((NTSTATUS)0xC0000409L) #endif From 7df98b1af14ca28b8a0abfabb51049963af844eb Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Fri, 9 Jan 2026 11:02:17 +0200 Subject: [PATCH 05/30] arm et al. --- src/coreclr/pal/inc/unixasmmacrosamd64.inc | 6 ++-- src/coreclr/pal/inc/unixasmmacrosarm.inc | 20 +++++++++++++ src/coreclr/vm/amd64/AsmMacros.inc | 11 +++---- src/coreclr/vm/arm/asmhelpers.S | 35 +++++++++++----------- 4 files changed, 47 insertions(+), 25 deletions(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index b443b3eefb8f63..d0faac1e8a5c71 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -447,9 +447,11 @@ C_FUNC(\Name\()_End): PUSH_CALLEE_SAVED_REGISTERS PUSH_ARGUMENT_REGISTERS - // Allocate space for float argument registers (128 bytes) + alignment (8 bytes) + // Allocate space for alignment (8 bytes) + float argument registers (128 bytes) alloc_stack 136 - SAVE_FLOAT_ARGUMENT_REGISTERS 0 + // Save float argument registers at offset 8 (after alignment padding) + // This ensures floats are at TransitionBlock - 128 (matching GetOffsetOfFloatArgumentRegisters) + SAVE_FLOAT_ARGUMENT_REGISTERS 8 END_PROLOGUE diff --git a/src/coreclr/pal/inc/unixasmmacrosarm.inc b/src/coreclr/pal/inc/unixasmmacrosarm.inc index 
54a6f7d4dc3b19..e21e350a8b1be4 100644 --- a/src/coreclr/pal/inc/unixasmmacrosarm.inc +++ b/src/coreclr/pal/inc/unixasmmacrosarm.inc @@ -288,6 +288,26 @@ C_FUNC(\Name): add \target, sp, 4 .endm +// Pushes a full TransitionBlock on the stack including float argument registers. +// On exit, \target contains the TransitionBlock pointer. +.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target + // Push argument registers (r0-r3) - these will be at highest address in TransitionBlock + PUSH_ARGUMENT_REGISTERS + PUSH_CALLEE_SAVED_REGISTERS + PROLOG_STACK_SAVE_OFFSET r7, #12 + // let r7 point the saved r7 in the stack (clang FP style) + // Allocate space for float registers (64 bytes) + padding (4 bytes) for 8-byte alignment + // GetNegSpaceSize() = 64 + 4 = 68, so floats must be at TransitionBlock - 68 + alloc_stack 68 + // Save floating point argument registers (d0-d7) at sp+0 + // This puts them at TransitionBlock - 68, matching GetOffsetOfFloatArgumentRegisters() + vstm sp, {d0-d7} + CHECK_STACK_ALIGNMENT + END_PROLOGUE + // TransitionBlock is at sp + 68 + add \target, sp, #68 +.endm + .macro POP_COOP_PINVOKE_FRAME free_stack 4 POP_CALLEE_SAVED_REGISTERS diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index ead844ea8a5d34..9f4da4be6e0f7e 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -490,13 +490,13 @@ POP_COOP_PINVOKE_FRAME macro ; need to capture the complete register state. 
; ; Stack layout (from high to low address after prologue): -; Return address -; CalleeSavedRegisters (r15, r14, r13, r12, rbp, rbx, rsi, rdi - 64 bytes) -; Outgoing argument homes (32 bytes) + alignment (8 bytes) <- TransitionBlock +; Return address (m_ReturnAddress) +; CalleeSavedRegisters (r15, r14, r13, r12, rbp, rbx, rsi, rdi - 64 bytes) <- TransitionBlock starts here +; Outgoing argument homes (32 bytes) + alignment (8 bytes) ; FloatArgumentRegisters (xmm0-xmm3, 64 bytes) ; sp points here ; -; On exit, target contains the TransitionBlock pointer (after float args area). +; On exit, target contains the TransitionBlock pointer (CalleeSavedRegisters). PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target PUSH_CALLEE_SAVED_REGISTERS @@ -513,7 +513,8 @@ PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target END_PROLOGUE - lea target, [rsp + 64] + ; TransitionBlock pointer points to CalleeSavedRegisters at rsp + 104 + lea target, [rsp + 104] endm diff --git a/src/coreclr/vm/arm/asmhelpers.S b/src/coreclr/vm/arm/asmhelpers.S index 50281235a3b74b..5065a6d15c4cb4 100644 --- a/src/coreclr/vm/arm/asmhelpers.S +++ b/src/coreclr/vm/arm/asmhelpers.S @@ -906,12 +906,12 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT // r0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PROLOG_PUSH "{r0-r3,r11,lr}" - PROLOG_VPUSH "{d0-d7}" - mov r1, sp // r1 = TransitionBlock* - bl C_FUNC(IL_Throw_Impl) - // Should never return - bkpt #0 + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS r1 + // r0 already contains exception object + // r1 contains pointer to TransitionBlock + bl C_FUNC(IL_Throw_Impl) + // Should never return + EMIT_BREAKPOINT NESTED_END IL_Throw, _TEXT // ------------------------------------------------------------------ @@ -922,12 +922,12 @@ NESTED_END IL_Throw, _TEXT // r0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, 
NoHandler - PROLOG_PUSH "{r0-r3,r11,lr}" - PROLOG_VPUSH "{d0-d7}" - mov r1, sp // r1 = TransitionBlock* - bl C_FUNC(IL_ThrowExact_Impl) - // Should never return - bkpt #0 + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS r1 + // r0 already contains exception object + // r1 contains pointer to TransitionBlock + bl C_FUNC(IL_ThrowExact_Impl) + // Should never return + EMIT_BREAKPOINT NESTED_END IL_ThrowExact, _TEXT // ------------------------------------------------------------------ @@ -935,11 +935,10 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. // ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PROLOG_PUSH "{r0-r3,r11,lr}" - PROLOG_VPUSH "{d0-d7}" - mov r0, sp // r0 = TransitionBlock* - bl C_FUNC(IL_Rethrow_Impl) - // Should never return - bkpt #0 + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS r0 + // r0 contains pointer to TransitionBlock + bl C_FUNC(IL_Rethrow_Impl) + // Should never return + EMIT_BREAKPOINT NESTED_END IL_Rethrow, _TEXT From fef4204be77b4cb8b183ff9e622b5582ba76f10c Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Fri, 9 Jan 2026 15:22:50 +0200 Subject: [PATCH 06/30] Account for FP callee-saved regs --- src/coreclr/pal/inc/unixasmmacrosamd64.inc | 15 +-- src/coreclr/pal/inc/unixasmmacrosarm.inc | 22 +++-- src/coreclr/pal/inc/unixasmmacrosarm64.inc | 33 ++++--- .../pal/inc/unixasmmacrosloongarch64.inc | 32 ++++--- src/coreclr/pal/inc/unixasmmacrosriscv64.inc | 43 ++++++--- src/coreclr/vm/amd64/AsmMacros.inc | 38 +++++--- src/coreclr/vm/arm64/asmmacros.h | 33 ++++--- src/coreclr/vm/excep.cpp | 95 ++++++++++++++++--- 8 files changed, 228 insertions(+), 83 deletions(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index d0faac1e8a5c71..8c6f98c0d2737b 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -436,10 +436,13 @@ C_FUNC(\Name\()_End): // 
Return address // CalleeSavedRegisters (rbp, rbx, r15, r14, r13, r12 - 48 bytes) // ArgumentRegisters (r9, r8, rcx, rdx, rsi, rdi - 48 bytes) <- TransitionBlock -// Padding (8 bytes for 16-byte alignment) +// Padding (16 bytes for 16-byte alignment) // FloatArgumentRegisters (xmm0-xmm7, 128 bytes) // sp points here // +// Stack alignment: After call (8) + callee-saved (48) + arg regs (48) + alloc (144) + call (8) = 256 bytes +// 256 % 16 = 0, so stack is properly aligned for the call to IL_Throw_Impl. +// // On exit, \target contains the TransitionBlock pointer (after float args area). .macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target set_cfa_register rsp, 8 @@ -447,15 +450,15 @@ C_FUNC(\Name\()_End): PUSH_CALLEE_SAVED_REGISTERS PUSH_ARGUMENT_REGISTERS - // Allocate space for alignment (8 bytes) + float argument registers (128 bytes) - alloc_stack 136 - // Save float argument registers at offset 8 (after alignment padding) + // Allocate space for alignment (16 bytes) + float argument registers (128 bytes) = 144 bytes + alloc_stack 144 + // Save float argument registers at offset 16 (after alignment padding) // This ensures floats are at TransitionBlock - 128 (matching GetOffsetOfFloatArgumentRegisters) - SAVE_FLOAT_ARGUMENT_REGISTERS 8 + SAVE_FLOAT_ARGUMENT_REGISTERS 16 END_PROLOGUE - lea \target, [rsp + 136] + lea \target, [rsp + 144] .endm .macro INLINE_GETTHREAD diff --git a/src/coreclr/pal/inc/unixasmmacrosarm.inc b/src/coreclr/pal/inc/unixasmmacrosarm.inc index e21e350a8b1be4..6988f71a5c96e3 100644 --- a/src/coreclr/pal/inc/unixasmmacrosarm.inc +++ b/src/coreclr/pal/inc/unixasmmacrosarm.inc @@ -290,22 +290,28 @@ C_FUNC(\Name): // Pushes a full TransitionBlock on the stack including float argument registers. // On exit, \target contains the TransitionBlock pointer. +// +// Stack alignment: Arguments (16) + callee-saved (32) + alloc (72) = 120 bytes +// 184 % 8 = 0, so stack is properly 8-byte aligned for ARM32. 
.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target // Push argument registers (r0-r3) - these will be at highest address in TransitionBlock PUSH_ARGUMENT_REGISTERS PUSH_CALLEE_SAVED_REGISTERS PROLOG_STACK_SAVE_OFFSET r7, #12 // let r7 point the saved r7 in the stack (clang FP style) - // Allocate space for float registers (64 bytes) + padding (4 bytes) for 8-byte alignment - // GetNegSpaceSize() = 64 + 4 = 68, so floats must be at TransitionBlock - 68 - alloc_stack 68 - // Save floating point argument registers (d0-d7) at sp+0 - // This puts them at TransitionBlock - 68, matching GetOffsetOfFloatArgumentRegisters() - vstm sp, {d0-d7} + // Allocate space for float argument registers (64 bytes) + padding (8 bytes) + FP callee-saved (64 bytes) = 136 bytes + // Stack layout: [d8-d15 (64)] [padding (8)] [d0-d7 (64)] [TransitionBlock] + alloc_stack 136 + // Save floating point argument registers (d0-d7) at sp+72 (after FP callee-saved and padding) + add r12, sp, #72 + vstm r12, {d0-d7} + // Save FP callee-saved registers (d8-d15) at sp+0 + add r12, sp, #0 + vstm r12, {d8-d15} CHECK_STACK_ALIGNMENT END_PROLOGUE - // TransitionBlock is at sp + 68 - add \target, sp, #68 + // TransitionBlock is at sp + 136 + add \target, sp, #136 .endm .macro POP_COOP_PINVOKE_FRAME diff --git a/src/coreclr/pal/inc/unixasmmacrosarm64.inc b/src/coreclr/pal/inc/unixasmmacrosarm64.inc index 6f17f6f5f1e1c9..2d27459372b561 100644 --- a/src/coreclr/pal/inc/unixasmmacrosarm64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosarm64.inc @@ -423,17 +423,18 @@ C_FUNC(\Name\()_End): // Pushes a full TransitionBlock on the stack including argument registers and // floating point argument registers. Used for exception throw helpers where we -// need to capture the complete register state. +// need to capture the complete register state including FP callee-saved registers. 
// // Stack layout (from low to high address): -// sp+0: FloatArgumentRegisters (q0-q7, 128 bytes) -// sp+128: TransitionBlock start (176 bytes) +// sp+0: FP callee-saved registers (d8-d15, 64 bytes) +// sp+64: FloatArgumentRegisters (q0-q7, 128 bytes) +// sp+192: TransitionBlock start (176 bytes) // - CalleeSavedRegisters (fp, lr, x19-x28 - 96 bytes) // - padding (8 bytes) // - x8 (8 bytes) // - ArgumentRegisters (x0-x7, 64 bytes) // -// On exit, \target contains the TransitionBlock pointer (sp+128). +// On exit, \target contains the TransitionBlock pointer (sp+192). .macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -176 @@ -444,17 +445,27 @@ C_FUNC(\Name\()_End): PROLOG_SAVE_REG_PAIR x25, x26, 64 PROLOG_SAVE_REG_PAIR x27, x28, 80 - // Allocate space for FloatArgumentRegisters - PROLOG_STACK_ALLOC 128 + // Allocate space for FloatArgumentRegisters (128) + FP callee-saved (64) = 192 bytes + PROLOG_STACK_ALLOC 192 - // Save argument registers (x8, x0-x7) at offset 232 from sp (128 + 104) - SAVE_ARGUMENT_REGISTERS sp, 232 + // Save argument registers (x8, x0-x7) at offset 296 from sp (192 + 104) + SAVE_ARGUMENT_REGISTERS sp, 296 - // Save floating point argument registers (q0-q7) at sp+0 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 0 + // Save floating point argument registers (q0-q7) at sp+64 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 64 + + // Save FP callee-saved registers (d8-d15) at sp+0 + str d8, [sp, #0] + str d9, [sp, #8] + str d10, [sp, #16] + str d11, [sp, #24] + str d12, [sp, #32] + str d13, [sp, #40] + str d14, [sp, #48] + str d15, [sp, #56] // Set target to TransitionBlock pointer - add \target, sp, #128 + add \target, sp, #192 .endm // ------------------------------------------------------------------ diff --git a/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc b/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc index 964bf5449ebdb8..92d701598f933e 100644 --- a/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc +++ 
b/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc @@ -414,23 +414,33 @@ C_FUNC(\Name\()_End): // - CalleeSavedRegisters (fp, ra, s0-s8 - 96 bytes) // - ArgumentRegisters (a0-a7, 64 bytes) // -// On exit, \target contains the TransitionBlock pointer (sp+64). +// On exit, \target contains the TransitionBlock pointer (sp+128). .macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target - // Stack: FloatArgs(64) + CalleeSaved(96) + Args(64) = 224 bytes - PROLOG_STACK_ALLOC 224 - PROLOG_SAVE_REG_PAIR 22, 1, 64, 1 + // Stack: FPCalleeSaved(64) + FloatArgs(64) + CalleeSaved(96) + Args(64) = 288 bytes + PROLOG_STACK_ALLOC 288 + PROLOG_SAVE_REG_PAIR 22, 1, 128, 1 - // Save callee-saved registers at offset 64 (after FloatArgumentRegisters) - SAVE_CALLEESAVED_REGISTERS $sp, 64 + // Save callee-saved registers at offset 128 (after FP callee-saved and FloatArgumentRegisters) + SAVE_CALLEESAVED_REGISTERS $sp, 128 - // Save argument registers (a0-a7) at offset 160 - SAVE_ARGUMENT_REGISTERS $sp, 160 + // Save argument registers (a0-a7) at offset 224 + SAVE_ARGUMENT_REGISTERS $sp, 224 - // Save floating-point argument registers (fa0-fa7) at offset 0 - SAVE_FLOAT_ARGUMENT_REGISTERS $sp, 0 + // Save floating-point argument registers (fa0-fa7) at offset 64 + SAVE_FLOAT_ARGUMENT_REGISTERS $sp, 64 + + // Save FP callee-saved registers (f24-f31) at offset 0 + fst.d $f24, $sp, 0 + fst.d $f25, $sp, 8 + fst.d $f26, $sp, 16 + fst.d $f27, $sp, 24 + fst.d $f28, $sp, 32 + fst.d $f29, $sp, 40 + fst.d $f30, $sp, 48 + fst.d $f31, $sp, 56 // Set target to TransitionBlock pointer - addi.d \target, $sp, 64 + addi.d \target, $sp, 128 .endm // ------------------------------------------------------------------ diff --git a/src/coreclr/pal/inc/unixasmmacrosriscv64.inc b/src/coreclr/pal/inc/unixasmmacrosriscv64.inc index c72374af2b359e..d244756c304eb9 100644 --- a/src/coreclr/pal/inc/unixasmmacrosriscv64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosriscv64.inc @@ -360,23 +360,38 @@ C_FUNC(\Name): // - padding (8 
bytes) // - ArgumentRegisters (a0-a7, 64 bytes) // -// On exit, \target contains the TransitionBlock pointer (sp+64). +// On exit, \target contains the TransitionBlock pointer (sp+160). .macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target - // Stack: FloatArgs(64) + CalleeSaved(120) + pad(8) + Args(64) = 256 bytes - PROLOG_STACK_ALLOC 256 - PROLOG_SAVE_REG_PAIR fp, ra, 64, 1 - - // Save callee-saved registers at offset 64 (after FloatArgumentRegisters) - SAVE_CALLEESAVED_REGISTERS sp, 64 - - // Save argument registers (a0-a7) at offset 192 - SAVE_ARGUMENT_REGISTERS sp, 192 - - // Save floating-point argument registers (fa0-fa7) at offset 0 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 0 + // Stack: FPCalleeSaved(96) + FloatArgs(64) + CalleeSaved(120) + pad(8) + Args(64) = 352 bytes + PROLOG_STACK_ALLOC 352 + PROLOG_SAVE_REG_PAIR fp, ra, 160, 1 + + // Save callee-saved registers at offset 160 (after FP callee-saved and FloatArgumentRegisters) + SAVE_CALLEESAVED_REGISTERS sp, 160 + + // Save argument registers (a0-a7) at offset 288 + SAVE_ARGUMENT_REGISTERS sp, 288 + + // Save floating-point argument registers (fa0-fa7) at offset 96 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 96 + + // Save FP callee-saved registers (fs0-fs11 = f8,f9,f18-f27) at offset 0 + // RISC-V FP callee-saved: fs0=f8, fs1=f9, fs2-fs11=f18-f27 + fsd fs0, 0(sp) // f8 + fsd fs1, 8(sp) // f9 + fsd fs2, 16(sp) // f18 + fsd fs3, 24(sp) // f19 + fsd fs4, 32(sp) // f20 + fsd fs5, 40(sp) // f21 + fsd fs6, 48(sp) // f22 + fsd fs7, 56(sp) // f23 + fsd fs8, 64(sp) // f24 + fsd fs9, 72(sp) // f25 + fsd fs10, 80(sp) // f26 + fsd fs11, 88(sp) // f27 // Set target to TransitionBlock pointer - addi \target, sp, 64 + addi \target, sp, 160 .endm // ------------------------------------------------------------------ diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index 9f4da4be6e0f7e..1e65e25f65d38c 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -487,34 
+487,50 @@ POP_COOP_PINVOKE_FRAME macro ; Pushes a full TransitionBlock on the stack including argument registers and ; floating point argument registers. Used for exception throw helpers where we -; need to capture the complete register state. +; need to capture the complete register state including FP callee-saved registers. ; ; Stack layout (from high to low address after prologue): ; Return address (m_ReturnAddress) ; CalleeSavedRegisters (r15, r14, r13, r12, rbp, rbx, rsi, rdi - 64 bytes) <- TransitionBlock starts here -; Outgoing argument homes (32 bytes) + alignment (8 bytes) +; Outgoing argument homes (32 bytes) ; FloatArgumentRegisters (xmm0-xmm3, 64 bytes) +; FP Callee-saved registers (xmm6-xmm15, 160 bytes) ; sp points here ; +; Stack alignment: After call to IL_Throw (8 bytes) + callee-saved (64 bytes) + alloc (256 bytes) + call (8 bytes) = 336 bytes +; 336 % 16 = 0, so stack is properly aligned for the call to IL_Throw_Impl. +; ; On exit, target contains the TransitionBlock pointer (CalleeSavedRegisters). 
PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target PUSH_CALLEE_SAVED_REGISTERS - ; Allocate space for: outgoing args (32) + align (8) + float args (64) = 104 bytes - alloc_stack 104 + ; Allocate space for: outgoing args (32) + float args (64) + FP callee-saved (160) = 256 bytes + alloc_stack 256 ; Save argument registers to shadow space area - ; Shadow space is at offset 64 (after float args) from sp - SAVE_ARGUMENT_REGISTERS 64 - - ; Save float argument registers at offset 0 - SAVE_FLOAT_ARGUMENT_REGISTERS 0 + ; Shadow space is at offset 224 (after float args and FP callee-saved) from sp + SAVE_ARGUMENT_REGISTERS 224 + + ; Save float argument registers at offset 160 (after FP callee-saved) + SAVE_FLOAT_ARGUMENT_REGISTERS 160 + + ; Save FP callee-saved registers (xmm6-xmm15) at offset 0 + save_xmm128_postrsp xmm6, 0 + save_xmm128_postrsp xmm7, 10h + save_xmm128_postrsp xmm8, 20h + save_xmm128_postrsp xmm9, 30h + save_xmm128_postrsp xmm10, 40h + save_xmm128_postrsp xmm11, 50h + save_xmm128_postrsp xmm12, 60h + save_xmm128_postrsp xmm13, 70h + save_xmm128_postrsp xmm14, 80h + save_xmm128_postrsp xmm15, 90h END_PROLOGUE - ; TransitionBlock pointer points to CalleeSavedRegisters at rsp + 104 - lea target, [rsp + 104] + ; TransitionBlock pointer points to CalleeSavedRegisters at rsp + 256 + lea target, [rsp + 256] endm diff --git a/src/coreclr/vm/arm64/asmmacros.h b/src/coreclr/vm/arm64/asmmacros.h index 8e4eb786df8f70..93778d775f87c9 100644 --- a/src/coreclr/vm/arm64/asmmacros.h +++ b/src/coreclr/vm/arm64/asmmacros.h @@ -206,17 +206,18 @@ OFFSETOF__ee_alloc_context EQU OFFSETOF__RuntimeThreadLocals__ee_alloc_context ; Pushes a full TransitionBlock on the stack including argument registers and ; floating point argument registers. Used for exception throw helpers where we -; need to capture the complete register state. +; need to capture the complete register state including FP callee-saved registers. 
; ; Stack layout (from low to high address): -; sp+0: FloatArgumentRegisters (q0-q7, 128 bytes) -; sp+128: TransitionBlock start (176 bytes) +; sp+0: FP callee-saved registers (d8-d15, 64 bytes) +; sp+64: FloatArgumentRegisters (q0-q7, 128 bytes) +; sp+192: TransitionBlock start (176 bytes) ; - CalleeSavedRegisters (fp, lr, x19-x28 - 96 bytes) ; - padding (8 bytes) ; - x8 (8 bytes) ; - ArgumentRegisters (x0-x7, 64 bytes) ; -; On exit, $Target contains the TransitionBlock pointer (sp+128). +; On exit, $Target contains the TransitionBlock pointer (sp+192). MACRO PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $Target @@ -229,17 +230,27 @@ OFFSETOF__ee_alloc_context EQU OFFSETOF__RuntimeThreadLocals__ee_alloc_context PROLOG_SAVE_REG_PAIR x25, x26, #64 PROLOG_SAVE_REG_PAIR x27, x28, #80 - ; Allocate space for FloatArgumentRegisters - PROLOG_STACK_ALLOC 128 + ; Allocate space for FloatArgumentRegisters (128) + FP callee-saved (64) = 192 bytes + PROLOG_STACK_ALLOC 192 - ; Save argument registers (x8, x0-x7) at offset 232 from sp (128 + 104) - SAVE_ARGUMENT_REGISTERS sp, 232 + ; Save argument registers (x8, x0-x7) at offset 296 from sp (192 + 104) + SAVE_ARGUMENT_REGISTERS sp, 296 - ; Save floating point argument registers (q0-q7) at sp+0 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 0 + ; Save floating point argument registers (q0-q7) at sp+64 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 64 + + ; Save FP callee-saved registers (d8-d15) at sp+0 + str d8, [sp, #0] + str d9, [sp, #8] + str d10, [sp, #16] + str d11, [sp, #24] + str d12, [sp, #32] + str d13, [sp, #40] + str d14, [sp, #48] + str d15, [sp, #56] ; Set target to TransitionBlock pointer - add $Target, sp, #128 + add $Target, sp, #192 MEND #define GC_ALLOC_FINALIZE 1 diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index 47596394c81d09..1f801472209622 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10928,9 +10928,24 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p 
m_Context.R8 = 0; m_Context.R9 = 0; - // Note: On Windows AMD64, floating point argument registers (xmm0-xmm3) are not currently saved - // in the transition block for IL_Throw helpers. If needed, the assembly stubs would need to be - // updated to save them. + // Read FP callee-saved registers (xmm6-xmm15) from the stack + // They are stored at negative offsets from TransitionBlock: + // Layout: [xmm6-xmm15 (160 bytes)] [xmm0-xmm3 (64 bytes)] [shadow (32 bytes)] [CalleeSavedRegs] [RetAddr] + // FP callee-saved are at TransitionBlock - 256 (160 + 64 + 32) + M128A *pFpCalleeSaved = (M128A*)((BYTE*)pTransitionBlock - 256); + m_Context.Xmm6 = pFpCalleeSaved[0]; + m_Context.Xmm7 = pFpCalleeSaved[1]; + m_Context.Xmm8 = pFpCalleeSaved[2]; + m_Context.Xmm9 = pFpCalleeSaved[3]; + m_Context.Xmm10 = pFpCalleeSaved[4]; + m_Context.Xmm11 = pFpCalleeSaved[5]; + m_Context.Xmm12 = pFpCalleeSaved[6]; + m_Context.Xmm13 = pFpCalleeSaved[7]; + m_Context.Xmm14 = pFpCalleeSaved[8]; + m_Context.Xmm15 = pFpCalleeSaved[9]; + + // Initialize FP control/status + m_Context.MxCsr = 0x1F80; // Default MXCSR value (all exceptions masked) #endif #define CALLEE_SAVED_REGISTER(reg) \ @@ -10975,9 +10990,19 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p { m_Context.D[i] = pFloatArgs->d[i]; } - // Initialize remaining D registers (D8-D31) to zero - // D8-D15 are callee-saved, D16-D31 are caller-saved - for (int i = 8; i < 32; i++) + + // Read FP callee-saved registers (d8-d15) from the stack + // They are stored at negative offset from TransitionBlock: + // Layout: [d8-d15 (64 bytes)] [padding (8 bytes)] [d0-d7 (64 bytes)] [TransitionBlock] + // FP callee-saved are at TransitionBlock - 136 (64 + 8 + 64) + UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 136); + for (int i = 0; i < 8; i++) + { + m_Context.D[8 + i] = pFpCalleeSaved[i]; + } + + // Initialize remaining D registers (D16-D31) to zero - these are caller-saved + for (int i = 16; i < 32; i++) { 
m_Context.D[i] = 0; } @@ -11039,9 +11064,31 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p { m_Context.V[i] = pFloatArgs->q[i]; } - // Initialize remaining V registers (V8-V31) to zero - // V8-V15 are callee-saved (only lower 64 bits), V16-V31 are caller-saved - for (int i = 8; i < 32; i++) + + // Read FP callee-saved registers (d8-d15) from the stack + // They are stored at negative offset from TransitionBlock: + // Layout: [d8-d15 (64 bytes)] [q0-q7 (128 bytes)] [TransitionBlock] + // FP callee-saved are at TransitionBlock - 192 (64 + 128) + UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 192); + m_Context.V[8].Low = pFpCalleeSaved[0]; + m_Context.V[8].High = 0; + m_Context.V[9].Low = pFpCalleeSaved[1]; + m_Context.V[9].High = 0; + m_Context.V[10].Low = pFpCalleeSaved[2]; + m_Context.V[10].High = 0; + m_Context.V[11].Low = pFpCalleeSaved[3]; + m_Context.V[11].High = 0; + m_Context.V[12].Low = pFpCalleeSaved[4]; + m_Context.V[12].High = 0; + m_Context.V[13].Low = pFpCalleeSaved[5]; + m_Context.V[13].High = 0; + m_Context.V[14].Low = pFpCalleeSaved[6]; + m_Context.V[14].High = 0; + m_Context.V[15].Low = pFpCalleeSaved[7]; + m_Context.V[15].High = 0; + + // Initialize remaining V registers (V16-V31) to zero - these are caller-saved + for (int i = 16; i < 32; i++) { m_Context.V[i].Low = 0; m_Context.V[i].High = 0; @@ -11109,8 +11156,20 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p { memcpy(&m_Context.F[i * 4], &pFloatArgs->f[i], sizeof(double)); } - // Initialize remaining F registers to zero - for (int i = 8; i < 32; i++) + + // Read FP callee-saved registers (f24-f31) from the stack + // They are stored at negative offset from TransitionBlock: + // Layout: [f24-f31 (64 bytes)] [fa0-fa7 (64 bytes)] [TransitionBlock] + // FP callee-saved are at TransitionBlock - 128 (64 + 64) + UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 128); + for (int i = 0; i < 8; i++) + { + 
// f24-f31 map to indices 24-31 in the F array, each taking 4 slots + memcpy(&m_Context.F[(24 + i) * 4], &pFpCalleeSaved[i], sizeof(double)); + } + + // Initialize remaining F registers (f8-f23) to zero + for (int i = 8; i < 24; i++) { memset(&m_Context.F[i * 4], 0, sizeof(double) * 4); } @@ -11179,6 +11238,20 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // F[10-17] are fa0-fa7 in RISC-V register naming memcpy(&m_Context.F[10 + i], &pFloatArgs->f[i], sizeof(double)); } + + // Read FP callee-saved registers (fs0-fs11) from the stack + // They are stored at negative offset from TransitionBlock: + // Layout: [fs0-fs11 (96 bytes)] [fa0-fa7 (64 bytes)] [TransitionBlock] + // FP callee-saved are at TransitionBlock - 160 (96 + 64) + // RISC-V FP callee-saved: fs0=f8, fs1=f9, fs2-fs11=f18-f27 + UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 160); + memcpy(&m_Context.F[8], &pFpCalleeSaved[0], sizeof(double)); // fs0 = f8 + memcpy(&m_Context.F[9], &pFpCalleeSaved[1], sizeof(double)); // fs1 = f9 + for (int i = 0; i < 10; i++) + { + memcpy(&m_Context.F[18 + i], &pFpCalleeSaved[2 + i], sizeof(double)); // fs2-fs11 = f18-f27 + } + // Initialize FP control/status register m_Context.Fcsr = 0; From 4835a5ff4cfa80a508a4d24f1df2caf8800cb0ca Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Fri, 9 Jan 2026 21:38:02 +0200 Subject: [PATCH 07/30] . 
--- src/coreclr/pal/inc/unixasmmacrosarm.inc | 22 ++++++++--- src/coreclr/vm/amd64/AsmMacros.inc | 48 ++++++++++++------------ src/coreclr/vm/excep.cpp | 9 +++-- src/coreclr/vm/jithelpers.cpp | 15 +++++--- 4 files changed, 56 insertions(+), 38 deletions(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosarm.inc b/src/coreclr/pal/inc/unixasmmacrosarm.inc index 6988f71a5c96e3..020743971ec75c 100644 --- a/src/coreclr/pal/inc/unixasmmacrosarm.inc +++ b/src/coreclr/pal/inc/unixasmmacrosarm.inc @@ -291,19 +291,29 @@ C_FUNC(\Name): // Pushes a full TransitionBlock on the stack including float argument registers. // On exit, \target contains the TransitionBlock pointer. // -// Stack alignment: Arguments (16) + callee-saved (32) + alloc (72) = 120 bytes -// 184 % 8 = 0, so stack is properly 8-byte aligned for ARM32. +// Stack layout (from sp going up): +// sp+0: d8-d15 (64 bytes) - FP callee-saved +// sp+64: padding (4 bytes) - to make d0-d7 8-byte aligned at TransitionBlock-68 +// sp+68: d0-d7 (64 bytes) - float argument registers (at TransitionBlock - 68) +// sp+132: padding (4 bytes) - to keep total allocation 8-byte aligned +// sp+136: TransitionBlock starts here (CalleeSavedRegisters + ArgumentRegisters pushed above) +// +// GetNegSpaceSize() for ARM32 = 64 (FloatArgumentRegisters) + 4 (padding) = 68 +// GetOffsetOfFloatArgumentRegisters() = -68 +// +// Total stack alloc: 64 + 4 + 64 + 4 = 136 bytes +// Stack: Arguments(16) + callee-saved(36) + alloc(136) = 188 bytes +// 188 % 4 = 0, properly aligned for ARM32 .macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target // Push argument registers (r0-r3) - these will be at highest address in TransitionBlock PUSH_ARGUMENT_REGISTERS PUSH_CALLEE_SAVED_REGISTERS PROLOG_STACK_SAVE_OFFSET r7, #12 // let r7 point the saved r7 in the stack (clang FP style) - // Allocate space for float argument registers (64 bytes) + padding (8 bytes) + FP callee-saved (64 bytes) = 136 bytes - // Stack layout: [d8-d15 (64)] [padding (8)] [d0-d7 (64)] 
[TransitionBlock] + // Allocate space for: d8-d15 (64) + padding (4) + d0-d7 (64) + padding (4) = 136 bytes alloc_stack 136 - // Save floating point argument registers (d0-d7) at sp+72 (after FP callee-saved and padding) - add r12, sp, #72 + // Save floating point argument registers (d0-d7) at sp+68 (TransitionBlock - 68) + add r12, sp, #68 vstm r12, {d0-d7} // Save FP callee-saved registers (d8-d15) at sp+0 add r12, sp, #0 diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index 1e65e25f65d38c..6213e328bd25b1 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -495,42 +495,44 @@ POP_COOP_PINVOKE_FRAME macro ; Outgoing argument homes (32 bytes) ; FloatArgumentRegisters (xmm0-xmm3, 64 bytes) ; FP Callee-saved registers (xmm6-xmm15, 160 bytes) +; Padding (8 bytes for 16-byte alignment) ; sp points here ; -; Stack alignment: After call to IL_Throw (8 bytes) + callee-saved (64 bytes) + alloc (256 bytes) + call (8 bytes) = 336 bytes -; 336 % 16 = 0, so stack is properly aligned for the call to IL_Throw_Impl. +; Stack alignment: After call to IL_Throw (8 bytes) + callee-saved (64 bytes) + alloc (264 bytes) = 336 bytes +; The XMM save area at offset 8 is 16-byte aligned (336 - 8 = 328, 328 % 16 = 8, but RSP+8 % 16 = 0). ; ; On exit, target contains the TransitionBlock pointer (CalleeSavedRegisters). 
PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target PUSH_CALLEE_SAVED_REGISTERS - ; Allocate space for: outgoing args (32) + float args (64) + FP callee-saved (160) = 256 bytes - alloc_stack 256 + ; Allocate space for: outgoing args (32) + float args (64) + FP callee-saved (160) + padding (8) = 264 bytes + alloc_stack 264 ; Save argument registers to shadow space area - ; Shadow space is at offset 224 (after float args and FP callee-saved) from sp - SAVE_ARGUMENT_REGISTERS 224 - - ; Save float argument registers at offset 160 (after FP callee-saved) - SAVE_FLOAT_ARGUMENT_REGISTERS 160 - - ; Save FP callee-saved registers (xmm6-xmm15) at offset 0 - save_xmm128_postrsp xmm6, 0 - save_xmm128_postrsp xmm7, 10h - save_xmm128_postrsp xmm8, 20h - save_xmm128_postrsp xmm9, 30h - save_xmm128_postrsp xmm10, 40h - save_xmm128_postrsp xmm11, 50h - save_xmm128_postrsp xmm12, 60h - save_xmm128_postrsp xmm13, 70h - save_xmm128_postrsp xmm14, 80h - save_xmm128_postrsp xmm15, 90h + ; Shadow space is at offset 232 (after float args and FP callee-saved and padding) from sp + SAVE_ARGUMENT_REGISTERS 232 + + ; Save float argument registers at offset 168 (after FP callee-saved and padding) + SAVE_FLOAT_ARGUMENT_REGISTERS 168 + + ; Save FP callee-saved registers (xmm6-xmm15) at offset 8 (after padding) + ; Offset 8 ensures 16-byte alignment for movaps + save_xmm128_postrsp xmm6, 8h + save_xmm128_postrsp xmm7, 18h + save_xmm128_postrsp xmm8, 28h + save_xmm128_postrsp xmm9, 38h + save_xmm128_postrsp xmm10, 48h + save_xmm128_postrsp xmm11, 58h + save_xmm128_postrsp xmm12, 68h + save_xmm128_postrsp xmm13, 78h + save_xmm128_postrsp xmm14, 88h + save_xmm128_postrsp xmm15, 98h END_PROLOGUE - ; TransitionBlock pointer points to CalleeSavedRegisters at rsp + 256 - lea target, [rsp + 256] + ; TransitionBlock pointer points to CalleeSavedRegisters at rsp + 264 + lea target, [rsp + 264] endm diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index 1f801472209622..8e56038ce2123a 100644 --- 
a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10930,8 +10930,9 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // Read FP callee-saved registers (xmm6-xmm15) from the stack // They are stored at negative offsets from TransitionBlock: - // Layout: [xmm6-xmm15 (160 bytes)] [xmm0-xmm3 (64 bytes)] [shadow (32 bytes)] [CalleeSavedRegs] [RetAddr] - // FP callee-saved are at TransitionBlock - 256 (160 + 64 + 32) + // Layout: [padding (8)] [xmm6-xmm15 (160 bytes)] [xmm0-xmm3 (64 bytes)] [shadow (32 bytes)] [CalleeSavedRegs] [RetAddr] + // FP callee-saved are at TransitionBlock - 256 (8 padding + 160 + 64 + 32 - 8 for the xmm start offset) + // More precisely: xmm6 is at TransitionBlock - 264 + 8 = TransitionBlock - 256 M128A *pFpCalleeSaved = (M128A*)((BYTE*)pTransitionBlock - 256); m_Context.Xmm6 = pFpCalleeSaved[0]; m_Context.Xmm7 = pFpCalleeSaved[1]; @@ -10993,8 +10994,8 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // Read FP callee-saved registers (d8-d15) from the stack // They are stored at negative offset from TransitionBlock: - // Layout: [d8-d15 (64 bytes)] [padding (8 bytes)] [d0-d7 (64 bytes)] [TransitionBlock] - // FP callee-saved are at TransitionBlock - 136 (64 + 8 + 64) + // Layout: [d8-d15 (64 bytes)] [padding (4)] [d0-d7 (64 bytes)] [padding (4)] [TransitionBlock] + // FP callee-saved are at TransitionBlock - 136 (64 + 4 + 64 + 4) UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 136); for (int i = 0; i < 8; i++) { diff --git a/src/coreclr/vm/jithelpers.cpp b/src/coreclr/vm/jithelpers.cpp index e65b9e7cabbfa5..3719f15fc59919 100644 --- a/src/coreclr/vm/jithelpers.cpp +++ b/src/coreclr/vm/jithelpers.cpp @@ -743,10 +743,8 @@ HCIMPL1(EnregisteredTypeHandle, JIT_GetClassFromMethodParam, MethodDesc* pMD) return pMT; HCIMPLEND -#include - - +#include //======================================================================== // @@ -784,9 +782,16 @@ EXTERN_C 
HCIMPL2(void, IL_Throw_Impl, Object* obj, TransitionBlock* transitionB FC_CAN_TRIGGER_GC(); if (oref == 0) - DispatchManagedException(kNullReferenceException); + { + // Create a NullReferenceException and throw it with the correct context + EEException ex(kNullReferenceException); + oref = ex.CreateThrowable(); + } + else + { + NormalizeThrownObject(&oref); + } - NormalizeThrownObject(&oref); DispatchManagedException(oref, exceptionFrame.GetContext()); FC_CAN_TRIGGER_GC_END(); From 2746edf80e547bc831684e6ab1d551637070fd8b Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Sat, 10 Jan 2026 18:17:07 +0200 Subject: [PATCH 08/30] . --- src/coreclr/vm/amd64/AsmMacros.inc | 58 ++++++++++++++++++------------ src/coreclr/vm/excep.cpp | 7 ++-- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index 6213e328bd25b1..f8b2bd11fb5078 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -495,39 +495,51 @@ POP_COOP_PINVOKE_FRAME macro ; Outgoing argument homes (32 bytes) ; FloatArgumentRegisters (xmm0-xmm3, 64 bytes) ; FP Callee-saved registers (xmm6-xmm15, 160 bytes) -; Padding (8 bytes for 16-byte alignment) ; sp points here ; -; Stack alignment: After call to IL_Throw (8 bytes) + callee-saved (64 bytes) + alloc (264 bytes) = 336 bytes -; The XMM save area at offset 8 is 16-byte aligned (336 - 8 = 328, 328 % 16 = 8, but RSP+8 % 16 = 0). +; Stack alignment: After call (8) + callee-saved pushes (64) + alloc (256) = 328 bytes +; 328 mod 16 = 8, so RSP mod 16 = 8 after alloc - NOT 16-byte aligned. +; We need RSP to be 16-byte aligned for movaps AND unwind offsets must be multiples of 16. +; +; Solution: Use alloc_stack 272 (adds 16 bytes padding). 8 + 64 + 272 = 344, 344 mod 16 = 8. +; Wait, that's still not right. 
Let me recalculate: +; After call: RSP mod 16 = 8 (return addr pushed from 16-byte aligned stack) +; After 8 pushes (64 bytes): RSP mod 16 = (8 + 64) mod 16 = 72 mod 16 = 8 +; After alloc N: RSP mod 16 = (8 + N) mod 16 +; For RSP mod 16 = 0, need N mod 16 = 8 +; 256 mod 16 = 0, so RSP mod 16 = 8 (NOT aligned) +; 264 mod 16 = 8, so RSP mod 16 = 0 (aligned!) but offsets 0,16,32... work +; +; With alloc_stack 264: RSP is 16-byte aligned, XMM saves at 0, 16, 32, ... (multiples of 16) ; ; On exit, target contains the TransitionBlock pointer (CalleeSavedRegisters). PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target PUSH_CALLEE_SAVED_REGISTERS - ; Allocate space for: outgoing args (32) + float args (64) + FP callee-saved (160) + padding (8) = 264 bytes + ; Allocate space for: FP callee-saved (160) + float args (64) + shadow (32) + padding (8) = 264 bytes + ; This makes RSP 16-byte aligned (8 + 64 + 264 = 336, and original RSP - 336 is 16-byte aligned) alloc_stack 264 - ; Save argument registers to shadow space area - ; Shadow space is at offset 232 (after float args and FP callee-saved and padding) from sp - SAVE_ARGUMENT_REGISTERS 232 - - ; Save float argument registers at offset 168 (after FP callee-saved and padding) - SAVE_FLOAT_ARGUMENT_REGISTERS 168 - - ; Save FP callee-saved registers (xmm6-xmm15) at offset 8 (after padding) - ; Offset 8 ensures 16-byte alignment for movaps - save_xmm128_postrsp xmm6, 8h - save_xmm128_postrsp xmm7, 18h - save_xmm128_postrsp xmm8, 28h - save_xmm128_postrsp xmm9, 38h - save_xmm128_postrsp xmm10, 48h - save_xmm128_postrsp xmm11, 58h - save_xmm128_postrsp xmm12, 68h - save_xmm128_postrsp xmm13, 78h - save_xmm128_postrsp xmm14, 88h - save_xmm128_postrsp xmm15, 98h + ; Save argument registers to shadow space area at offset 224 + SAVE_ARGUMENT_REGISTERS 224 + + ; Save float argument registers at offset 160 + SAVE_FLOAT_ARGUMENT_REGISTERS 160 + + ; Save FP callee-saved registers (xmm6-xmm15) at offset 0 + ; RSP is 16-byte aligned, so offset 
0, 16, 32, ... are all 16-byte aligned + ; AND these offsets are multiples of 16 as required by unwind codes + save_xmm128_postrsp xmm6, 0h + save_xmm128_postrsp xmm7, 10h + save_xmm128_postrsp xmm8, 20h + save_xmm128_postrsp xmm9, 30h + save_xmm128_postrsp xmm10, 40h + save_xmm128_postrsp xmm11, 50h + save_xmm128_postrsp xmm12, 60h + save_xmm128_postrsp xmm13, 70h + save_xmm128_postrsp xmm14, 80h + save_xmm128_postrsp xmm15, 90h END_PROLOGUE diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index 8e56038ce2123a..d9b45e9bc29d25 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10930,10 +10930,9 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // Read FP callee-saved registers (xmm6-xmm15) from the stack // They are stored at negative offsets from TransitionBlock: - // Layout: [padding (8)] [xmm6-xmm15 (160 bytes)] [xmm0-xmm3 (64 bytes)] [shadow (32 bytes)] [CalleeSavedRegs] [RetAddr] - // FP callee-saved are at TransitionBlock - 256 (8 padding + 160 + 64 + 32 - 8 for the xmm start offset) - // More precisely: xmm6 is at TransitionBlock - 264 + 8 = TransitionBlock - 256 - M128A *pFpCalleeSaved = (M128A*)((BYTE*)pTransitionBlock - 256); + // Layout: [xmm6-xmm15 (160 bytes)] [xmm0-xmm3 (64 bytes)] [shadow (32 bytes)] [padding (8 bytes)] [CalleeSavedRegs] [RetAddr] + // xmm6 is at sp+0, TransitionBlock is at sp+264, so xmm6 is at TransitionBlock - 264 + M128A *pFpCalleeSaved = (M128A*)((BYTE*)pTransitionBlock - 264); m_Context.Xmm6 = pFpCalleeSaved[0]; m_Context.Xmm7 = pFpCalleeSaved[1]; m_Context.Xmm8 = pFpCalleeSaved[2]; From 2450d886141cf5da082b33d908cbeadc61603699 Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Sat, 10 Jan 2026 21:41:19 +0200 Subject: [PATCH 09/30] fixes and reverts --- src/coreclr/vm/excep.cpp | 42 ----------------------------------- src/coreclr/vm/frames.h | 1 - src/coreclr/vm/jithelpers.cpp | 12 ++-------- 3 files changed, 2 insertions(+), 
53 deletions(-) diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index d9b45e9bc29d25..da9dd59cd249f1 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -11291,46 +11291,6 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p #endif // TARGET_X86 -// -// Init a new frame -// -void SoftwareExceptionFrame::Init() -{ - WRAPPER_NO_CONTRACT; - - // On x86 and when using TransitionBlock path (indicated by m_ReturnAddress being set), - // we initialize the context state from transition block in UpdateContextFromTransitionBlock method. -#ifndef TARGET_X86 - // If m_ReturnAddress is already set, the context was populated from TransitionBlock - // and we should skip VirtualUnwind. - if (m_ReturnAddress == 0) - { -#define CALLEE_SAVED_REGISTER(regname) m_ContextPointers.regname = NULL; - ENUM_CALLEE_SAVED_REGISTERS(); -#undef CALLEE_SAVED_REGISTER - -#ifndef TARGET_UNIX - Thread::VirtualUnwindCallFrame(&m_Context, &m_ContextPointers); -#else // !TARGET_UNIX - BOOL success = PAL_VirtualUnwind(&m_Context, &m_ContextPointers); - if (!success) - { - _ASSERTE(!"SoftwareExceptionFrame::Init failed"); - EEPOLICY_HANDLE_FATAL_ERROR(COR_E_EXECUTIONENGINE); - } -#endif // !TARGET_UNIX - -#define CALLEE_SAVED_REGISTER(regname) if (m_ContextPointers.regname == NULL) m_ContextPointers.regname = &m_Context.regname; - ENUM_CALLEE_SAVED_REGISTERS(); -#undef CALLEE_SAVED_REGISTER - - m_ReturnAddress = ::GetIP(&m_Context); - } - - _ASSERTE(ExecutionManager::IsManagedCode(::GetIP(&m_Context))); -#endif // !TARGET_X86 -} - // // Init and Link in a new frame // @@ -11338,8 +11298,6 @@ void SoftwareExceptionFrame::InitAndLink(Thread *pThread) { WRAPPER_NO_CONTRACT; - Init(); - Push(pThread); } diff --git a/src/coreclr/vm/frames.h b/src/coreclr/vm/frames.h index a656717b2d5158..cfcea97ab1bc15 100644 --- a/src/coreclr/vm/frames.h +++ b/src/coreclr/vm/frames.h @@ -999,7 +999,6 @@ class SoftwareExceptionFrame : public Frame } #ifndef 
DACCESS_COMPILE - void Init(); void InitAndLink(Thread *pThread); #endif diff --git a/src/coreclr/vm/jithelpers.cpp b/src/coreclr/vm/jithelpers.cpp index 3719f15fc59919..1c6fbd901c46e5 100644 --- a/src/coreclr/vm/jithelpers.cpp +++ b/src/coreclr/vm/jithelpers.cpp @@ -743,7 +743,6 @@ HCIMPL1(EnregisteredTypeHandle, JIT_GetClassFromMethodParam, MethodDesc* pMD) return pMT; HCIMPLEND - #include //======================================================================== @@ -782,16 +781,9 @@ EXTERN_C HCIMPL2(void, IL_Throw_Impl, Object* obj, TransitionBlock* transitionB FC_CAN_TRIGGER_GC(); if (oref == 0) - { - // Create a NullReferenceException and throw it with the correct context - EEException ex(kNullReferenceException); - oref = ex.CreateThrowable(); - } - else - { - NormalizeThrownObject(&oref); - } + DispatchManagedException(kNullReferenceException); + NormalizeThrownObject(&oref); DispatchManagedException(oref, exceptionFrame.GetContext()); FC_CAN_TRIGGER_GC_END(); From 7fe9c53d967a97c7eafde0693c2991686c92cd16 Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Sun, 11 Jan 2026 11:02:42 +0200 Subject: [PATCH 10/30] [x64] unix: fix alignment, win: test a random thought --- src/coreclr/pal/inc/unixasmmacrosamd64.inc | 23 +++++++++++++--------- src/coreclr/vm/jithelpers.cpp | 9 +++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index 8c6f98c0d2737b..e6960a33c38bb9 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -436,29 +436,34 @@ C_FUNC(\Name\()_End): // Return address // CalleeSavedRegisters (rbp, rbx, r15, r14, r13, r12 - 48 bytes) // ArgumentRegisters (r9, r8, rcx, rdx, rsi, rdi - 48 bytes) <- TransitionBlock -// Padding (16 bytes for 16-byte alignment) +// Padding (24 bytes for 16-byte alignment) // FloatArgumentRegisters (xmm0-xmm7, 128 bytes) // sp points here // 
-// Stack alignment: After call (8) + callee-saved (48) + arg regs (48) + alloc (144) + call (8) = 256 bytes -// 256 % 16 = 0, so stack is properly aligned for the call to IL_Throw_Impl. +// Stack alignment calculation: +// Before call to IL_Throw: rsp is 16-byte aligned +// After call (return addr pushed): rsp % 16 = 8 +// After PUSH_CALLEE_SAVED_REGISTERS (48 bytes): rsp % 16 = 8 (48 % 16 = 0) +// After PUSH_ARGUMENT_REGISTERS (48 bytes): rsp % 16 = 8 (48 % 16 = 0) +// After alloc_stack 152: rsp % 16 = (8 - 152) % 16 = 0 <- properly aligned! // -// On exit, \target contains the TransitionBlock pointer (after float args area). +// On exit, \target contains the TransitionBlock pointer. .macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target set_cfa_register rsp, 8 PUSH_CALLEE_SAVED_REGISTERS PUSH_ARGUMENT_REGISTERS - // Allocate space for alignment (16 bytes) + float argument registers (128 bytes) = 144 bytes - alloc_stack 144 - // Save float argument registers at offset 16 (after alignment padding) + // Allocate space for padding (24 bytes) + float argument registers (128 bytes) = 152 bytes + // The extra 8 bytes (vs 144) ensures 16-byte stack alignment before the call + alloc_stack 152 + // Save float argument registers at offset 24 (after alignment padding) // This ensures floats are at TransitionBlock - 128 (matching GetOffsetOfFloatArgumentRegisters) - SAVE_FLOAT_ARGUMENT_REGISTERS 16 + SAVE_FLOAT_ARGUMENT_REGISTERS 24 END_PROLOGUE - lea \target, [rsp + 144] + lea \target, [rsp + 152] .endm .macro INLINE_GETTHREAD diff --git a/src/coreclr/vm/jithelpers.cpp b/src/coreclr/vm/jithelpers.cpp index 1c6fbd901c46e5..b314d4f938916d 100644 --- a/src/coreclr/vm/jithelpers.cpp +++ b/src/coreclr/vm/jithelpers.cpp @@ -784,6 +784,11 @@ EXTERN_C HCIMPL2(void, IL_Throw_Impl, Object* obj, TransitionBlock* transitionB DispatchManagedException(kNullReferenceException); NormalizeThrownObject(&oref); + + // Set the last thrown object before dispatching the exception. 
+ // This is required for exception handling code that checks LastThrownObject. + pThread->SafeSetLastThrownObject(oref); + DispatchManagedException(oref, exceptionFrame.GetContext()); FC_CAN_TRIGGER_GC_END(); @@ -832,6 +837,10 @@ EXTERN_C HCIMPL2(void, IL_ThrowExact_Impl, Object* obj, TransitionBlock* transi FC_CAN_TRIGGER_GC(); + // Set the last thrown object before dispatching the exception. + // This is required for exception handling code that checks LastThrownObject. + pThread->SafeSetLastThrownObject(oref); + DispatchManagedException(oref, exceptionFrame.GetContext()); FC_CAN_TRIGGER_GC_END(); From e8287a56cda15339c31a9ca03d6e265fdd01d628 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Sun, 11 Jan 2026 21:13:45 +0200 Subject: [PATCH 11/30] linux-x64 fixes --- src/coreclr/pal/inc/unixasmmacrosamd64.inc | 73 +++++++++++++++++----- 1 file changed, 58 insertions(+), 15 deletions(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index e6960a33c38bb9..90e53a00d87932 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -147,6 +147,12 @@ C_FUNC(\Name\()_End): // the xmm registers are not supported by the libunwind .endm +// Unaligned version for use when stack alignment cannot be guaranteed +.macro save_xmm128_postrsp_unaligned Reg, Offset + __Offset = \Offset + movdqu xmmword ptr [rsp + __Offset], \Reg +.endm + .macro restore_xmm128 Reg, ofs __Offset = \ofs movdqa \Reg, xmmword ptr [rsp + __Offset] @@ -246,6 +252,20 @@ C_FUNC(\Name\()_End): .endm +// Unaligned version for cases where 16-byte stack alignment cannot be guaranteed +.macro SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED ofs + + save_xmm128_postrsp_unaligned xmm0, \ofs + save_xmm128_postrsp_unaligned xmm1, \ofs + 0x10 + save_xmm128_postrsp_unaligned xmm2, \ofs + 0x20 + save_xmm128_postrsp_unaligned xmm3, \ofs + 0x30 + save_xmm128_postrsp_unaligned xmm4, \ofs + 
0x40 + save_xmm128_postrsp_unaligned xmm5, \ofs + 0x50 + save_xmm128_postrsp_unaligned xmm6, \ofs + 0x60 + save_xmm128_postrsp_unaligned xmm7, \ofs + 0x70 + +.endm + .macro RESTORE_FLOAT_ARGUMENT_REGISTERS ofs restore_xmm128 xmm0, \ofs @@ -433,19 +453,42 @@ C_FUNC(\Name\()_End): // need to capture the complete register state. // // Stack layout (from high to low address after prologue): -// Return address +// Return address (8 bytes) // CalleeSavedRegisters (rbp, rbx, r15, r14, r13, r12 - 48 bytes) -// ArgumentRegisters (r9, r8, rcx, rdx, rsi, rdi - 48 bytes) <- TransitionBlock -// Padding (24 bytes for 16-byte alignment) -// FloatArgumentRegisters (xmm0-xmm7, 128 bytes) +// ArgumentRegisters (r9, r8, rcx, rdx, rsi, rdi - 48 bytes) <- TransitionBlock pointer +// FloatArgumentRegisters (xmm0-xmm7, 128 bytes) at rsp+8 +// 8-byte alignment padding at rsp+0 // sp points here // // Stack alignment calculation: -// Before call to IL_Throw: rsp is 16-byte aligned +// Before call to IL_Throw: rsp is 16-byte aligned (ABI requirement before call) // After call (return addr pushed): rsp % 16 = 8 -// After PUSH_CALLEE_SAVED_REGISTERS (48 bytes): rsp % 16 = 8 (48 % 16 = 0) -// After PUSH_ARGUMENT_REGISTERS (48 bytes): rsp % 16 = 8 (48 % 16 = 0) -// After alloc_stack 152: rsp % 16 = (8 - 152) % 16 = 0 <- properly aligned! +// After PUSH_CALLEE_SAVED_REGISTERS (48 bytes): rsp % 16 = 8 +// After PUSH_ARGUMENT_REGISTERS (48 bytes): rsp % 16 = 8 +// After alloc_stack 136: rsp % 16 = (8 - 136 % 16) = (8 - 8) = 0 <- aligned! +// +// Stack layout for IL_Throw helpers using TransitionBlock with float registers. +// +// Stack alignment calculation: +// Before call to IL_Throw: rsp is 16-byte aligned (ABI requirement before call) +// After call (return addr pushed): rsp % 16 = 8 +// After PUSH_CALLEE_SAVED_REGISTERS (48 bytes): rsp % 16 = 8 +// After PUSH_ARGUMENT_REGISTERS (48 bytes): rsp % 16 = 8 +// After alloc_stack 136: rsp % 16 = (8 - 136 % 16) = (8 - 8) = 0 <- aligned! 
+// +// Stack layout (low to high addresses): +// rsp+0: 8 bytes padding (for alignment) +// rsp+8: FloatArgumentRegisters (xmm0-xmm7, 128 bytes) +// rsp+136: TransitionBlock start +// - ArgumentRegisters (rdi, rsi, rdx, rcx, r8, r9: 48 bytes) +// - CalleeSavedRegisters (r12, r13, r14, r15, rbx, rbp: 48 bytes) +// - Return address (8 bytes) +// +// TransitionBlock at rsp+136, floats at rsp+8 = TransitionBlock - 128 +// (matches GetOffsetOfFloatArgumentRegisters which returns -128) +// +// NOTE: We use SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED because rsp+8 is not +// 16-byte aligned (rsp is aligned, rsp+8 is not). // // On exit, \target contains the TransitionBlock pointer. .macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target @@ -454,16 +497,16 @@ C_FUNC(\Name\()_End): PUSH_CALLEE_SAVED_REGISTERS PUSH_ARGUMENT_REGISTERS - // Allocate space for padding (24 bytes) + float argument registers (128 bytes) = 152 bytes - // The extra 8 bytes (vs 144) ensures 16-byte stack alignment before the call - alloc_stack 152 - // Save float argument registers at offset 24 (after alignment padding) - // This ensures floats are at TransitionBlock - 128 (matching GetOffsetOfFloatArgumentRegisters) - SAVE_FLOAT_ARGUMENT_REGISTERS 24 + // Allocate 128 bytes for floats + 8 bytes padding = 136 bytes + alloc_stack 136 + // Save float argument registers at offset 8 (TransitionBlock - 128) + // Using unaligned stores because rsp+8 is not 16-byte aligned + SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED 8 END_PROLOGUE - lea \target, [rsp + 152] + // TransitionBlock starts at rsp+136 (where ArgumentRegisters are) + lea \target, [rsp + 136] .endm .macro INLINE_GETTHREAD From 8e7a5ff754b2ee20b160a2b360e5feeca53dcb23 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Sun, 11 Jan 2026 23:34:02 +0200 Subject: [PATCH 12/30] linux-x64 --- src/coreclr/vm/excep.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/coreclr/vm/excep.cpp 
b/src/coreclr/vm/excep.cpp index da9dd59cd249f1..79197977474441 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10900,15 +10900,17 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_ContextPointers.R9 = &m_Context.R9; // Copy floating point argument registers (xmm0-xmm7) - FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); - m_Context.Xmm0 = pFloatArgs->d[0]; - m_Context.Xmm1 = pFloatArgs->d[1]; - m_Context.Xmm2 = pFloatArgs->d[2]; - m_Context.Xmm3 = pFloatArgs->d[3]; - m_Context.Xmm4 = pFloatArgs->d[4]; - m_Context.Xmm5 = pFloatArgs->d[5]; - m_Context.Xmm6 = pFloatArgs->d[6]; - m_Context.Xmm7 = pFloatArgs->d[7]; + // Use memcpy to avoid alignment issues - the source may not be 16-byte aligned + // depending on stack layout in the assembly helpers + BYTE *pFloatArgs = (BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters(); + memcpy(&m_Context.Xmm0, pFloatArgs + 0x00, sizeof(m_Context.Xmm0)); + memcpy(&m_Context.Xmm1, pFloatArgs + 0x10, sizeof(m_Context.Xmm1)); + memcpy(&m_Context.Xmm2, pFloatArgs + 0x20, sizeof(m_Context.Xmm2)); + memcpy(&m_Context.Xmm3, pFloatArgs + 0x30, sizeof(m_Context.Xmm3)); + memcpy(&m_Context.Xmm4, pFloatArgs + 0x40, sizeof(m_Context.Xmm4)); + memcpy(&m_Context.Xmm5, pFloatArgs + 0x50, sizeof(m_Context.Xmm5)); + memcpy(&m_Context.Xmm6, pFloatArgs + 0x60, sizeof(m_Context.Xmm6)); + memcpy(&m_Context.Xmm7, pFloatArgs + 0x70, sizeof(m_Context.Xmm7)); // Initialize remaining XMM registers to zero memset(&m_Context.Xmm8, 0, sizeof(m_Context.Xmm8)); memset(&m_Context.Xmm9, 0, sizeof(m_Context.Xmm9)); From 9bb83b5384853c759860c69d3408339362433ad0 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Mon, 12 Jan 2026 01:14:50 +0200 Subject: [PATCH 13/30] linux-arm32 --- src/coreclr/pal/inc/unixasmmacrosarm.inc | 33 
++++++++++++------------ 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosarm.inc b/src/coreclr/pal/inc/unixasmmacrosarm.inc index 020743971ec75c..4ea9aa35236eb8 100644 --- a/src/coreclr/pal/inc/unixasmmacrosarm.inc +++ b/src/coreclr/pal/inc/unixasmmacrosarm.inc @@ -292,36 +292,37 @@ C_FUNC(\Name): // On exit, \target contains the TransitionBlock pointer. // // Stack layout (from sp going up): -// sp+0: d8-d15 (64 bytes) - FP callee-saved -// sp+64: padding (4 bytes) - to make d0-d7 8-byte aligned at TransitionBlock-68 -// sp+68: d0-d7 (64 bytes) - float argument registers (at TransitionBlock - 68) -// sp+132: padding (4 bytes) - to keep total allocation 8-byte aligned -// sp+136: TransitionBlock starts here (CalleeSavedRegisters + ArgumentRegisters pushed above) +// sp+0: padding (4 bytes) - for 8-byte alignment +// sp+4: d8-d15 (64 bytes) - FP callee-saved +// sp+68: padding (4 bytes) - to make d0-d7 8-byte aligned at TransitionBlock-68 +// sp+72: d0-d7 (64 bytes) - float argument registers (at TransitionBlock - 68) +// sp+136: padding (4 bytes) - to keep total allocation 8-byte aligned +// sp+140: TransitionBlock starts here (CalleeSavedRegisters + ArgumentRegisters pushed above) // // GetNegSpaceSize() for ARM32 = 64 (FloatArgumentRegisters) + 4 (padding) = 68 // GetOffsetOfFloatArgumentRegisters() = -68 // -// Total stack alloc: 64 + 4 + 64 + 4 = 136 bytes -// Stack: Arguments(16) + callee-saved(36) + alloc(136) = 188 bytes -// 188 % 4 = 0, properly aligned for ARM32 +// Total stack alloc: 4 + 64 + 4 + 64 + 4 = 140 bytes +// Stack: Arguments(16) + callee-saved(36) + alloc(140) = 192 bytes +// 192 % 8 = 0, properly aligned for ARM32 (8-byte alignment required) .macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target // Push argument registers (r0-r3) - these will be at highest address in TransitionBlock PUSH_ARGUMENT_REGISTERS PUSH_CALLEE_SAVED_REGISTERS PROLOG_STACK_SAVE_OFFSET r7, #12 // let r7 point the saved r7 in the 
stack (clang FP style) - // Allocate space for: d8-d15 (64) + padding (4) + d0-d7 (64) + padding (4) = 136 bytes - alloc_stack 136 - // Save floating point argument registers (d0-d7) at sp+68 (TransitionBlock - 68) - add r12, sp, #68 + // Allocate space for: padding (4) + d8-d15 (64) + padding (4) + d0-d7 (64) + padding (4) = 140 bytes + alloc_stack 140 + // Save floating point argument registers (d0-d7) at sp+72 (TransitionBlock - 68) + add r12, sp, #72 vstm r12, {d0-d7} - // Save FP callee-saved registers (d8-d15) at sp+0 - add r12, sp, #0 + // Save FP callee-saved registers (d8-d15) at sp+4 + add r12, sp, #4 vstm r12, {d8-d15} CHECK_STACK_ALIGNMENT END_PROLOGUE - // TransitionBlock is at sp + 136 - add \target, sp, #136 + // TransitionBlock is at sp + 140 + add \target, sp, #140 .endm .macro POP_COOP_PINVOKE_FRAME From bf0e4c56066381211d04509291a3431e83740d9b Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Mon, 12 Jan 2026 18:00:56 +0200 Subject: [PATCH 14/30] Revert --- src/coreclr/vm/jithelpers.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/coreclr/vm/jithelpers.cpp b/src/coreclr/vm/jithelpers.cpp index b314d4f938916d..1c6fbd901c46e5 100644 --- a/src/coreclr/vm/jithelpers.cpp +++ b/src/coreclr/vm/jithelpers.cpp @@ -784,11 +784,6 @@ EXTERN_C HCIMPL2(void, IL_Throw_Impl, Object* obj, TransitionBlock* transitionB DispatchManagedException(kNullReferenceException); NormalizeThrownObject(&oref); - - // Set the last thrown object before dispatching the exception. - // This is required for exception handling code that checks LastThrownObject. - pThread->SafeSetLastThrownObject(oref); - DispatchManagedException(oref, exceptionFrame.GetContext()); FC_CAN_TRIGGER_GC_END(); @@ -837,10 +832,6 @@ EXTERN_C HCIMPL2(void, IL_ThrowExact_Impl, Object* obj, TransitionBlock* transi FC_CAN_TRIGGER_GC(); - // Set the last thrown object before dispatching the exception. 
- // This is required for exception handling code that checks LastThrownObject. - pThread->SafeSetLastThrownObject(oref); - DispatchManagedException(oref, exceptionFrame.GetContext()); FC_CAN_TRIGGER_GC_END(); From af084b81ab4b19f3dab301d1ba44b1c47554ed07 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Mon, 12 Jan 2026 16:59:49 +0000 Subject: [PATCH 15/30] Switch to reusing PROLOG_WITH_TRANSITION_BLOCK --- src/coreclr/pal/inc/unixasmmacrosamd64.inc | 81 ----------- src/coreclr/pal/inc/unixasmmacrosarm.inc | 28 ---- src/coreclr/pal/inc/unixasmmacrosarm64.inc | 41 ------ .../pal/inc/unixasmmacrosloongarch64.inc | 36 ----- src/coreclr/pal/inc/unixasmmacrosriscv64.inc | 42 ------ src/coreclr/vm/amd64/AsmHelpers.asm | 12 +- src/coreclr/vm/amd64/AsmMacros.inc | 51 ------- src/coreclr/vm/amd64/asmhelpers.S | 12 +- src/coreclr/vm/arm/asmhelpers.S | 12 +- src/coreclr/vm/arm64/asmhelpers.S | 20 +-- src/coreclr/vm/arm64/asmhelpers.asm | 12 +- src/coreclr/vm/arm64/asmmacros.h | 49 ------- src/coreclr/vm/excep.cpp | 131 ++++++++---------- src/coreclr/vm/loongarch64/asmhelpers.S | 12 +- src/coreclr/vm/riscv64/asmhelpers.S | 12 +- 15 files changed, 99 insertions(+), 452 deletions(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index 90e53a00d87932..90c8947e754297 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -147,12 +147,6 @@ C_FUNC(\Name\()_End): // the xmm registers are not supported by the libunwind .endm -// Unaligned version for use when stack alignment cannot be guaranteed -.macro save_xmm128_postrsp_unaligned Reg, Offset - __Offset = \Offset - movdqu xmmword ptr [rsp + __Offset], \Reg -.endm - .macro restore_xmm128 Reg, ofs __Offset = \ofs movdqa \Reg, xmmword ptr [rsp + __Offset] @@ -252,20 +246,6 @@ C_FUNC(\Name\()_End): .endm -// Unaligned version for cases where 16-byte stack alignment cannot be guaranteed 
-.macro SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED ofs - - save_xmm128_postrsp_unaligned xmm0, \ofs - save_xmm128_postrsp_unaligned xmm1, \ofs + 0x10 - save_xmm128_postrsp_unaligned xmm2, \ofs + 0x20 - save_xmm128_postrsp_unaligned xmm3, \ofs + 0x30 - save_xmm128_postrsp_unaligned xmm4, \ofs + 0x40 - save_xmm128_postrsp_unaligned xmm5, \ofs + 0x50 - save_xmm128_postrsp_unaligned xmm6, \ofs + 0x60 - save_xmm128_postrsp_unaligned xmm7, \ofs + 0x70 - -.endm - .macro RESTORE_FLOAT_ARGUMENT_REGISTERS ofs restore_xmm128 xmm0, \ofs @@ -448,67 +428,6 @@ C_FUNC(\Name\()_End): POP_CALLEE_SAVED_REGISTERS .endm -// Pushes a full TransitionBlock on the stack including argument registers and -// floating point argument registers. Used for exception throw helpers where we -// need to capture the complete register state. -// -// Stack layout (from high to low address after prologue): -// Return address (8 bytes) -// CalleeSavedRegisters (rbp, rbx, r15, r14, r13, r12 - 48 bytes) -// ArgumentRegisters (r9, r8, rcx, rdx, rsi, rdi - 48 bytes) <- TransitionBlock pointer -// FloatArgumentRegisters (xmm0-xmm7, 128 bytes) at rsp+8 -// 8-byte alignment padding at rsp+0 -// sp points here -// -// Stack alignment calculation: -// Before call to IL_Throw: rsp is 16-byte aligned (ABI requirement before call) -// After call (return addr pushed): rsp % 16 = 8 -// After PUSH_CALLEE_SAVED_REGISTERS (48 bytes): rsp % 16 = 8 -// After PUSH_ARGUMENT_REGISTERS (48 bytes): rsp % 16 = 8 -// After alloc_stack 136: rsp % 16 = (8 - 136 % 16) = (8 - 8) = 0 <- aligned! -// -// Stack layout for IL_Throw helpers using TransitionBlock with float registers. 
-// -// Stack alignment calculation: -// Before call to IL_Throw: rsp is 16-byte aligned (ABI requirement before call) -// After call (return addr pushed): rsp % 16 = 8 -// After PUSH_CALLEE_SAVED_REGISTERS (48 bytes): rsp % 16 = 8 -// After PUSH_ARGUMENT_REGISTERS (48 bytes): rsp % 16 = 8 -// After alloc_stack 136: rsp % 16 = (8 - 136 % 16) = (8 - 8) = 0 <- aligned! -// -// Stack layout (low to high addresses): -// rsp+0: 8 bytes padding (for alignment) -// rsp+8: FloatArgumentRegisters (xmm0-xmm7, 128 bytes) -// rsp+136: TransitionBlock start -// - ArgumentRegisters (rdi, rsi, rdx, rcx, r8, r9: 48 bytes) -// - CalleeSavedRegisters (r12, r13, r14, r15, rbx, rbp: 48 bytes) -// - Return address (8 bytes) -// -// TransitionBlock at rsp+136, floats at rsp+8 = TransitionBlock - 128 -// (matches GetOffsetOfFloatArgumentRegisters which returns -128) -// -// NOTE: We use SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED because rsp+8 is not -// 16-byte aligned (rsp is aligned, rsp+8 is not). -// -// On exit, \target contains the TransitionBlock pointer. 
-.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target - set_cfa_register rsp, 8 - - PUSH_CALLEE_SAVED_REGISTERS - PUSH_ARGUMENT_REGISTERS - - // Allocate 128 bytes for floats + 8 bytes padding = 136 bytes - alloc_stack 136 - // Save float argument registers at offset 8 (TransitionBlock - 128) - // Using unaligned stores because rsp+8 is not 16-byte aligned - SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED 8 - - END_PROLOGUE - - // TransitionBlock starts at rsp+136 (where ArgumentRegisters are) - lea \target, [rsp + 136] -.endm - .macro INLINE_GETTHREAD // Inlined version of call C_FUNC(RhpGetThread) INLINE_GET_TLS_VAR t_CurrentThreadInfo diff --git a/src/coreclr/pal/inc/unixasmmacrosarm.inc b/src/coreclr/pal/inc/unixasmmacrosarm.inc index 4ea9aa35236eb8..739886b0bc014e 100644 --- a/src/coreclr/pal/inc/unixasmmacrosarm.inc +++ b/src/coreclr/pal/inc/unixasmmacrosarm.inc @@ -297,34 +297,6 @@ C_FUNC(\Name): // sp+68: padding (4 bytes) - to make d0-d7 8-byte aligned at TransitionBlock-68 // sp+72: d0-d7 (64 bytes) - float argument registers (at TransitionBlock - 68) // sp+136: padding (4 bytes) - to keep total allocation 8-byte aligned -// sp+140: TransitionBlock starts here (CalleeSavedRegisters + ArgumentRegisters pushed above) -// -// GetNegSpaceSize() for ARM32 = 64 (FloatArgumentRegisters) + 4 (padding) = 68 -// GetOffsetOfFloatArgumentRegisters() = -68 -// -// Total stack alloc: 4 + 64 + 4 + 64 + 4 = 140 bytes -// Stack: Arguments(16) + callee-saved(36) + alloc(140) = 192 bytes -// 192 % 8 = 0, properly aligned for ARM32 (8-byte alignment required) -.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target - // Push argument registers (r0-r3) - these will be at highest address in TransitionBlock - PUSH_ARGUMENT_REGISTERS - PUSH_CALLEE_SAVED_REGISTERS - PROLOG_STACK_SAVE_OFFSET r7, #12 - // let r7 point the saved r7 in the stack (clang FP style) - // Allocate space for: padding (4) + d8-d15 (64) + padding (4) + d0-d7 (64) + padding (4) = 140 bytes - alloc_stack 140 - // Save floating 
point argument registers (d0-d7) at sp+72 (TransitionBlock - 68) - add r12, sp, #72 - vstm r12, {d0-d7} - // Save FP callee-saved registers (d8-d15) at sp+4 - add r12, sp, #4 - vstm r12, {d8-d15} - CHECK_STACK_ALIGNMENT - END_PROLOGUE - // TransitionBlock is at sp + 140 - add \target, sp, #140 -.endm - .macro POP_COOP_PINVOKE_FRAME free_stack 4 POP_CALLEE_SAVED_REGISTERS diff --git a/src/coreclr/pal/inc/unixasmmacrosarm64.inc b/src/coreclr/pal/inc/unixasmmacrosarm64.inc index 2d27459372b561..1c5ff88277c6e6 100644 --- a/src/coreclr/pal/inc/unixasmmacrosarm64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosarm64.inc @@ -427,47 +427,6 @@ C_FUNC(\Name\()_End): // // Stack layout (from low to high address): // sp+0: FP callee-saved registers (d8-d15, 64 bytes) -// sp+64: FloatArgumentRegisters (q0-q7, 128 bytes) -// sp+192: TransitionBlock start (176 bytes) -// - CalleeSavedRegisters (fp, lr, x19-x28 - 96 bytes) -// - padding (8 bytes) -// - x8 (8 bytes) -// - ArgumentRegisters (x0-x7, 64 bytes) -// -// On exit, \target contains the TransitionBlock pointer (sp+192). 
-.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target - PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -176 - - // Spill callee saved registers - PROLOG_SAVE_REG_PAIR x19, x20, 16 - PROLOG_SAVE_REG_PAIR x21, x22, 32 - PROLOG_SAVE_REG_PAIR x23, x24, 48 - PROLOG_SAVE_REG_PAIR x25, x26, 64 - PROLOG_SAVE_REG_PAIR x27, x28, 80 - - // Allocate space for FloatArgumentRegisters (128) + FP callee-saved (64) = 192 bytes - PROLOG_STACK_ALLOC 192 - - // Save argument registers (x8, x0-x7) at offset 296 from sp (192 + 104) - SAVE_ARGUMENT_REGISTERS sp, 296 - - // Save floating point argument registers (q0-q7) at sp+64 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 64 - - // Save FP callee-saved registers (d8-d15) at sp+0 - str d8, [sp, #0] - str d9, [sp, #8] - str d10, [sp, #16] - str d11, [sp, #24] - str d12, [sp, #32] - str d13, [sp, #40] - str d14, [sp, #48] - str d15, [sp, #56] - - // Set target to TransitionBlock pointer - add \target, sp, #192 -.endm - // ------------------------------------------------------------------ // Macro to generate Redirection Stubs // diff --git a/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc b/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc index 92d701598f933e..bbfc4db9bf567a 100644 --- a/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc @@ -407,42 +407,6 @@ C_FUNC(\Name\()_End): // Pushes a full TransitionBlock on the stack including argument registers and // floating point argument registers. Used for exception throw helpers where we // need to capture the complete register state. -// -// Stack layout (from low to high address): -// sp+0: FloatArgumentRegisters (fa0-fa7, 64 bytes) -// sp+64: TransitionBlock start -// - CalleeSavedRegisters (fp, ra, s0-s8 - 96 bytes) -// - ArgumentRegisters (a0-a7, 64 bytes) -// -// On exit, \target contains the TransitionBlock pointer (sp+128). 
-.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target - // Stack: FPCalleeSaved(64) + FloatArgs(64) + CalleeSaved(96) + Args(64) = 288 bytes - PROLOG_STACK_ALLOC 288 - PROLOG_SAVE_REG_PAIR 22, 1, 128, 1 - - // Save callee-saved registers at offset 128 (after FP callee-saved and FloatArgumentRegisters) - SAVE_CALLEESAVED_REGISTERS $sp, 128 - - // Save argument registers (a0-a7) at offset 224 - SAVE_ARGUMENT_REGISTERS $sp, 224 - - // Save floating-point argument registers (fa0-fa7) at offset 64 - SAVE_FLOAT_ARGUMENT_REGISTERS $sp, 64 - - // Save FP callee-saved registers (f24-f31) at offset 0 - fst.d $f24, $sp, 0 - fst.d $f25, $sp, 8 - fst.d $f26, $sp, 16 - fst.d $f27, $sp, 24 - fst.d $f28, $sp, 32 - fst.d $f29, $sp, 40 - fst.d $f30, $sp, 48 - fst.d $f31, $sp, 56 - - // Set target to TransitionBlock pointer - addi.d \target, $sp, 128 -.endm - // ------------------------------------------------------------------ // Macro to generate Redirection Stubs // diff --git a/src/coreclr/pal/inc/unixasmmacrosriscv64.inc b/src/coreclr/pal/inc/unixasmmacrosriscv64.inc index d244756c304eb9..4b80836486bf78 100644 --- a/src/coreclr/pal/inc/unixasmmacrosriscv64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosriscv64.inc @@ -352,48 +352,6 @@ C_FUNC(\Name): // Pushes a full TransitionBlock on the stack including argument registers and // floating point argument registers. Used for exception throw helpers where we // need to capture the complete register state. -// -// Stack layout (from low to high address): -// sp+0: FloatArgumentRegisters (fa0-fa7, 64 bytes) -// sp+64: TransitionBlock start -// - CalleeSavedRegisters (fp, ra, s1-s11, tp, gp - 120 bytes) -// - padding (8 bytes) -// - ArgumentRegisters (a0-a7, 64 bytes) -// -// On exit, \target contains the TransitionBlock pointer (sp+160). 
-.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target - // Stack: FPCalleeSaved(96) + FloatArgs(64) + CalleeSaved(120) + pad(8) + Args(64) = 352 bytes - PROLOG_STACK_ALLOC 352 - PROLOG_SAVE_REG_PAIR fp, ra, 160, 1 - - // Save callee-saved registers at offset 160 (after FP callee-saved and FloatArgumentRegisters) - SAVE_CALLEESAVED_REGISTERS sp, 160 - - // Save argument registers (a0-a7) at offset 288 - SAVE_ARGUMENT_REGISTERS sp, 288 - - // Save floating-point argument registers (fa0-fa7) at offset 96 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 96 - - // Save FP callee-saved registers (fs0-fs11 = f8,f9,f18-f27) at offset 0 - // RISC-V FP callee-saved: fs0=f8, fs1=f9, fs2-fs11=f18-f27 - fsd fs0, 0(sp) // f8 - fsd fs1, 8(sp) // f9 - fsd fs2, 16(sp) // f18 - fsd fs3, 24(sp) // f19 - fsd fs4, 32(sp) // f20 - fsd fs5, 40(sp) // f21 - fsd fs6, 48(sp) // f22 - fsd fs7, 56(sp) // f23 - fsd fs8, 64(sp) // f24 - fsd fs9, 72(sp) // f25 - fsd fs10, 80(sp) // f26 - fsd fs11, 88(sp) // f27 - - // Set target to TransitionBlock pointer - addi \target, sp, 160 -.endm - // ------------------------------------------------------------------ // Macro to generate Redirection Stubs // diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index 4725a0cf8eefbb..b0fdc62f60ee02 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -1211,10 +1211,10 @@ endif ; FEATURE_INTERPRETER ; RCX = Pointer to exception object ;========================================================================== NESTED_ENTRY IL_Throw, _TEXT - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdx + PROLOG_WITH_TRANSITION_BLOCK ; RCX already contains exception object - ; RDX contains pointer to TransitionBlock + lea rdx, [rsp + __PWTB_TransitionBlock] call IL_Throw_Impl ; Should never return int 3 @@ -1228,10 +1228,10 @@ NESTED_END IL_Throw, _TEXT ; RCX = Pointer to exception object ;========================================================================== NESTED_ENTRY 
IL_ThrowExact, _TEXT - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdx + PROLOG_WITH_TRANSITION_BLOCK ; RCX already contains exception object - ; RDX contains pointer to TransitionBlock + lea rdx, [rsp + __PWTB_TransitionBlock] call IL_ThrowExact_Impl ; Should never return int 3 @@ -1242,9 +1242,9 @@ NESTED_END IL_ThrowExact, _TEXT ; implementation written in C. ;========================================================================== NESTED_ENTRY IL_Rethrow, _TEXT - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rcx + PROLOG_WITH_TRANSITION_BLOCK - ; RCX contains pointer to TransitionBlock + lea rcx, [rsp + __PWTB_TransitionBlock] call IL_Rethrow_Impl ; Should never return int 3 diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index f8b2bd11fb5078..4c21fdefbf1fe8 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -497,56 +497,5 @@ POP_COOP_PINVOKE_FRAME macro ; FP Callee-saved registers (xmm6-xmm15, 160 bytes) ; sp points here ; -; Stack alignment: After call (8) + callee-saved pushes (64) + alloc (256) = 328 bytes -; 328 mod 16 = 8, so RSP mod 16 = 8 after alloc - NOT 16-byte aligned. -; We need RSP to be 16-byte aligned for movaps AND unwind offsets must be multiples of 16. -; -; Solution: Use alloc_stack 272 (adds 16 bytes padding). 8 + 64 + 272 = 344, 344 mod 16 = 8. -; Wait, that's still not right. Let me recalculate: -; After call: RSP mod 16 = 8 (return addr pushed from 16-byte aligned stack) -; After 8 pushes (64 bytes): RSP mod 16 = (8 + 64) mod 16 = 72 mod 16 = 8 -; After alloc N: RSP mod 16 = (8 + N) mod 16 -; For RSP mod 16 = 0, need N mod 16 = 8 -; 256 mod 16 = 0, so RSP mod 16 = 8 (NOT aligned) -; 264 mod 16 = 8, so RSP mod 16 = 0 (aligned!) but offsets 0,16,32... work -; -; With alloc_stack 264: RSP is 16-byte aligned, XMM saves at 0, 16, 32, ... (multiples of 16) -; -; On exit, target contains the TransitionBlock pointer (CalleeSavedRegisters). 
-PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target - - PUSH_CALLEE_SAVED_REGISTERS - - ; Allocate space for: FP callee-saved (160) + float args (64) + shadow (32) + padding (8) = 264 bytes - ; This makes RSP 16-byte aligned (8 + 64 + 264 = 336, and original RSP - 336 is 16-byte aligned) - alloc_stack 264 - - ; Save argument registers to shadow space area at offset 224 - SAVE_ARGUMENT_REGISTERS 224 - - ; Save float argument registers at offset 160 - SAVE_FLOAT_ARGUMENT_REGISTERS 160 - - ; Save FP callee-saved registers (xmm6-xmm15) at offset 0 - ; RSP is 16-byte aligned, so offset 0, 16, 32, ... are all 16-byte aligned - ; AND these offsets are multiples of 16 as required by unwind codes - save_xmm128_postrsp xmm6, 0h - save_xmm128_postrsp xmm7, 10h - save_xmm128_postrsp xmm8, 20h - save_xmm128_postrsp xmm9, 30h - save_xmm128_postrsp xmm10, 40h - save_xmm128_postrsp xmm11, 50h - save_xmm128_postrsp xmm12, 60h - save_xmm128_postrsp xmm13, 70h - save_xmm128_postrsp xmm14, 80h - save_xmm128_postrsp xmm15, 90h - - END_PROLOGUE - - ; TransitionBlock pointer points to CalleeSavedRegisters at rsp + 264 - lea target, [rsp + 264] - - endm - ;; GC type flags GC_ALLOC_FINALIZE equ 1 diff --git a/src/coreclr/vm/amd64/asmhelpers.S b/src/coreclr/vm/amd64/asmhelpers.S index 38b3c87166d0be..3141735397593b 100644 --- a/src/coreclr/vm/amd64/asmhelpers.S +++ b/src/coreclr/vm/amd64/asmhelpers.S @@ -1922,9 +1922,9 @@ NESTED_END CallJittedMethodRetDoubleDouble, _TEXT // rdi = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rsi + PROLOG_WITH_TRANSITION_BLOCK // rdi already contains exception object - // rsi contains pointer to TransitionBlock + lea rsi, [rsp + __PWTB_TransitionBlock] call C_FUNC(IL_Throw_Impl) // Should never return int3 @@ -1938,9 +1938,9 @@ NESTED_END IL_Throw, _TEXT // rdi = Pointer to exception object // 
------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rsi + PROLOG_WITH_TRANSITION_BLOCK // rdi already contains exception object - // rsi contains pointer to TransitionBlock + lea rsi, [rsp + __PWTB_TransitionBlock] call C_FUNC(IL_ThrowExact_Impl) // Should never return int3 @@ -1951,8 +1951,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. // ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdi - // rdi contains pointer to TransitionBlock + PROLOG_WITH_TRANSITION_BLOCK + lea rdi, [rsp + __PWTB_TransitionBlock] call C_FUNC(IL_Rethrow_Impl) // Should never return int3 diff --git a/src/coreclr/vm/arm/asmhelpers.S b/src/coreclr/vm/arm/asmhelpers.S index 5065a6d15c4cb4..11c0da6fa9c4d6 100644 --- a/src/coreclr/vm/arm/asmhelpers.S +++ b/src/coreclr/vm/arm/asmhelpers.S @@ -906,9 +906,9 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT // r0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS r1 + PROLOG_WITH_TRANSITION_BLOCK // r0 already contains exception object - // r1 contains pointer to TransitionBlock + add r1, sp, #__PWTB_TransitionBlock bl C_FUNC(IL_Throw_Impl) // Should never return EMIT_BREAKPOINT @@ -922,9 +922,9 @@ NESTED_END IL_Throw, _TEXT // r0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS r1 + PROLOG_WITH_TRANSITION_BLOCK // r0 already contains exception object - // r1 contains pointer to TransitionBlock + add r1, sp, #__PWTB_TransitionBlock bl C_FUNC(IL_ThrowExact_Impl) // Should never return EMIT_BREAKPOINT @@ -935,8 +935,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. 
// ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS r0 - // r0 contains pointer to TransitionBlock + PROLOG_WITH_TRANSITION_BLOCK + add r0, sp, #__PWTB_TransitionBlock bl C_FUNC(IL_Rethrow_Impl) // Should never return EMIT_BREAKPOINT diff --git a/src/coreclr/vm/arm64/asmhelpers.S b/src/coreclr/vm/arm64/asmhelpers.S index 127a5c0a118245..ac3c11da0af6b4 100644 --- a/src/coreclr/vm/arm64/asmhelpers.S +++ b/src/coreclr/vm/arm64/asmhelpers.S @@ -2757,21 +2757,13 @@ NESTED_END CallJittedMethodRet4Vector128, _TEXT // Capture a transition block with register values and call the IL_Throw_Impl // implementation written in C. // -// Stack layout (from low to high address): -// sp+0: FloatArgumentRegisters (q0-q7, 128 bytes) -// sp+128: TransitionBlock start (176 bytes) -// - CalleeSavedRegisters (fp, lr, x19-x28 - 96 bytes) -// - padding (8 bytes) -// - x8 (8 bytes) -// - ArgumentRegisters (x0-x7, 64 bytes) -// // Input state: // x0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 + PROLOG_WITH_TRANSITION_BLOCK // x0 already contains exception object - // x1 contains pointer to TransitionBlock + add x1, sp, #__PWTB_TransitionBlock bl C_FUNC(IL_Throw_Impl) // Should never return brk #0 @@ -2785,9 +2777,9 @@ NESTED_END IL_Throw, _TEXT // x0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 + PROLOG_WITH_TRANSITION_BLOCK // x0 already contains exception object - // x1 contains pointer to TransitionBlock + add x1, sp, #__PWTB_TransitionBlock bl C_FUNC(IL_ThrowExact_Impl) // Should never return brk #0 @@ -2798,8 +2790,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. 
// ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x0 - // x0 contains pointer to TransitionBlock + PROLOG_WITH_TRANSITION_BLOCK + add x0, sp, #__PWTB_TransitionBlock bl C_FUNC(IL_Rethrow_Impl) // Should never return brk #0 diff --git a/src/coreclr/vm/arm64/asmhelpers.asm b/src/coreclr/vm/arm64/asmhelpers.asm index 7f510cd8ed62a5..e3ad409e31cfb8 100644 --- a/src/coreclr/vm/arm64/asmhelpers.asm +++ b/src/coreclr/vm/arm64/asmhelpers.asm @@ -2992,9 +2992,9 @@ CopyLoop ; x0 = Pointer to exception object ; ------------------------------------------------------------------ NESTED_ENTRY IL_Throw - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 + PROLOG_WITH_TRANSITION_BLOCK ; x0 already contains exception object - ; x1 contains pointer to TransitionBlock + add x1, sp, #__PWTB_TransitionBlock bl IL_Throw_Impl ; Should never return brk #0 @@ -3008,9 +3008,9 @@ CopyLoop ; x0 = Pointer to exception object ; ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 + PROLOG_WITH_TRANSITION_BLOCK ; x0 already contains exception object - ; x1 contains pointer to TransitionBlock + add x1, sp, #__PWTB_TransitionBlock bl IL_ThrowExact_Impl ; Should never return brk #0 @@ -3021,8 +3021,8 @@ CopyLoop ; implementation written in C. 
; ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x0 - ; x0 contains pointer to TransitionBlock + PROLOG_WITH_TRANSITION_BLOCK + add x0, sp, #__PWTB_TransitionBlock bl IL_Rethrow_Impl ; Should never return brk #0 diff --git a/src/coreclr/vm/arm64/asmmacros.h b/src/coreclr/vm/arm64/asmmacros.h index 93778d775f87c9..a11067633ab82e 100644 --- a/src/coreclr/vm/arm64/asmmacros.h +++ b/src/coreclr/vm/arm64/asmmacros.h @@ -204,55 +204,6 @@ OFFSETOF__ee_alloc_context EQU OFFSETOF__RuntimeThreadLocals__ee_alloc_context EPILOG_RESTORE_REG_PAIR fp, lr, #176! MEND -; Pushes a full TransitionBlock on the stack including argument registers and -; floating point argument registers. Used for exception throw helpers where we -; need to capture the complete register state including FP callee-saved registers. -; -; Stack layout (from low to high address): -; sp+0: FP callee-saved registers (d8-d15, 64 bytes) -; sp+64: FloatArgumentRegisters (q0-q7, 128 bytes) -; sp+192: TransitionBlock start (176 bytes) -; - CalleeSavedRegisters (fp, lr, x19-x28 - 96 bytes) -; - padding (8 bytes) -; - x8 (8 bytes) -; - ArgumentRegisters (x0-x7, 64 bytes) -; -; On exit, $Target contains the TransitionBlock pointer (sp+192). - MACRO - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $Target - - PROLOG_SAVE_REG_PAIR fp, lr, #-176! 
- - ; Spill callee saved registers - PROLOG_SAVE_REG_PAIR x19, x20, #16 - PROLOG_SAVE_REG_PAIR x21, x22, #32 - PROLOG_SAVE_REG_PAIR x23, x24, #48 - PROLOG_SAVE_REG_PAIR x25, x26, #64 - PROLOG_SAVE_REG_PAIR x27, x28, #80 - - ; Allocate space for FloatArgumentRegisters (128) + FP callee-saved (64) = 192 bytes - PROLOG_STACK_ALLOC 192 - - ; Save argument registers (x8, x0-x7) at offset 296 from sp (192 + 104) - SAVE_ARGUMENT_REGISTERS sp, 296 - - ; Save floating point argument registers (q0-q7) at sp+64 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 64 - - ; Save FP callee-saved registers (d8-d15) at sp+0 - str d8, [sp, #0] - str d9, [sp, #8] - str d10, [sp, #16] - str d11, [sp, #24] - str d12, [sp, #32] - str d13, [sp, #40] - str d14, [sp, #48] - str d15, [sp, #56] - - ; Set target to TransitionBlock pointer - add $Target, sp, #192 - MEND - #define GC_ALLOC_FINALIZE 1 ;----------------------------------------------------------------------------- diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index 79197977474441..bfed3c091052df 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10900,9 +10900,10 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_ContextPointers.R9 = &m_Context.R9; // Copy floating point argument registers (xmm0-xmm7) - // Use memcpy to avoid alignment issues - the source may not be 16-byte aligned - // depending on stack layout in the assembly helpers - BYTE *pFloatArgs = (BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters(); + // PROLOG_WITH_TRANSITION_BLOCK places floats at offset -136 from TransitionBlock + // (128 bytes for floats + 8 bytes padding for stack alignment) + // Use memcpy to avoid alignment issues + BYTE *pFloatArgs = (BYTE*)pTransitionBlock - 136; memcpy(&m_Context.Xmm0, pFloatArgs + 0x00, sizeof(m_Context.Xmm0)); memcpy(&m_Context.Xmm1, pFloatArgs + 0x10, sizeof(m_Context.Xmm1)); memcpy(&m_Context.Xmm2, pFloatArgs + 0x20, 
sizeof(m_Context.Xmm2)); @@ -10911,7 +10912,7 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p memcpy(&m_Context.Xmm5, pFloatArgs + 0x50, sizeof(m_Context.Xmm5)); memcpy(&m_Context.Xmm6, pFloatArgs + 0x60, sizeof(m_Context.Xmm6)); memcpy(&m_Context.Xmm7, pFloatArgs + 0x70, sizeof(m_Context.Xmm7)); - // Initialize remaining XMM registers to zero + // Initialize remaining XMM registers to zero (caller-saved on Unix) memset(&m_Context.Xmm8, 0, sizeof(m_Context.Xmm8)); memset(&m_Context.Xmm9, 0, sizeof(m_Context.Xmm9)); memset(&m_Context.Xmm10, 0, sizeof(m_Context.Xmm10)); @@ -10923,28 +10924,36 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // Initialize FP control/status m_Context.MxCsr = 0x1F80; // Default MXCSR value (all exceptions masked) #else - // On Windows AMD64, argument registers are not saved in the transition block - m_Context.Rax = 0; - m_Context.Rcx = 0; - m_Context.Rdx = 0; - m_Context.R8 = 0; - m_Context.R9 = 0; - - // Read FP callee-saved registers (xmm6-xmm15) from the stack - // They are stored at negative offsets from TransitionBlock: - // Layout: [xmm6-xmm15 (160 bytes)] [xmm0-xmm3 (64 bytes)] [shadow (32 bytes)] [padding (8 bytes)] [CalleeSavedRegs] [RetAddr] - // xmm6 is at sp+0, TransitionBlock is at sp+264, so xmm6 is at TransitionBlock - 264 - M128A *pFpCalleeSaved = (M128A*)((BYTE*)pTransitionBlock - 264); - m_Context.Xmm6 = pFpCalleeSaved[0]; - m_Context.Xmm7 = pFpCalleeSaved[1]; - m_Context.Xmm8 = pFpCalleeSaved[2]; - m_Context.Xmm9 = pFpCalleeSaved[3]; - m_Context.Xmm10 = pFpCalleeSaved[4]; - m_Context.Xmm11 = pFpCalleeSaved[5]; - m_Context.Xmm12 = pFpCalleeSaved[6]; - m_Context.Xmm13 = pFpCalleeSaved[7]; - m_Context.Xmm14 = pFpCalleeSaved[8]; - m_Context.Xmm15 = pFpCalleeSaved[9]; + // On Windows AMD64, PROLOG_WITH_TRANSITION_BLOCK saves xmm0-xmm3 (float argument registers) + // but not xmm6-xmm15 (callee-saved). 
The callee-saved FP registers are preserved by the + // normal calling convention - since we don't modify them, the unwinder will find them. + // + // PROLOG_WITH_TRANSITION_BLOCK layout (with extraLocals=0): + // __PWTB_FloatArgumentRegisters = 32 (SIZEOF_MAX_OUTGOING_ARGUMENT_HOMES) + // __PWTB_StackAlloc = 32 + 64 + 8 = 104 + // __PWTB_TransitionBlock = 104 + // Float offset from TransitionBlock = 32 - 104 = -72 + BYTE *pFloatArgs = (BYTE*)pTransitionBlock - 72; + memcpy(&m_Context.Xmm0, pFloatArgs + 0x00, sizeof(m_Context.Xmm0)); + memcpy(&m_Context.Xmm1, pFloatArgs + 0x10, sizeof(m_Context.Xmm1)); + memcpy(&m_Context.Xmm2, pFloatArgs + 0x20, sizeof(m_Context.Xmm2)); + memcpy(&m_Context.Xmm3, pFloatArgs + 0x30, sizeof(m_Context.Xmm3)); + + // xmm4-xmm5 are volatile but not saved (zero them) + memset(&m_Context.Xmm4, 0, sizeof(m_Context.Xmm4)); + memset(&m_Context.Xmm5, 0, sizeof(m_Context.Xmm5)); + + // xmm6-xmm15 are callee-saved - zero them here; the unwinder will restore actual values + memset(&m_Context.Xmm6, 0, sizeof(m_Context.Xmm6)); + memset(&m_Context.Xmm7, 0, sizeof(m_Context.Xmm7)); + memset(&m_Context.Xmm8, 0, sizeof(m_Context.Xmm8)); + memset(&m_Context.Xmm9, 0, sizeof(m_Context.Xmm9)); + memset(&m_Context.Xmm10, 0, sizeof(m_Context.Xmm10)); + memset(&m_Context.Xmm11, 0, sizeof(m_Context.Xmm11)); + memset(&m_Context.Xmm12, 0, sizeof(m_Context.Xmm12)); + memset(&m_Context.Xmm13, 0, sizeof(m_Context.Xmm13)); + memset(&m_Context.Xmm14, 0, sizeof(m_Context.Xmm14)); + memset(&m_Context.Xmm15, 0, sizeof(m_Context.Xmm15)); // Initialize FP control/status m_Context.MxCsr = 0x1F80; // Default MXCSR value (all exceptions masked) @@ -10987,20 +10996,18 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.Lr = pTransitionBlock->m_calleeSavedRegisters.r14; // r14 is link register // Copy floating point argument registers (d0-d7 / s0-s15) + // PROLOG_WITH_TRANSITION_BLOCK saves these at GetOffsetOfFloatArgumentRegisters() 
FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); for (int i = 0; i < 8; i++) { m_Context.D[i] = pFloatArgs->d[i]; } - // Read FP callee-saved registers (d8-d15) from the stack - // They are stored at negative offset from TransitionBlock: - // Layout: [d8-d15 (64 bytes)] [padding (4)] [d0-d7 (64 bytes)] [padding (4)] [TransitionBlock] - // FP callee-saved are at TransitionBlock - 136 (64 + 4 + 64 + 4) - UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 136); - for (int i = 0; i < 8; i++) + // FP callee-saved registers (d8-d15) are not saved by PROLOG_WITH_TRANSITION_BLOCK + // Zero them; the unwinder will restore actual values during stack walk + for (int i = 8; i < 16; i++) { - m_Context.D[8 + i] = pFpCalleeSaved[i]; + m_Context.D[i] = 0; } // Initialize remaining D registers (D16-D31) to zero - these are caller-saved @@ -11061,33 +11068,20 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.Lr = pTransitionBlock->m_calleeSavedRegisters.x30; // Copy floating point argument registers (V0-V7) + // PROLOG_WITH_TRANSITION_BLOCK saves these at GetOffsetOfFloatArgumentRegisters() FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); for (int i = 0; i < 8; i++) { m_Context.V[i] = pFloatArgs->q[i]; } - // Read FP callee-saved registers (d8-d15) from the stack - // They are stored at negative offset from TransitionBlock: - // Layout: [d8-d15 (64 bytes)] [q0-q7 (128 bytes)] [TransitionBlock] - // FP callee-saved are at TransitionBlock - 192 (64 + 128) - UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 192); - m_Context.V[8].Low = pFpCalleeSaved[0]; - m_Context.V[8].High = 0; - m_Context.V[9].Low = pFpCalleeSaved[1]; - m_Context.V[9].High = 0; - m_Context.V[10].Low = pFpCalleeSaved[2]; - m_Context.V[10].High = 0; - 
m_Context.V[11].Low = pFpCalleeSaved[3]; - m_Context.V[11].High = 0; - m_Context.V[12].Low = pFpCalleeSaved[4]; - m_Context.V[12].High = 0; - m_Context.V[13].Low = pFpCalleeSaved[5]; - m_Context.V[13].High = 0; - m_Context.V[14].Low = pFpCalleeSaved[6]; - m_Context.V[14].High = 0; - m_Context.V[15].Low = pFpCalleeSaved[7]; - m_Context.V[15].High = 0; + // FP callee-saved registers (d8-d15 / V8-V15) are not saved by PROLOG_WITH_TRANSITION_BLOCK + // Zero them; the unwinder will restore actual values during stack walk + for (int i = 8; i < 16; i++) + { + m_Context.V[i].Low = 0; + m_Context.V[i].High = 0; + } // Initialize remaining V registers (V16-V31) to zero - these are caller-saved for (int i = 16; i < 32; i++) @@ -11153,21 +11147,18 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // F[] array in CONTEXT is 4*32 elements for LSX/LASX support. // Each FP register takes 4 slots (for 256-bit LASX vectors). // For 64-bit doubles, we only use the first slot of each register. 
+ // PROLOG_WITH_TRANSITION_BLOCK saves these at GetOffsetOfFloatArgumentRegisters() FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); for (int i = 0; i < 8; i++) { memcpy(&m_Context.F[i * 4], &pFloatArgs->f[i], sizeof(double)); } - // Read FP callee-saved registers (f24-f31) from the stack - // They are stored at negative offset from TransitionBlock: - // Layout: [f24-f31 (64 bytes)] [fa0-fa7 (64 bytes)] [TransitionBlock] - // FP callee-saved are at TransitionBlock - 128 (64 + 64) - UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 128); - for (int i = 0; i < 8; i++) + // FP callee-saved registers (f24-f31) are not saved by PROLOG_WITH_TRANSITION_BLOCK + // Zero them; the unwinder will restore actual values during stack walk + for (int i = 24; i < 32; i++) { - // f24-f31 map to indices 24-31 in the F array, each taking 4 slots - memcpy(&m_Context.F[(24 + i) * 4], &pFpCalleeSaved[i], sizeof(double)); + memset(&m_Context.F[i * 4], 0, sizeof(double) * 4); } // Initialize remaining F registers (f8-f23) to zero @@ -11233,7 +11224,9 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // Initialize all F registers to zero first memset(m_Context.F, 0, sizeof(m_Context.F)); + // Copy floating point argument registers (fa0-fa7) + // PROLOG_WITH_TRANSITION_BLOCK saves these at GetOffsetOfFloatArgumentRegisters() FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); for (int i = 0; i < 8; i++) { @@ -11241,18 +11234,8 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p memcpy(&m_Context.F[10 + i], &pFloatArgs->f[i], sizeof(double)); } - // Read FP callee-saved registers (fs0-fs11) from the stack - // They are stored at negative offset from TransitionBlock: - // Layout: [fs0-fs11 (96 bytes)] [fa0-fa7 (64 bytes)] 
[TransitionBlock] - // FP callee-saved are at TransitionBlock - 160 (96 + 64) - // RISC-V FP callee-saved: fs0=f8, fs1=f9, fs2-fs11=f18-f27 - UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 160); - memcpy(&m_Context.F[8], &pFpCalleeSaved[0], sizeof(double)); // fs0 = f8 - memcpy(&m_Context.F[9], &pFpCalleeSaved[1], sizeof(double)); // fs1 = f9 - for (int i = 0; i < 10; i++) - { - memcpy(&m_Context.F[18 + i], &pFpCalleeSaved[2 + i], sizeof(double)); // fs2-fs11 = f18-f27 - } + // FP callee-saved registers (fs0-fs11) are not saved by PROLOG_WITH_TRANSITION_BLOCK + // They remain zeroed; the unwinder will restore actual values during stack walk // Initialize FP control/status register m_Context.Fcsr = 0; diff --git a/src/coreclr/vm/loongarch64/asmhelpers.S b/src/coreclr/vm/loongarch64/asmhelpers.S index 9f424c39dd30f5..701594918f40f3 100644 --- a/src/coreclr/vm/loongarch64/asmhelpers.S +++ b/src/coreclr/vm/loongarch64/asmhelpers.S @@ -1029,9 +1029,9 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT // $a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $a1 + PROLOG_WITH_TRANSITION_BLOCK // $a0 already contains exception object - // $a1 contains pointer to TransitionBlock + addi.d $a1, $sp, __PWTB_TransitionBlock bl C_FUNC(IL_Throw_Impl) // Should never return break 0 @@ -1045,9 +1045,9 @@ NESTED_END IL_Throw, _TEXT // $a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $a1 + PROLOG_WITH_TRANSITION_BLOCK // $a0 already contains exception object - // $a1 contains pointer to TransitionBlock + addi.d $a1, $sp, __PWTB_TransitionBlock bl C_FUNC(IL_ThrowExact_Impl) // Should never return break 0 @@ -1058,8 +1058,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. 
// ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $a0 - // $a0 contains pointer to TransitionBlock + PROLOG_WITH_TRANSITION_BLOCK + addi.d $a0, $sp, __PWTB_TransitionBlock bl C_FUNC(IL_Rethrow_Impl) // Should never return break 0 diff --git a/src/coreclr/vm/riscv64/asmhelpers.S b/src/coreclr/vm/riscv64/asmhelpers.S index 0d26ef514d4f58..e34a96e141924d 100644 --- a/src/coreclr/vm/riscv64/asmhelpers.S +++ b/src/coreclr/vm/riscv64/asmhelpers.S @@ -886,9 +886,9 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT // a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS a1 + PROLOG_WITH_TRANSITION_BLOCK // a0 already contains exception object - // a1 contains pointer to TransitionBlock + addi a1, sp, __PWTB_TransitionBlock call C_FUNC(IL_Throw_Impl) // Should never return ebreak @@ -902,9 +902,9 @@ NESTED_END IL_Throw, _TEXT // a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS a1 + PROLOG_WITH_TRANSITION_BLOCK // a0 already contains exception object - // a1 contains pointer to TransitionBlock + addi a1, sp, __PWTB_TransitionBlock call C_FUNC(IL_ThrowExact_Impl) // Should never return ebreak @@ -915,8 +915,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. 
// ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS a0 - // a0 contains pointer to TransitionBlock + PROLOG_WITH_TRANSITION_BLOCK + addi a0, sp, __PWTB_TransitionBlock call C_FUNC(IL_Rethrow_Impl) // Should never return ebreak From b5e6e6ecb6e678af0f7bdd5dd36ca587caefecd8 Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Mon, 12 Jan 2026 22:21:33 +0200 Subject: [PATCH 16/30] Revert "Switch to reusing PROLOG_WITH_TRANSITION_BLOCK" This reverts commit af084b81ab4b19f3dab301d1ba44b1c47554ed07. --- src/coreclr/pal/inc/unixasmmacrosamd64.inc | 81 +++++++++++ src/coreclr/pal/inc/unixasmmacrosarm.inc | 28 ++++ src/coreclr/pal/inc/unixasmmacrosarm64.inc | 41 ++++++ .../pal/inc/unixasmmacrosloongarch64.inc | 36 +++++ src/coreclr/pal/inc/unixasmmacrosriscv64.inc | 42 ++++++ src/coreclr/vm/amd64/AsmHelpers.asm | 12 +- src/coreclr/vm/amd64/AsmMacros.inc | 51 +++++++ src/coreclr/vm/amd64/asmhelpers.S | 12 +- src/coreclr/vm/arm/asmhelpers.S | 12 +- src/coreclr/vm/arm64/asmhelpers.S | 20 ++- src/coreclr/vm/arm64/asmhelpers.asm | 12 +- src/coreclr/vm/arm64/asmmacros.h | 49 +++++++ src/coreclr/vm/excep.cpp | 131 ++++++++++-------- src/coreclr/vm/loongarch64/asmhelpers.S | 12 +- src/coreclr/vm/riscv64/asmhelpers.S | 12 +- 15 files changed, 452 insertions(+), 99 deletions(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index 90c8947e754297..90e53a00d87932 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -147,6 +147,12 @@ C_FUNC(\Name\()_End): // the xmm registers are not supported by the libunwind .endm +// Unaligned version for use when stack alignment cannot be guaranteed +.macro save_xmm128_postrsp_unaligned Reg, Offset + __Offset = \Offset + movdqu xmmword ptr [rsp + __Offset], \Reg +.endm + .macro restore_xmm128 Reg, ofs __Offset = \ofs 
movdqa \Reg, xmmword ptr [rsp + __Offset] @@ -246,6 +252,20 @@ C_FUNC(\Name\()_End): .endm +// Unaligned version for cases where 16-byte stack alignment cannot be guaranteed +.macro SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED ofs + + save_xmm128_postrsp_unaligned xmm0, \ofs + save_xmm128_postrsp_unaligned xmm1, \ofs + 0x10 + save_xmm128_postrsp_unaligned xmm2, \ofs + 0x20 + save_xmm128_postrsp_unaligned xmm3, \ofs + 0x30 + save_xmm128_postrsp_unaligned xmm4, \ofs + 0x40 + save_xmm128_postrsp_unaligned xmm5, \ofs + 0x50 + save_xmm128_postrsp_unaligned xmm6, \ofs + 0x60 + save_xmm128_postrsp_unaligned xmm7, \ofs + 0x70 + +.endm + .macro RESTORE_FLOAT_ARGUMENT_REGISTERS ofs restore_xmm128 xmm0, \ofs @@ -428,6 +448,67 @@ C_FUNC(\Name\()_End): POP_CALLEE_SAVED_REGISTERS .endm +// Pushes a full TransitionBlock on the stack including argument registers and +// floating point argument registers. Used for exception throw helpers where we +// need to capture the complete register state. +// +// Stack layout (from high to low address after prologue): +// Return address (8 bytes) +// CalleeSavedRegisters (rbp, rbx, r15, r14, r13, r12 - 48 bytes) +// ArgumentRegisters (r9, r8, rcx, rdx, rsi, rdi - 48 bytes) <- TransitionBlock pointer +// FloatArgumentRegisters (xmm0-xmm7, 128 bytes) at rsp+8 +// 8-byte alignment padding at rsp+0 +// sp points here +// +// Stack alignment calculation: +// Before call to IL_Throw: rsp is 16-byte aligned (ABI requirement before call) +// After call (return addr pushed): rsp % 16 = 8 +// After PUSH_CALLEE_SAVED_REGISTERS (48 bytes): rsp % 16 = 8 +// After PUSH_ARGUMENT_REGISTERS (48 bytes): rsp % 16 = 8 +// After alloc_stack 136: rsp % 16 = (8 - 136 % 16) = (8 - 8) = 0 <- aligned! +// +// Stack layout for IL_Throw helpers using TransitionBlock with float registers. 
+// +// Stack alignment calculation: +// Before call to IL_Throw: rsp is 16-byte aligned (ABI requirement before call) +// After call (return addr pushed): rsp % 16 = 8 +// After PUSH_CALLEE_SAVED_REGISTERS (48 bytes): rsp % 16 = 8 +// After PUSH_ARGUMENT_REGISTERS (48 bytes): rsp % 16 = 8 +// After alloc_stack 136: rsp % 16 = (8 - 136 % 16) = (8 - 8) = 0 <- aligned! +// +// Stack layout (low to high addresses): +// rsp+0: 8 bytes padding (for alignment) +// rsp+8: FloatArgumentRegisters (xmm0-xmm7, 128 bytes) +// rsp+136: TransitionBlock start +// - ArgumentRegisters (rdi, rsi, rdx, rcx, r8, r9: 48 bytes) +// - CalleeSavedRegisters (r12, r13, r14, r15, rbx, rbp: 48 bytes) +// - Return address (8 bytes) +// +// TransitionBlock at rsp+136, floats at rsp+8 = TransitionBlock - 128 +// (matches GetOffsetOfFloatArgumentRegisters which returns -128) +// +// NOTE: We use SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED because rsp+8 is not +// 16-byte aligned (rsp is aligned, rsp+8 is not). +// +// On exit, \target contains the TransitionBlock pointer. 
+.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target + set_cfa_register rsp, 8 + + PUSH_CALLEE_SAVED_REGISTERS + PUSH_ARGUMENT_REGISTERS + + // Allocate 128 bytes for floats + 8 bytes padding = 136 bytes + alloc_stack 136 + // Save float argument registers at offset 8 (TransitionBlock - 128) + // Using unaligned stores because rsp+8 is not 16-byte aligned + SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED 8 + + END_PROLOGUE + + // TransitionBlock starts at rsp+136 (where ArgumentRegisters are) + lea \target, [rsp + 136] +.endm + .macro INLINE_GETTHREAD // Inlined version of call C_FUNC(RhpGetThread) INLINE_GET_TLS_VAR t_CurrentThreadInfo diff --git a/src/coreclr/pal/inc/unixasmmacrosarm.inc b/src/coreclr/pal/inc/unixasmmacrosarm.inc index 739886b0bc014e..4ea9aa35236eb8 100644 --- a/src/coreclr/pal/inc/unixasmmacrosarm.inc +++ b/src/coreclr/pal/inc/unixasmmacrosarm.inc @@ -297,6 +297,34 @@ C_FUNC(\Name): // sp+68: padding (4 bytes) - to make d0-d7 8-byte aligned at TransitionBlock-68 // sp+72: d0-d7 (64 bytes) - float argument registers (at TransitionBlock - 68) // sp+136: padding (4 bytes) - to keep total allocation 8-byte aligned +// sp+140: TransitionBlock starts here (CalleeSavedRegisters + ArgumentRegisters pushed above) +// +// GetNegSpaceSize() for ARM32 = 64 (FloatArgumentRegisters) + 4 (padding) = 68 +// GetOffsetOfFloatArgumentRegisters() = -68 +// +// Total stack alloc: 4 + 64 + 4 + 64 + 4 = 140 bytes +// Stack: Arguments(16) + callee-saved(36) + alloc(140) = 192 bytes +// 192 % 8 = 0, properly aligned for ARM32 (8-byte alignment required) +.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target + // Push argument registers (r0-r3) - these will be at highest address in TransitionBlock + PUSH_ARGUMENT_REGISTERS + PUSH_CALLEE_SAVED_REGISTERS + PROLOG_STACK_SAVE_OFFSET r7, #12 + // let r7 point the saved r7 in the stack (clang FP style) + // Allocate space for: padding (4) + d8-d15 (64) + padding (4) + d0-d7 (64) + padding (4) = 140 bytes + alloc_stack 140 + // Save floating 
point argument registers (d0-d7) at sp+72 (TransitionBlock - 68) + add r12, sp, #72 + vstm r12, {d0-d7} + // Save FP callee-saved registers (d8-d15) at sp+4 + add r12, sp, #4 + vstm r12, {d8-d15} + CHECK_STACK_ALIGNMENT + END_PROLOGUE + // TransitionBlock is at sp + 140 + add \target, sp, #140 +.endm + .macro POP_COOP_PINVOKE_FRAME free_stack 4 POP_CALLEE_SAVED_REGISTERS diff --git a/src/coreclr/pal/inc/unixasmmacrosarm64.inc b/src/coreclr/pal/inc/unixasmmacrosarm64.inc index 1c5ff88277c6e6..2d27459372b561 100644 --- a/src/coreclr/pal/inc/unixasmmacrosarm64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosarm64.inc @@ -427,6 +427,47 @@ C_FUNC(\Name\()_End): // // Stack layout (from low to high address): // sp+0: FP callee-saved registers (d8-d15, 64 bytes) +// sp+64: FloatArgumentRegisters (q0-q7, 128 bytes) +// sp+192: TransitionBlock start (176 bytes) +// - CalleeSavedRegisters (fp, lr, x19-x28 - 96 bytes) +// - padding (8 bytes) +// - x8 (8 bytes) +// - ArgumentRegisters (x0-x7, 64 bytes) +// +// On exit, \target contains the TransitionBlock pointer (sp+192). 
+.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -176 + + // Spill callee saved registers + PROLOG_SAVE_REG_PAIR x19, x20, 16 + PROLOG_SAVE_REG_PAIR x21, x22, 32 + PROLOG_SAVE_REG_PAIR x23, x24, 48 + PROLOG_SAVE_REG_PAIR x25, x26, 64 + PROLOG_SAVE_REG_PAIR x27, x28, 80 + + // Allocate space for FloatArgumentRegisters (128) + FP callee-saved (64) = 192 bytes + PROLOG_STACK_ALLOC 192 + + // Save argument registers (x8, x0-x7) at offset 296 from sp (192 + 104) + SAVE_ARGUMENT_REGISTERS sp, 296 + + // Save floating point argument registers (q0-q7) at sp+64 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 64 + + // Save FP callee-saved registers (d8-d15) at sp+0 + str d8, [sp, #0] + str d9, [sp, #8] + str d10, [sp, #16] + str d11, [sp, #24] + str d12, [sp, #32] + str d13, [sp, #40] + str d14, [sp, #48] + str d15, [sp, #56] + + // Set target to TransitionBlock pointer + add \target, sp, #192 +.endm + // ------------------------------------------------------------------ // Macro to generate Redirection Stubs // diff --git a/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc b/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc index bbfc4db9bf567a..92d701598f933e 100644 --- a/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosloongarch64.inc @@ -407,6 +407,42 @@ C_FUNC(\Name\()_End): // Pushes a full TransitionBlock on the stack including argument registers and // floating point argument registers. Used for exception throw helpers where we // need to capture the complete register state. +// +// Stack layout (from low to high address): +// sp+0: FloatArgumentRegisters (fa0-fa7, 64 bytes) +// sp+64: TransitionBlock start +// - CalleeSavedRegisters (fp, ra, s0-s8 - 96 bytes) +// - ArgumentRegisters (a0-a7, 64 bytes) +// +// On exit, \target contains the TransitionBlock pointer (sp+128). 
+.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target + // Stack: FPCalleeSaved(64) + FloatArgs(64) + CalleeSaved(96) + Args(64) = 288 bytes + PROLOG_STACK_ALLOC 288 + PROLOG_SAVE_REG_PAIR 22, 1, 128, 1 + + // Save callee-saved registers at offset 128 (after FP callee-saved and FloatArgumentRegisters) + SAVE_CALLEESAVED_REGISTERS $sp, 128 + + // Save argument registers (a0-a7) at offset 224 + SAVE_ARGUMENT_REGISTERS $sp, 224 + + // Save floating-point argument registers (fa0-fa7) at offset 64 + SAVE_FLOAT_ARGUMENT_REGISTERS $sp, 64 + + // Save FP callee-saved registers (f24-f31) at offset 0 + fst.d $f24, $sp, 0 + fst.d $f25, $sp, 8 + fst.d $f26, $sp, 16 + fst.d $f27, $sp, 24 + fst.d $f28, $sp, 32 + fst.d $f29, $sp, 40 + fst.d $f30, $sp, 48 + fst.d $f31, $sp, 56 + + // Set target to TransitionBlock pointer + addi.d \target, $sp, 128 +.endm + // ------------------------------------------------------------------ // Macro to generate Redirection Stubs // diff --git a/src/coreclr/pal/inc/unixasmmacrosriscv64.inc b/src/coreclr/pal/inc/unixasmmacrosriscv64.inc index 4b80836486bf78..d244756c304eb9 100644 --- a/src/coreclr/pal/inc/unixasmmacrosriscv64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosriscv64.inc @@ -352,6 +352,48 @@ C_FUNC(\Name): // Pushes a full TransitionBlock on the stack including argument registers and // floating point argument registers. Used for exception throw helpers where we // need to capture the complete register state. +// +// Stack layout (from low to high address): +// sp+0: FloatArgumentRegisters (fa0-fa7, 64 bytes) +// sp+64: TransitionBlock start +// - CalleeSavedRegisters (fp, ra, s1-s11, tp, gp - 120 bytes) +// - padding (8 bytes) +// - ArgumentRegisters (a0-a7, 64 bytes) +// +// On exit, \target contains the TransitionBlock pointer (sp+160). 
+.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target + // Stack: FPCalleeSaved(96) + FloatArgs(64) + CalleeSaved(120) + pad(8) + Args(64) = 352 bytes + PROLOG_STACK_ALLOC 352 + PROLOG_SAVE_REG_PAIR fp, ra, 160, 1 + + // Save callee-saved registers at offset 160 (after FP callee-saved and FloatArgumentRegisters) + SAVE_CALLEESAVED_REGISTERS sp, 160 + + // Save argument registers (a0-a7) at offset 288 + SAVE_ARGUMENT_REGISTERS sp, 288 + + // Save floating-point argument registers (fa0-fa7) at offset 96 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 96 + + // Save FP callee-saved registers (fs0-fs11 = f8,f9,f18-f27) at offset 0 + // RISC-V FP callee-saved: fs0=f8, fs1=f9, fs2-fs11=f18-f27 + fsd fs0, 0(sp) // f8 + fsd fs1, 8(sp) // f9 + fsd fs2, 16(sp) // f18 + fsd fs3, 24(sp) // f19 + fsd fs4, 32(sp) // f20 + fsd fs5, 40(sp) // f21 + fsd fs6, 48(sp) // f22 + fsd fs7, 56(sp) // f23 + fsd fs8, 64(sp) // f24 + fsd fs9, 72(sp) // f25 + fsd fs10, 80(sp) // f26 + fsd fs11, 88(sp) // f27 + + // Set target to TransitionBlock pointer + addi \target, sp, 160 +.endm + // ------------------------------------------------------------------ // Macro to generate Redirection Stubs // diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index b0fdc62f60ee02..4725a0cf8eefbb 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -1211,10 +1211,10 @@ endif ; FEATURE_INTERPRETER ; RCX = Pointer to exception object ;========================================================================== NESTED_ENTRY IL_Throw, _TEXT - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdx ; RCX already contains exception object - lea rdx, [rsp + __PWTB_TransitionBlock] + ; RDX contains pointer to TransitionBlock call IL_Throw_Impl ; Should never return int 3 @@ -1228,10 +1228,10 @@ NESTED_END IL_Throw, _TEXT ; RCX = Pointer to exception object ;========================================================================== NESTED_ENTRY 
IL_ThrowExact, _TEXT - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdx ; RCX already contains exception object - lea rdx, [rsp + __PWTB_TransitionBlock] + ; RDX contains pointer to TransitionBlock call IL_ThrowExact_Impl ; Should never return int 3 @@ -1242,9 +1242,9 @@ NESTED_END IL_ThrowExact, _TEXT ; implementation written in C. ;========================================================================== NESTED_ENTRY IL_Rethrow, _TEXT - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rcx - lea rcx, [rsp + __PWTB_TransitionBlock] + ; RCX contains pointer to TransitionBlock call IL_Rethrow_Impl ; Should never return int 3 diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index 4c21fdefbf1fe8..f8b2bd11fb5078 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -497,5 +497,56 @@ POP_COOP_PINVOKE_FRAME macro ; FP Callee-saved registers (xmm6-xmm15, 160 bytes) ; sp points here ; +; Stack alignment: After call (8) + callee-saved pushes (64) + alloc (256) = 328 bytes +; 328 mod 16 = 8, so RSP mod 16 = 8 after alloc - NOT 16-byte aligned. +; We need RSP to be 16-byte aligned for movaps AND unwind offsets must be multiples of 16. +; +; Solution: Use alloc_stack 272 (adds 16 bytes padding). 8 + 64 + 272 = 344, 344 mod 16 = 8. +; Wait, that's still not right. Let me recalculate: +; After call: RSP mod 16 = 8 (return addr pushed from 16-byte aligned stack) +; After 8 pushes (64 bytes): RSP mod 16 = (8 + 64) mod 16 = 72 mod 16 = 8 +; After alloc N: RSP mod 16 = (8 + N) mod 16 +; For RSP mod 16 = 0, need N mod 16 = 8 +; 256 mod 16 = 0, so RSP mod 16 = 8 (NOT aligned) +; 264 mod 16 = 8, so RSP mod 16 = 0 (aligned!) but offsets 0,16,32... work +; +; With alloc_stack 264: RSP is 16-byte aligned, XMM saves at 0, 16, 32, ... (multiples of 16) +; +; On exit, target contains the TransitionBlock pointer (CalleeSavedRegisters). 
+PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target + + PUSH_CALLEE_SAVED_REGISTERS + + ; Allocate space for: FP callee-saved (160) + float args (64) + shadow (32) + padding (8) = 264 bytes + ; This makes RSP 16-byte aligned (8 + 64 + 264 = 336, and original RSP - 336 is 16-byte aligned) + alloc_stack 264 + + ; Save argument registers to shadow space area at offset 224 + SAVE_ARGUMENT_REGISTERS 224 + + ; Save float argument registers at offset 160 + SAVE_FLOAT_ARGUMENT_REGISTERS 160 + + ; Save FP callee-saved registers (xmm6-xmm15) at offset 0 + ; RSP is 16-byte aligned, so offset 0, 16, 32, ... are all 16-byte aligned + ; AND these offsets are multiples of 16 as required by unwind codes + save_xmm128_postrsp xmm6, 0h + save_xmm128_postrsp xmm7, 10h + save_xmm128_postrsp xmm8, 20h + save_xmm128_postrsp xmm9, 30h + save_xmm128_postrsp xmm10, 40h + save_xmm128_postrsp xmm11, 50h + save_xmm128_postrsp xmm12, 60h + save_xmm128_postrsp xmm13, 70h + save_xmm128_postrsp xmm14, 80h + save_xmm128_postrsp xmm15, 90h + + END_PROLOGUE + + ; TransitionBlock pointer points to CalleeSavedRegisters at rsp + 264 + lea target, [rsp + 264] + + endm + ;; GC type flags GC_ALLOC_FINALIZE equ 1 diff --git a/src/coreclr/vm/amd64/asmhelpers.S b/src/coreclr/vm/amd64/asmhelpers.S index 3141735397593b..38b3c87166d0be 100644 --- a/src/coreclr/vm/amd64/asmhelpers.S +++ b/src/coreclr/vm/amd64/asmhelpers.S @@ -1922,9 +1922,9 @@ NESTED_END CallJittedMethodRetDoubleDouble, _TEXT // rdi = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rsi // rdi already contains exception object - lea rsi, [rsp + __PWTB_TransitionBlock] + // rsi contains pointer to TransitionBlock call C_FUNC(IL_Throw_Impl) // Should never return int3 @@ -1938,9 +1938,9 @@ NESTED_END IL_Throw, _TEXT // rdi = Pointer to exception object // 
------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rsi // rdi already contains exception object - lea rsi, [rsp + __PWTB_TransitionBlock] + // rsi contains pointer to TransitionBlock call C_FUNC(IL_ThrowExact_Impl) // Should never return int3 @@ -1951,8 +1951,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. // ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK - lea rdi, [rsp + __PWTB_TransitionBlock] + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdi + // rdi contains pointer to TransitionBlock call C_FUNC(IL_Rethrow_Impl) // Should never return int3 diff --git a/src/coreclr/vm/arm/asmhelpers.S b/src/coreclr/vm/arm/asmhelpers.S index 11c0da6fa9c4d6..5065a6d15c4cb4 100644 --- a/src/coreclr/vm/arm/asmhelpers.S +++ b/src/coreclr/vm/arm/asmhelpers.S @@ -906,9 +906,9 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT // r0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS r1 // r0 already contains exception object - add r1, sp, #__PWTB_TransitionBlock + // r1 contains pointer to TransitionBlock bl C_FUNC(IL_Throw_Impl) // Should never return EMIT_BREAKPOINT @@ -922,9 +922,9 @@ NESTED_END IL_Throw, _TEXT // r0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS r1 // r0 already contains exception object - add r1, sp, #__PWTB_TransitionBlock + // r1 contains pointer to TransitionBlock bl C_FUNC(IL_ThrowExact_Impl) // Should never return EMIT_BREAKPOINT @@ -935,8 +935,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. 
// ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK - add r0, sp, #__PWTB_TransitionBlock + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS r0 + // r0 contains pointer to TransitionBlock bl C_FUNC(IL_Rethrow_Impl) // Should never return EMIT_BREAKPOINT diff --git a/src/coreclr/vm/arm64/asmhelpers.S b/src/coreclr/vm/arm64/asmhelpers.S index ac3c11da0af6b4..127a5c0a118245 100644 --- a/src/coreclr/vm/arm64/asmhelpers.S +++ b/src/coreclr/vm/arm64/asmhelpers.S @@ -2757,13 +2757,21 @@ NESTED_END CallJittedMethodRet4Vector128, _TEXT // Capture a transition block with register values and call the IL_Throw_Impl // implementation written in C. // +// Stack layout (from low to high address): +// sp+0: FloatArgumentRegisters (q0-q7, 128 bytes) +// sp+128: TransitionBlock start (176 bytes) +// - CalleeSavedRegisters (fp, lr, x19-x28 - 96 bytes) +// - padding (8 bytes) +// - x8 (8 bytes) +// - ArgumentRegisters (x0-x7, 64 bytes) +// // Input state: // x0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 // x0 already contains exception object - add x1, sp, #__PWTB_TransitionBlock + // x1 contains pointer to TransitionBlock bl C_FUNC(IL_Throw_Impl) // Should never return brk #0 @@ -2777,9 +2785,9 @@ NESTED_END IL_Throw, _TEXT // x0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 // x0 already contains exception object - add x1, sp, #__PWTB_TransitionBlock + // x1 contains pointer to TransitionBlock bl C_FUNC(IL_ThrowExact_Impl) // Should never return brk #0 @@ -2790,8 +2798,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. 
// ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK - add x0, sp, #__PWTB_TransitionBlock + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x0 + // x0 contains pointer to TransitionBlock bl C_FUNC(IL_Rethrow_Impl) // Should never return brk #0 diff --git a/src/coreclr/vm/arm64/asmhelpers.asm b/src/coreclr/vm/arm64/asmhelpers.asm index e3ad409e31cfb8..7f510cd8ed62a5 100644 --- a/src/coreclr/vm/arm64/asmhelpers.asm +++ b/src/coreclr/vm/arm64/asmhelpers.asm @@ -2992,9 +2992,9 @@ CopyLoop ; x0 = Pointer to exception object ; ------------------------------------------------------------------ NESTED_ENTRY IL_Throw - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 ; x0 already contains exception object - add x1, sp, #__PWTB_TransitionBlock + ; x1 contains pointer to TransitionBlock bl IL_Throw_Impl ; Should never return brk #0 @@ -3008,9 +3008,9 @@ CopyLoop ; x0 = Pointer to exception object ; ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x1 ; x0 already contains exception object - add x1, sp, #__PWTB_TransitionBlock + ; x1 contains pointer to TransitionBlock bl IL_ThrowExact_Impl ; Should never return brk #0 @@ -3021,8 +3021,8 @@ CopyLoop ; implementation written in C. 
; ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow - PROLOG_WITH_TRANSITION_BLOCK - add x0, sp, #__PWTB_TransitionBlock + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS x0 + ; x0 contains pointer to TransitionBlock bl IL_Rethrow_Impl ; Should never return brk #0 diff --git a/src/coreclr/vm/arm64/asmmacros.h b/src/coreclr/vm/arm64/asmmacros.h index a11067633ab82e..93778d775f87c9 100644 --- a/src/coreclr/vm/arm64/asmmacros.h +++ b/src/coreclr/vm/arm64/asmmacros.h @@ -204,6 +204,55 @@ OFFSETOF__ee_alloc_context EQU OFFSETOF__RuntimeThreadLocals__ee_alloc_context EPILOG_RESTORE_REG_PAIR fp, lr, #176! MEND +; Pushes a full TransitionBlock on the stack including argument registers and +; floating point argument registers. Used for exception throw helpers where we +; need to capture the complete register state including FP callee-saved registers. +; +; Stack layout (from low to high address): +; sp+0: FP callee-saved registers (d8-d15, 64 bytes) +; sp+64: FloatArgumentRegisters (q0-q7, 128 bytes) +; sp+192: TransitionBlock start (176 bytes) +; - CalleeSavedRegisters (fp, lr, x19-x28 - 96 bytes) +; - padding (8 bytes) +; - x8 (8 bytes) +; - ArgumentRegisters (x0-x7, 64 bytes) +; +; On exit, $Target contains the TransitionBlock pointer (sp+192). + MACRO + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $Target + + PROLOG_SAVE_REG_PAIR fp, lr, #-176! 
+ + ; Spill callee saved registers + PROLOG_SAVE_REG_PAIR x19, x20, #16 + PROLOG_SAVE_REG_PAIR x21, x22, #32 + PROLOG_SAVE_REG_PAIR x23, x24, #48 + PROLOG_SAVE_REG_PAIR x25, x26, #64 + PROLOG_SAVE_REG_PAIR x27, x28, #80 + + ; Allocate space for FloatArgumentRegisters (128) + FP callee-saved (64) = 192 bytes + PROLOG_STACK_ALLOC 192 + + ; Save argument registers (x8, x0-x7) at offset 296 from sp (192 + 104) + SAVE_ARGUMENT_REGISTERS sp, 296 + + ; Save floating point argument registers (q0-q7) at sp+64 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 64 + + ; Save FP callee-saved registers (d8-d15) at sp+0 + str d8, [sp, #0] + str d9, [sp, #8] + str d10, [sp, #16] + str d11, [sp, #24] + str d12, [sp, #32] + str d13, [sp, #40] + str d14, [sp, #48] + str d15, [sp, #56] + + ; Set target to TransitionBlock pointer + add $Target, sp, #192 + MEND + #define GC_ALLOC_FINALIZE 1 ;----------------------------------------------------------------------------- diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index bfed3c091052df..79197977474441 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10900,10 +10900,9 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_ContextPointers.R9 = &m_Context.R9; // Copy floating point argument registers (xmm0-xmm7) - // PROLOG_WITH_TRANSITION_BLOCK places floats at offset -136 from TransitionBlock - // (128 bytes for floats + 8 bytes padding for stack alignment) - // Use memcpy to avoid alignment issues - BYTE *pFloatArgs = (BYTE*)pTransitionBlock - 136; + // Use memcpy to avoid alignment issues - the source may not be 16-byte aligned + // depending on stack layout in the assembly helpers + BYTE *pFloatArgs = (BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters(); memcpy(&m_Context.Xmm0, pFloatArgs + 0x00, sizeof(m_Context.Xmm0)); memcpy(&m_Context.Xmm1, pFloatArgs + 0x10, sizeof(m_Context.Xmm1)); memcpy(&m_Context.Xmm2, pFloatArgs + 0x20, 
sizeof(m_Context.Xmm2)); @@ -10912,7 +10911,7 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p memcpy(&m_Context.Xmm5, pFloatArgs + 0x50, sizeof(m_Context.Xmm5)); memcpy(&m_Context.Xmm6, pFloatArgs + 0x60, sizeof(m_Context.Xmm6)); memcpy(&m_Context.Xmm7, pFloatArgs + 0x70, sizeof(m_Context.Xmm7)); - // Initialize remaining XMM registers to zero (caller-saved on Unix) + // Initialize remaining XMM registers to zero memset(&m_Context.Xmm8, 0, sizeof(m_Context.Xmm8)); memset(&m_Context.Xmm9, 0, sizeof(m_Context.Xmm9)); memset(&m_Context.Xmm10, 0, sizeof(m_Context.Xmm10)); @@ -10924,36 +10923,28 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // Initialize FP control/status m_Context.MxCsr = 0x1F80; // Default MXCSR value (all exceptions masked) #else - // On Windows AMD64, PROLOG_WITH_TRANSITION_BLOCK saves xmm0-xmm3 (float argument registers) - // but not xmm6-xmm15 (callee-saved). The callee-saved FP registers are preserved by the - // normal calling convention - since we don't modify them, the unwinder will find them. 
- // - // PROLOG_WITH_TRANSITION_BLOCK layout (with extraLocals=0): - // __PWTB_FloatArgumentRegisters = 32 (SIZEOF_MAX_OUTGOING_ARGUMENT_HOMES) - // __PWTB_StackAlloc = 32 + 64 + 8 = 104 - // __PWTB_TransitionBlock = 104 - // Float offset from TransitionBlock = 32 - 104 = -72 - BYTE *pFloatArgs = (BYTE*)pTransitionBlock - 72; - memcpy(&m_Context.Xmm0, pFloatArgs + 0x00, sizeof(m_Context.Xmm0)); - memcpy(&m_Context.Xmm1, pFloatArgs + 0x10, sizeof(m_Context.Xmm1)); - memcpy(&m_Context.Xmm2, pFloatArgs + 0x20, sizeof(m_Context.Xmm2)); - memcpy(&m_Context.Xmm3, pFloatArgs + 0x30, sizeof(m_Context.Xmm3)); - - // xmm4-xmm5 are volatile but not saved (zero them) - memset(&m_Context.Xmm4, 0, sizeof(m_Context.Xmm4)); - memset(&m_Context.Xmm5, 0, sizeof(m_Context.Xmm5)); - - // xmm6-xmm15 are callee-saved - zero them here; the unwinder will restore actual values - memset(&m_Context.Xmm6, 0, sizeof(m_Context.Xmm6)); - memset(&m_Context.Xmm7, 0, sizeof(m_Context.Xmm7)); - memset(&m_Context.Xmm8, 0, sizeof(m_Context.Xmm8)); - memset(&m_Context.Xmm9, 0, sizeof(m_Context.Xmm9)); - memset(&m_Context.Xmm10, 0, sizeof(m_Context.Xmm10)); - memset(&m_Context.Xmm11, 0, sizeof(m_Context.Xmm11)); - memset(&m_Context.Xmm12, 0, sizeof(m_Context.Xmm12)); - memset(&m_Context.Xmm13, 0, sizeof(m_Context.Xmm13)); - memset(&m_Context.Xmm14, 0, sizeof(m_Context.Xmm14)); - memset(&m_Context.Xmm15, 0, sizeof(m_Context.Xmm15)); + // On Windows AMD64, argument registers are not saved in the transition block + m_Context.Rax = 0; + m_Context.Rcx = 0; + m_Context.Rdx = 0; + m_Context.R8 = 0; + m_Context.R9 = 0; + + // Read FP callee-saved registers (xmm6-xmm15) from the stack + // They are stored at negative offsets from TransitionBlock: + // Layout: [xmm6-xmm15 (160 bytes)] [xmm0-xmm3 (64 bytes)] [shadow (32 bytes)] [padding (8 bytes)] [CalleeSavedRegs] [RetAddr] + // xmm6 is at sp+0, TransitionBlock is at sp+264, so xmm6 is at TransitionBlock - 264 + M128A *pFpCalleeSaved = 
(M128A*)((BYTE*)pTransitionBlock - 264); + m_Context.Xmm6 = pFpCalleeSaved[0]; + m_Context.Xmm7 = pFpCalleeSaved[1]; + m_Context.Xmm8 = pFpCalleeSaved[2]; + m_Context.Xmm9 = pFpCalleeSaved[3]; + m_Context.Xmm10 = pFpCalleeSaved[4]; + m_Context.Xmm11 = pFpCalleeSaved[5]; + m_Context.Xmm12 = pFpCalleeSaved[6]; + m_Context.Xmm13 = pFpCalleeSaved[7]; + m_Context.Xmm14 = pFpCalleeSaved[8]; + m_Context.Xmm15 = pFpCalleeSaved[9]; // Initialize FP control/status m_Context.MxCsr = 0x1F80; // Default MXCSR value (all exceptions masked) @@ -10996,18 +10987,20 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.Lr = pTransitionBlock->m_calleeSavedRegisters.r14; // r14 is link register // Copy floating point argument registers (d0-d7 / s0-s15) - // PROLOG_WITH_TRANSITION_BLOCK saves these at GetOffsetOfFloatArgumentRegisters() FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); for (int i = 0; i < 8; i++) { m_Context.D[i] = pFloatArgs->d[i]; } - // FP callee-saved registers (d8-d15) are not saved by PROLOG_WITH_TRANSITION_BLOCK - // Zero them; the unwinder will restore actual values during stack walk - for (int i = 8; i < 16; i++) + // Read FP callee-saved registers (d8-d15) from the stack + // They are stored at negative offset from TransitionBlock: + // Layout: [d8-d15 (64 bytes)] [padding (4)] [d0-d7 (64 bytes)] [padding (4)] [TransitionBlock] + // FP callee-saved are at TransitionBlock - 136 (64 + 4 + 64 + 4) + UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 136); + for (int i = 0; i < 8; i++) { - m_Context.D[i] = 0; + m_Context.D[8 + i] = pFpCalleeSaved[i]; } // Initialize remaining D registers (D16-D31) to zero - these are caller-saved @@ -11068,20 +11061,33 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.Lr = pTransitionBlock->m_calleeSavedRegisters.x30; // Copy floating point 
argument registers (V0-V7) - // PROLOG_WITH_TRANSITION_BLOCK saves these at GetOffsetOfFloatArgumentRegisters() FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); for (int i = 0; i < 8; i++) { m_Context.V[i] = pFloatArgs->q[i]; } - // FP callee-saved registers (d8-d15 / V8-V15) are not saved by PROLOG_WITH_TRANSITION_BLOCK - // Zero them; the unwinder will restore actual values during stack walk - for (int i = 8; i < 16; i++) - { - m_Context.V[i].Low = 0; - m_Context.V[i].High = 0; - } + // Read FP callee-saved registers (d8-d15) from the stack + // They are stored at negative offset from TransitionBlock: + // Layout: [d8-d15 (64 bytes)] [q0-q7 (128 bytes)] [TransitionBlock] + // FP callee-saved are at TransitionBlock - 192 (64 + 128) + UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 192); + m_Context.V[8].Low = pFpCalleeSaved[0]; + m_Context.V[8].High = 0; + m_Context.V[9].Low = pFpCalleeSaved[1]; + m_Context.V[9].High = 0; + m_Context.V[10].Low = pFpCalleeSaved[2]; + m_Context.V[10].High = 0; + m_Context.V[11].Low = pFpCalleeSaved[3]; + m_Context.V[11].High = 0; + m_Context.V[12].Low = pFpCalleeSaved[4]; + m_Context.V[12].High = 0; + m_Context.V[13].Low = pFpCalleeSaved[5]; + m_Context.V[13].High = 0; + m_Context.V[14].Low = pFpCalleeSaved[6]; + m_Context.V[14].High = 0; + m_Context.V[15].Low = pFpCalleeSaved[7]; + m_Context.V[15].High = 0; // Initialize remaining V registers (V16-V31) to zero - these are caller-saved for (int i = 16; i < 32; i++) @@ -11147,18 +11153,21 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // F[] array in CONTEXT is 4*32 elements for LSX/LASX support. // Each FP register takes 4 slots (for 256-bit LASX vectors). // For 64-bit doubles, we only use the first slot of each register. 
- // PROLOG_WITH_TRANSITION_BLOCK saves these at GetOffsetOfFloatArgumentRegisters() FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); for (int i = 0; i < 8; i++) { memcpy(&m_Context.F[i * 4], &pFloatArgs->f[i], sizeof(double)); } - // FP callee-saved registers (f24-f31) are not saved by PROLOG_WITH_TRANSITION_BLOCK - // Zero them; the unwinder will restore actual values during stack walk - for (int i = 24; i < 32; i++) + // Read FP callee-saved registers (f24-f31) from the stack + // They are stored at negative offset from TransitionBlock: + // Layout: [f24-f31 (64 bytes)] [fa0-fa7 (64 bytes)] [TransitionBlock] + // FP callee-saved are at TransitionBlock - 128 (64 + 64) + UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 128); + for (int i = 0; i < 8; i++) { - memset(&m_Context.F[i * 4], 0, sizeof(double) * 4); + // f24-f31 map to indices 24-31 in the F array, each taking 4 slots + memcpy(&m_Context.F[(24 + i) * 4], &pFpCalleeSaved[i], sizeof(double)); } // Initialize remaining F registers (f8-f23) to zero @@ -11224,9 +11233,7 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // Initialize all F registers to zero first memset(m_Context.F, 0, sizeof(m_Context.F)); - // Copy floating point argument registers (fa0-fa7) - // PROLOG_WITH_TRANSITION_BLOCK saves these at GetOffsetOfFloatArgumentRegisters() FloatArgumentRegisters *pFloatArgs = (FloatArgumentRegisters*)((BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); for (int i = 0; i < 8; i++) { @@ -11234,8 +11241,18 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p memcpy(&m_Context.F[10 + i], &pFloatArgs->f[i], sizeof(double)); } - // FP callee-saved registers (fs0-fs11) are not saved by PROLOG_WITH_TRANSITION_BLOCK - // They remain zeroed; the unwinder will restore actual values during stack walk + // Read FP 
callee-saved registers (fs0-fs11) from the stack + // They are stored at negative offset from TransitionBlock: + // Layout: [fs0-fs11 (96 bytes)] [fa0-fa7 (64 bytes)] [TransitionBlock] + // FP callee-saved are at TransitionBlock - 160 (96 + 64) + // RISC-V FP callee-saved: fs0=f8, fs1=f9, fs2-fs11=f18-f27 + UINT64 *pFpCalleeSaved = (UINT64*)((BYTE*)pTransitionBlock - 160); + memcpy(&m_Context.F[8], &pFpCalleeSaved[0], sizeof(double)); // fs0 = f8 + memcpy(&m_Context.F[9], &pFpCalleeSaved[1], sizeof(double)); // fs1 = f9 + for (int i = 0; i < 10; i++) + { + memcpy(&m_Context.F[18 + i], &pFpCalleeSaved[2 + i], sizeof(double)); // fs2-fs11 = f18-f27 + } // Initialize FP control/status register m_Context.Fcsr = 0; diff --git a/src/coreclr/vm/loongarch64/asmhelpers.S b/src/coreclr/vm/loongarch64/asmhelpers.S index 701594918f40f3..9f424c39dd30f5 100644 --- a/src/coreclr/vm/loongarch64/asmhelpers.S +++ b/src/coreclr/vm/loongarch64/asmhelpers.S @@ -1029,9 +1029,9 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT // $a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $a1 // $a0 already contains exception object - addi.d $a1, $sp, __PWTB_TransitionBlock + // $a1 contains pointer to TransitionBlock bl C_FUNC(IL_Throw_Impl) // Should never return break 0 @@ -1045,9 +1045,9 @@ NESTED_END IL_Throw, _TEXT // $a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $a1 // $a0 already contains exception object - addi.d $a1, $sp, __PWTB_TransitionBlock + // $a1 contains pointer to TransitionBlock bl C_FUNC(IL_ThrowExact_Impl) // Should never return break 0 @@ -1058,8 +1058,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. 
// ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK - addi.d $a0, $sp, __PWTB_TransitionBlock + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS $a0 + // $a0 contains pointer to TransitionBlock bl C_FUNC(IL_Rethrow_Impl) // Should never return break 0 diff --git a/src/coreclr/vm/riscv64/asmhelpers.S b/src/coreclr/vm/riscv64/asmhelpers.S index e34a96e141924d..0d26ef514d4f58 100644 --- a/src/coreclr/vm/riscv64/asmhelpers.S +++ b/src/coreclr/vm/riscv64/asmhelpers.S @@ -886,9 +886,9 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT // a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS a1 // a0 already contains exception object - addi a1, sp, __PWTB_TransitionBlock + // a1 contains pointer to TransitionBlock call C_FUNC(IL_Throw_Impl) // Should never return ebreak @@ -902,9 +902,9 @@ NESTED_END IL_Throw, _TEXT // a0 = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS a1 // a0 already contains exception object - addi a1, sp, __PWTB_TransitionBlock + // a1 contains pointer to TransitionBlock call C_FUNC(IL_ThrowExact_Impl) // Should never return ebreak @@ -915,8 +915,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. 
// ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK - addi a0, sp, __PWTB_TransitionBlock + PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS a0 + // a0 contains pointer to TransitionBlock call C_FUNC(IL_Rethrow_Impl) // Should never return ebreak From 0147c5c8e7d60dd44396e8a3047064d025579da6 Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Mon, 12 Jan 2026 22:32:44 +0200 Subject: [PATCH 17/30] Address RBP feedback --- src/coreclr/pal/inc/unixasmmacrosamd64.inc | 5 +++++ src/coreclr/vm/amd64/AsmMacros.inc | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index 90e53a00d87932..8f6ec93097a7a0 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -495,6 +495,11 @@ C_FUNC(\Name\()_End): set_cfa_register rsp, 8 PUSH_CALLEE_SAVED_REGISTERS + + // Set RBP as frame pointer to facilitate stack walking for 3rd party tools. + // After PUSH_CALLEE_SAVED_REGISTERS, saved RBP is at rsp+40 (5 regs * 8 bytes above current rsp) + lea rbp, [rsp + 40] + PUSH_ARGUMENT_REGISTERS // Allocate 128 bytes for floats + 8 bytes padding = 136 bytes diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index f8b2bd11fb5078..65d0f3a359392f 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -517,6 +517,11 @@ PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target PUSH_CALLEE_SAVED_REGISTERS + ; Set RBP as frame pointer to facilitate stack walking for 3rd party tools. 
+ ; After PUSH_CALLEE_SAVED_REGISTERS, saved RBP is at rsp+24 (3 regs * 8 bytes above current rsp) + ; Push order: r15, r14, r13, r12, rbp, rbx, rsi, rdi - so rbp is at offset 24 from rsp + lea rbp, [rsp + 24] + ; Allocate space for: FP callee-saved (160) + float args (64) + shadow (32) + padding (8) = 264 bytes ; This makes RSP 16-byte aligned (8 + 64 + 264 = 336, and original RSP - 336 is 16-byte aligned) alloc_stack 264 From 22571b2bf116a8dcee1703d3aaa6331601e6ba88 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Tue, 13 Jan 2026 01:02:06 +0200 Subject: [PATCH 18/30] Prevent FP exceptions during JIT on AMD64 Windows Added MXCSR reset to prevent floating point exceptions during JIT compilation on AMD64 Windows. --- src/coreclr/vm/prestub.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/coreclr/vm/prestub.cpp b/src/coreclr/vm/prestub.cpp index db92be31a188f3..3804dc264cdb17 100644 --- a/src/coreclr/vm/prestub.cpp +++ b/src/coreclr/vm/prestub.cpp @@ -43,6 +43,10 @@ #include "gdbjit.h" #endif // FEATURE_GDBJIT +#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS) +#include <xmmintrin.h> +#endif + #ifndef DACCESS_COMPILE EXTERN_C void STDCALL ThePreStubPatch(); @@ -1805,6 +1809,15 @@ extern "C" PCODE STDCALL PreStubWorker(TransitionBlock* pTransitionBlock, Method { PCODE pbRetVal = (PCODE)NULL; +#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS) + // Reset MXCSR to default value to prevent FP exceptions during JIT compilation. + // During exception handling, MXCSR can become corrupted (e.g., when RtlRestoreContext + // is called with a context that doesn't include CONTEXT_FLOATING_POINT). If the JIT + // is invoked during exception handling (e.g., to compile a catch handler), it may + // crash with an FP exception due to unmasked floating point exceptions.
+ _mm_setcsr(0x1F80); +#endif + PreserveLastErrorHolder preserveLastError; STATIC_CONTRACT_THROWS; From b109060eccc8942b26a421eafdf49a1cce97fd34 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Tue, 13 Jan 2026 01:17:27 +0200 Subject: [PATCH 19/30] Remove obsolete comment (now that Init() is gone) --- src/coreclr/vm/excep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index 79197977474441..2ad497051f0ad7 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -11288,7 +11288,7 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // WASM cannot capture execution context, so just zero everything memset(&m_Context, 0, sizeof(m_Context)); memset(&m_ContextPointers, 0, sizeof(m_ContextPointers)); - m_ReturnAddress = 1; // Non-zero to skip VirtualUnwind in Init() + m_ReturnAddress = 0; } #endif // TARGET_X86 From dd15723961b237bd5ee760ab0d61cd8522851722 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Tue, 13 Jan 2026 13:46:18 +0200 Subject: [PATCH 20/30] Enhance Context.asm with MXCSR reset explanation --- src/coreclr/vm/amd64/Context.asm | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/coreclr/vm/amd64/Context.asm b/src/coreclr/vm/amd64/Context.asm index 8edafc38f63d06..d7c2d9832570b1 100644 --- a/src/coreclr/vm/amd64/Context.asm +++ b/src/coreclr/vm/amd64/Context.asm @@ -27,6 +27,14 @@ NESTED_ENTRY ClrRestoreNonvolatileContextWorker, _TEXT test byte ptr [r10 + OFFSETOF__CONTEXT__ContextFlags], CONTEXT_FLOATING_POINT je Done_Restore_CONTEXT_FLOATING_POINT fxrstor [r10 + OFFSETOF__CONTEXT__FltSave] + ; Reset MXCSR to the default value after restoring the floating point context. + ; This is necessary because the CONTEXT may have been captured during Windows SEH dispatch + ; which can corrupt MXCSR (setting it to 0x20 with all FP exception masks cleared). 
+ ; Without this reset, FP operations after exception handling would trigger exceptions. + ; 0x1F80 = default MXCSR: all exception masks set, round to nearest, no exceptions pending + push 1F80h + ldmxcsr [rsp] + add rsp, 8 Done_Restore_CONTEXT_FLOATING_POINT: test byte ptr [r10 + OFFSETOF__CONTEXT__ContextFlags], CONTEXT_INTEGER From d11f9fc6fc0beafbc4985789c57dfb5c2f4b1132 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Tue, 13 Jan 2026 23:08:17 +0200 Subject: [PATCH 21/30] Address fb --- src/coreclr/vm/amd64/AsmMacros.inc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index 65d0f3a359392f..f8b2bd11fb5078 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -517,11 +517,6 @@ PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target PUSH_CALLEE_SAVED_REGISTERS - ; Set RBP as frame pointer to facilitate stack walking for 3rd party tools. 
- ; After PUSH_CALLEE_SAVED_REGISTERS, saved RBP is at rsp+24 (3 regs * 8 bytes above current rsp) - ; Push order: r15, r14, r13, r12, rbp, rbx, rsi, rdi - so rbp is at offset 24 from rsp - lea rbp, [rsp + 24] - ; Allocate space for: FP callee-saved (160) + float args (64) + shadow (32) + padding (8) = 264 bytes ; This makes RSP 16-byte aligned (8 + 64 + 264 = 336, and original RSP - 336 is 16-byte aligned) alloc_stack 264 From fbf03ab433783ab021a257e56263dce1fe5a9879 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Tue, 13 Jan 2026 23:43:51 +0000 Subject: [PATCH 22/30] Address fb --- src/coreclr/pal/inc/unixasmmacrosamd64.inc | 37 --------------- src/coreclr/vm/amd64/asmhelpers.S | 12 ++--- src/coreclr/vm/excep.cpp | 54 +++++----------------- 3 files changed, 18 insertions(+), 85 deletions(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index 8f6ec93097a7a0..606f993dbc1ec6 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -477,43 +477,6 @@ C_FUNC(\Name\()_End): // After alloc_stack 136: rsp % 16 = (8 - 136 % 16) = (8 - 8) = 0 <- aligned! // // Stack layout (low to high addresses): -// rsp+0: 8 bytes padding (for alignment) -// rsp+8: FloatArgumentRegisters (xmm0-xmm7, 128 bytes) -// rsp+136: TransitionBlock start -// - ArgumentRegisters (rdi, rsi, rdx, rcx, r8, r9: 48 bytes) -// - CalleeSavedRegisters (r12, r13, r14, r15, rbx, rbp: 48 bytes) -// - Return address (8 bytes) -// -// TransitionBlock at rsp+136, floats at rsp+8 = TransitionBlock - 128 -// (matches GetOffsetOfFloatArgumentRegisters which returns -128) -// -// NOTE: We use SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED because rsp+8 is not -// 16-byte aligned (rsp is aligned, rsp+8 is not). -// -// On exit, \target contains the TransitionBlock pointer. 
-.macro PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS target - set_cfa_register rsp, 8 - - PUSH_CALLEE_SAVED_REGISTERS - - // Set RBP as frame pointer to facilitate stack walking for 3rd party tools. - // After PUSH_CALLEE_SAVED_REGISTERS, saved RBP is at rsp+40 (5 regs * 8 bytes above current rsp) - lea rbp, [rsp + 40] - - PUSH_ARGUMENT_REGISTERS - - // Allocate 128 bytes for floats + 8 bytes padding = 136 bytes - alloc_stack 136 - // Save float argument registers at offset 8 (TransitionBlock - 128) - // Using unaligned stores because rsp+8 is not 16-byte aligned - SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED 8 - - END_PROLOGUE - - // TransitionBlock starts at rsp+136 (where ArgumentRegisters are) - lea \target, [rsp + 136] -.endm - .macro INLINE_GETTHREAD // Inlined version of call C_FUNC(RhpGetThread) INLINE_GET_TLS_VAR t_CurrentThreadInfo diff --git a/src/coreclr/vm/amd64/asmhelpers.S b/src/coreclr/vm/amd64/asmhelpers.S index 38b3c87166d0be..3141735397593b 100644 --- a/src/coreclr/vm/amd64/asmhelpers.S +++ b/src/coreclr/vm/amd64/asmhelpers.S @@ -1922,9 +1922,9 @@ NESTED_END CallJittedMethodRetDoubleDouble, _TEXT // rdi = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_Throw, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rsi + PROLOG_WITH_TRANSITION_BLOCK // rdi already contains exception object - // rsi contains pointer to TransitionBlock + lea rsi, [rsp + __PWTB_TransitionBlock] call C_FUNC(IL_Throw_Impl) // Should never return int3 @@ -1938,9 +1938,9 @@ NESTED_END IL_Throw, _TEXT // rdi = Pointer to exception object // ------------------------------------------------------------------ NESTED_ENTRY IL_ThrowExact, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rsi + PROLOG_WITH_TRANSITION_BLOCK // rdi already contains exception object - // rsi contains pointer to TransitionBlock + lea rsi, [rsp + __PWTB_TransitionBlock] call C_FUNC(IL_ThrowExact_Impl) // Should never return int3 @@ -1951,8 
+1951,8 @@ NESTED_END IL_ThrowExact, _TEXT // implementation written in C. // ------------------------------------------------------------------ NESTED_ENTRY IL_Rethrow, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdi - // rdi contains pointer to TransitionBlock + PROLOG_WITH_TRANSITION_BLOCK + lea rdi, [rsp + __PWTB_TransitionBlock] call C_FUNC(IL_Rethrow_Impl) // Should never return int3 diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index 2ad497051f0ad7..95163e5aaaf5ab 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10877,53 +10877,23 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p { LIMITED_METHOD_CONTRACT; - m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_FLOATING_POINT; +#ifdef UNIX_AMD64_ABI + // On Unix AMD64, there are no non-volatile FP registers, so we only need + // control registers and integer callee-saved registers. We don't need to + // capture argument registers or FP state for exception handling. 
+ m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; m_Context.SegCs = 0; m_Context.SegSs = 0; m_Context.EFlags = 0; - -#ifdef UNIX_AMD64_ABI - // On Unix AMD64, argument registers are saved in the transition block m_Context.Rax = 0; - m_Context.Rdi = pTransitionBlock->m_argumentRegisters.RDI; - m_Context.Rsi = pTransitionBlock->m_argumentRegisters.RSI; - m_Context.Rdx = pTransitionBlock->m_argumentRegisters.RDX; - m_Context.Rcx = pTransitionBlock->m_argumentRegisters.RCX; - m_Context.R8 = pTransitionBlock->m_argumentRegisters.R8; - m_Context.R9 = pTransitionBlock->m_argumentRegisters.R9; - - m_ContextPointers.Rdi = &m_Context.Rdi; - m_ContextPointers.Rsi = &m_Context.Rsi; - m_ContextPointers.Rdx = &m_Context.Rdx; - m_ContextPointers.Rcx = &m_Context.Rcx; - m_ContextPointers.R8 = &m_Context.R8; - m_ContextPointers.R9 = &m_Context.R9; - - // Copy floating point argument registers (xmm0-xmm7) - // Use memcpy to avoid alignment issues - the source may not be 16-byte aligned - // depending on stack layout in the assembly helpers - BYTE *pFloatArgs = (BYTE*)pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters(); - memcpy(&m_Context.Xmm0, pFloatArgs + 0x00, sizeof(m_Context.Xmm0)); - memcpy(&m_Context.Xmm1, pFloatArgs + 0x10, sizeof(m_Context.Xmm1)); - memcpy(&m_Context.Xmm2, pFloatArgs + 0x20, sizeof(m_Context.Xmm2)); - memcpy(&m_Context.Xmm3, pFloatArgs + 0x30, sizeof(m_Context.Xmm3)); - memcpy(&m_Context.Xmm4, pFloatArgs + 0x40, sizeof(m_Context.Xmm4)); - memcpy(&m_Context.Xmm5, pFloatArgs + 0x50, sizeof(m_Context.Xmm5)); - memcpy(&m_Context.Xmm6, pFloatArgs + 0x60, sizeof(m_Context.Xmm6)); - memcpy(&m_Context.Xmm7, pFloatArgs + 0x70, sizeof(m_Context.Xmm7)); - // Initialize remaining XMM registers to zero - memset(&m_Context.Xmm8, 0, sizeof(m_Context.Xmm8)); - memset(&m_Context.Xmm9, 0, sizeof(m_Context.Xmm9)); - memset(&m_Context.Xmm10, 0, sizeof(m_Context.Xmm10)); - memset(&m_Context.Xmm11, 0, sizeof(m_Context.Xmm11)); - 
memset(&m_Context.Xmm12, 0, sizeof(m_Context.Xmm12)); - memset(&m_Context.Xmm13, 0, sizeof(m_Context.Xmm13)); - memset(&m_Context.Xmm14, 0, sizeof(m_Context.Xmm14)); - memset(&m_Context.Xmm15, 0, sizeof(m_Context.Xmm15)); - // Initialize FP control/status - m_Context.MxCsr = 0x1F80; // Default MXCSR value (all exceptions masked) #else - // On Windows AMD64, argument registers are not saved in the transition block + // On Windows AMD64, we need FP state because xmm6-xmm15 are non-volatile + m_Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_FLOATING_POINT; + m_Context.SegCs = 0; + m_Context.SegSs = 0; + m_Context.EFlags = 0; + + // Argument registers are not saved in the transition block m_Context.Rax = 0; m_Context.Rcx = 0; m_Context.Rdx = 0; From c1146d40ed75a2b494798ee45f212e0bc2851f5a Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Tue, 13 Jan 2026 23:45:56 +0000 Subject: [PATCH 23/30] . --- src/coreclr/pal/inc/unixasmmacrosamd64.inc | 49 ---------------------- 1 file changed, 49 deletions(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index 606f993dbc1ec6..90c8947e754297 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -147,12 +147,6 @@ C_FUNC(\Name\()_End): // the xmm registers are not supported by the libunwind .endm -// Unaligned version for use when stack alignment cannot be guaranteed -.macro save_xmm128_postrsp_unaligned Reg, Offset - __Offset = \Offset - movdqu xmmword ptr [rsp + __Offset], \Reg -.endm - .macro restore_xmm128 Reg, ofs __Offset = \ofs movdqa \Reg, xmmword ptr [rsp + __Offset] @@ -252,20 +246,6 @@ C_FUNC(\Name\()_End): .endm -// Unaligned version for cases where 16-byte stack alignment cannot be guaranteed -.macro SAVE_FLOAT_ARGUMENT_REGISTERS_UNALIGNED ofs - - save_xmm128_postrsp_unaligned xmm0, \ofs - save_xmm128_postrsp_unaligned xmm1, \ofs + 0x10 - 
save_xmm128_postrsp_unaligned xmm2, \ofs + 0x20 - save_xmm128_postrsp_unaligned xmm3, \ofs + 0x30 - save_xmm128_postrsp_unaligned xmm4, \ofs + 0x40 - save_xmm128_postrsp_unaligned xmm5, \ofs + 0x50 - save_xmm128_postrsp_unaligned xmm6, \ofs + 0x60 - save_xmm128_postrsp_unaligned xmm7, \ofs + 0x70 - -.endm - .macro RESTORE_FLOAT_ARGUMENT_REGISTERS ofs restore_xmm128 xmm0, \ofs @@ -448,35 +428,6 @@ C_FUNC(\Name\()_End): POP_CALLEE_SAVED_REGISTERS .endm -// Pushes a full TransitionBlock on the stack including argument registers and -// floating point argument registers. Used for exception throw helpers where we -// need to capture the complete register state. -// -// Stack layout (from high to low address after prologue): -// Return address (8 bytes) -// CalleeSavedRegisters (rbp, rbx, r15, r14, r13, r12 - 48 bytes) -// ArgumentRegisters (r9, r8, rcx, rdx, rsi, rdi - 48 bytes) <- TransitionBlock pointer -// FloatArgumentRegisters (xmm0-xmm7, 128 bytes) at rsp+8 -// 8-byte alignment padding at rsp+0 -// sp points here -// -// Stack alignment calculation: -// Before call to IL_Throw: rsp is 16-byte aligned (ABI requirement before call) -// After call (return addr pushed): rsp % 16 = 8 -// After PUSH_CALLEE_SAVED_REGISTERS (48 bytes): rsp % 16 = 8 -// After PUSH_ARGUMENT_REGISTERS (48 bytes): rsp % 16 = 8 -// After alloc_stack 136: rsp % 16 = (8 - 136 % 16) = (8 - 8) = 0 <- aligned! -// -// Stack layout for IL_Throw helpers using TransitionBlock with float registers. -// -// Stack alignment calculation: -// Before call to IL_Throw: rsp is 16-byte aligned (ABI requirement before call) -// After call (return addr pushed): rsp % 16 = 8 -// After PUSH_CALLEE_SAVED_REGISTERS (48 bytes): rsp % 16 = 8 -// After PUSH_ARGUMENT_REGISTERS (48 bytes): rsp % 16 = 8 -// After alloc_stack 136: rsp % 16 = (8 - 136 % 16) = (8 - 8) = 0 <- aligned! 
-// -// Stack layout (low to high addresses): .macro INLINE_GETTHREAD // Inlined version of call C_FUNC(RhpGetThread) INLINE_GET_TLS_VAR t_CurrentThreadInfo From 3cac3e04c89701f663ae4cbfd35f89278f6ee85e Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Wed, 14 Jan 2026 01:37:13 +0000 Subject: [PATCH 24/30] Move MXCSR reset from PreStubWorker to CallEHFilterFunclet --- src/coreclr/vm/amd64/AsmHelpers.asm | 18 +++++++++++++++++- src/coreclr/vm/prestub.cpp | 9 --------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index 4725a0cf8eefbb..c64068ae2f3f9d 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -521,8 +521,16 @@ NESTED_ENTRY CallEHFunclet, _TEXT movdqa xmm14, [r8 + OFFSETOF__CONTEXT__Xmm14] movdqa xmm15, [r8 + OFFSETOF__CONTEXT__Xmm15] - ; Save the SP of this function. + ; Save the SP of this function. mov [r9], rsp + + ; Reset MXCSR to default value before invoking managed code. + ; During exception handling, MXCSR can become corrupted. + ; 0x1F80 = default MXCSR: all exception masks set, round to nearest + push 1F80h + ldmxcsr [rsp] + add rsp, 8 + ; Invoke the funclet call rdx @@ -547,6 +555,14 @@ NESTED_ENTRY CallEHFilterFunclet, _TEXT mov [r9], rsp ; Restore RBP to match main function RBP mov rbp, rdx + + ; Reset MXCSR to default value before invoking managed code. + ; During exception handling, MXCSR can become corrupted. 
+ ; 0x1F80 = default MXCSR: all exception masks set, round to nearest + push 1F80h + ldmxcsr [rsp] + add rsp, 8 + ; Invoke the filter funclet call r8 diff --git a/src/coreclr/vm/prestub.cpp b/src/coreclr/vm/prestub.cpp index 3804dc264cdb17..e6fd11f68c07e6 100644 --- a/src/coreclr/vm/prestub.cpp +++ b/src/coreclr/vm/prestub.cpp @@ -1809,15 +1809,6 @@ extern "C" PCODE STDCALL PreStubWorker(TransitionBlock* pTransitionBlock, Method { PCODE pbRetVal = (PCODE)NULL; -#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS) - // Reset MXCSR to default value to prevent FP exceptions during JIT compilation. - // During exception handling, MXCSR can become corrupted (e.g., when RtlRestoreContext - // is called with a context that doesn't include CONTEXT_FLOATING_POINT). If the JIT - // is invoked during exception handling (e.g., to compile a catch handler), it may - // crash with an FP exception due to unmasked floating point exceptions. - _mm_setcsr(0x1F80); -#endif - PreserveLastErrorHolder preserveLastError; STATIC_CONTRACT_THROWS; From b4d2eab8603e00499d5afbc49b7bb24e771cf1f4 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Thu, 15 Jan 2026 15:23:56 +0000 Subject: [PATCH 25/30] Fix FltSave initialization in UpdateContextFromTransitionBlock Co-authored-by: Jan Vorlicek --- src/coreclr/vm/amd64/AsmHelpers.asm | 14 -------------- src/coreclr/vm/amd64/Context.asm | 8 -------- src/coreclr/vm/excep.cpp | 8 ++++++-- 3 files changed, 6 insertions(+), 24 deletions(-) diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index 3538e8fc5b312e..52b60f31e45cef 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -523,13 +523,6 @@ NESTED_ENTRY CallEHFunclet, _TEXT ; Save the SP of this function. mov [r9], rsp - ; Reset MXCSR to default value before invoking managed code. - ; During exception handling, MXCSR can become corrupted. 
- ; 0x1F80 = default MXCSR: all exception masks set, round to nearest - push 1F80h - ldmxcsr [rsp] - add rsp, 8 - ; Invoke the funclet call rdx @@ -555,13 +548,6 @@ NESTED_ENTRY CallEHFilterFunclet, _TEXT ; Restore RBP to match main function RBP mov rbp, rdx - ; Reset MXCSR to default value before invoking managed code. - ; During exception handling, MXCSR can become corrupted. - ; 0x1F80 = default MXCSR: all exception masks set, round to nearest - push 1F80h - ldmxcsr [rsp] - add rsp, 8 - ; Invoke the filter funclet call r8 diff --git a/src/coreclr/vm/amd64/Context.asm b/src/coreclr/vm/amd64/Context.asm index d7c2d9832570b1..8edafc38f63d06 100644 --- a/src/coreclr/vm/amd64/Context.asm +++ b/src/coreclr/vm/amd64/Context.asm @@ -27,14 +27,6 @@ NESTED_ENTRY ClrRestoreNonvolatileContextWorker, _TEXT test byte ptr [r10 + OFFSETOF__CONTEXT__ContextFlags], CONTEXT_FLOATING_POINT je Done_Restore_CONTEXT_FLOATING_POINT fxrstor [r10 + OFFSETOF__CONTEXT__FltSave] - ; Reset MXCSR to the default value after restoring the floating point context. - ; This is necessary because the CONTEXT may have been captured during Windows SEH dispatch - ; which can corrupt MXCSR (setting it to 0x20 with all FP exception masks cleared). - ; Without this reset, FP operations after exception handling would trigger exceptions. 
- ; 0x1F80 = default MXCSR: all exception masks set, round to nearest, no exceptions pending - push 1F80h - ldmxcsr [rsp] - add rsp, 8 Done_Restore_CONTEXT_FLOATING_POINT: test byte ptr [r10 + OFFSETOF__CONTEXT__ContextFlags], CONTEXT_INTEGER diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index 95163e5aaaf5ab..b1dce1e756073e 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10905,6 +10905,7 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // Layout: [xmm6-xmm15 (160 bytes)] [xmm0-xmm3 (64 bytes)] [shadow (32 bytes)] [padding (8 bytes)] [CalleeSavedRegs] [RetAddr] // xmm6 is at sp+0, TransitionBlock is at sp+264, so xmm6 is at TransitionBlock - 264 M128A *pFpCalleeSaved = (M128A*)((BYTE*)pTransitionBlock - 264); + m_Context.Xmm6 = pFpCalleeSaved[0]; m_Context.Xmm7 = pFpCalleeSaved[1]; m_Context.Xmm8 = pFpCalleeSaved[2]; @@ -10916,8 +10917,11 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.Xmm14 = pFpCalleeSaved[8]; m_Context.Xmm15 = pFpCalleeSaved[9]; - // Initialize FP control/status - m_Context.MxCsr = 0x1F80; // Default MXCSR value (all exceptions masked) + // Initialize FP control/status in FltSave - this is what fxrstor restores from + m_Context.FltSave.ControlWord = 0x27F; // Default x87 control word + m_Context.FltSave.MxCsr = 0x1F80; // Default MXCSR value (all exceptions masked) + m_Context.FltSave.MxCsr_Mask = 0x1FFF; // MXCSR mask + m_Context.MxCsr = 0x1F80; // Default MXCSR value (all exceptions masked) #endif #define CALLEE_SAVED_REGISTER(reg) \ From 77c9ebb2e6fb7b9615a2b42ac90ad13800dc2420 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Thu, 15 Jan 2026 18:09:51 +0200 Subject: [PATCH 26/30] Allocate shadow space for Windows x64 ABI calls Co-authored-by: Jan Vorlicek --- src/coreclr/vm/amd64/AsmHelpers.asm | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git 
a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index 52b60f31e45cef..4ba300297a9d1b 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -1214,6 +1214,10 @@ endif ; FEATURE_INTERPRETER NESTED_ENTRY IL_Throw, _TEXT PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdx + ; Allocate shadow space for the call (required by Windows x64 ABI) + ; Without this, the callee's shadow space writes would overwrite our saved xmm6 + sub rsp, 20h + ; RCX already contains exception object ; RDX contains pointer to TransitionBlock call IL_Throw_Impl @@ -1231,6 +1235,9 @@ NESTED_END IL_Throw, _TEXT NESTED_ENTRY IL_ThrowExact, _TEXT PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdx + ; Allocate shadow space for the call (required by Windows x64 ABI) + sub rsp, 20h + ; RCX already contains exception object ; RDX contains pointer to TransitionBlock call IL_ThrowExact_Impl @@ -1245,6 +1252,9 @@ NESTED_END IL_ThrowExact, _TEXT NESTED_ENTRY IL_Rethrow, _TEXT PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rcx + ; Allocate shadow space for the call (required by Windows x64 ABI) + sub rsp, 20h + ; RCX contains pointer to TransitionBlock call IL_Rethrow_Impl ; Should never return From 7cb82b1c411c32586b1edb06b1a129d4fba2a323 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Thu, 15 Jan 2026 18:07:31 +0000 Subject: [PATCH 27/30] Address jkotas feedback and bring back ldmxcsr --- src/coreclr/vm/amd64/AsmHelpers.asm | 29 ++++++++++++------- src/coreclr/vm/amd64/AsmMacros.inc | 45 +++++++++++++++-------------- src/coreclr/vm/amd64/Context.asm | 7 +++++ src/coreclr/vm/excep.cpp | 4 +-- 4 files changed, 51 insertions(+), 34 deletions(-) diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index 4ba300297a9d1b..6996f11643e15a 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -523,6 +523,14 @@ NESTED_ENTRY CallEHFunclet, _TEXT ; Save the SP of this 
function. mov [r9], rsp + ; Reset MXCSR to default value before invoking managed code. + ; This is needed for foreign thread exceptions where CONTEXT is captured by Windows SEH, + ; which can leave MXCSR in a bad state (e.g., 0x20 with FP exception masks cleared). + ; 0x1F80 = default MXCSR: all exception masks set, round to nearest + push 1F80h + ldmxcsr [rsp] + add rsp, 8 + ; Invoke the funclet call rdx @@ -548,6 +556,14 @@ NESTED_ENTRY CallEHFilterFunclet, _TEXT ; Restore RBP to match main function RBP mov rbp, rdx + ; Reset MXCSR to default value before invoking managed code. + ; This is needed for foreign thread exceptions where CONTEXT is captured by Windows SEH, + ; which can leave MXCSR in a bad state (e.g., 0x20 with FP exception masks cleared). + ; 0x1F80 = default MXCSR: all exception masks set, round to nearest + push 1F80h + ldmxcsr [rsp] + add rsp, 8 + ; Invoke the filter funclet call r8 @@ -1212,12 +1228,9 @@ endif ; FEATURE_INTERPRETER ; RCX = Pointer to exception object ;========================================================================== NESTED_ENTRY IL_Throw, _TEXT + ; Shadow space for the call is included in PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdx - ; Allocate shadow space for the call (required by Windows x64 ABI) - ; Without this, the callee's shadow space writes would overwrite our saved xmm6 - sub rsp, 20h - ; RCX already contains exception object ; RDX contains pointer to TransitionBlock call IL_Throw_Impl @@ -1233,11 +1246,9 @@ NESTED_END IL_Throw, _TEXT ; RCX = Pointer to exception object ;========================================================================== NESTED_ENTRY IL_ThrowExact, _TEXT + ; Shadow space for the call is included in PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rdx - ; Allocate shadow space for the call (required by Windows x64 ABI) - sub rsp, 20h - ; RCX already contains exception object ; RDX contains pointer to TransitionBlock call 
IL_ThrowExact_Impl @@ -1250,11 +1261,9 @@ NESTED_END IL_ThrowExact, _TEXT ; implementation written in C. ;========================================================================== NESTED_ENTRY IL_Rethrow, _TEXT + ; Shadow space for the call is included in PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS rcx - ; Allocate shadow space for the call (required by Windows x64 ABI) - sub rsp, 20h - ; RCX contains pointer to TransitionBlock call IL_Rethrow_Impl ; Should never return diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index f8b2bd11fb5078..470a346f20d6ab 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -510,41 +510,42 @@ POP_COOP_PINVOKE_FRAME macro ; 256 mod 16 = 0, so RSP mod 16 = 8 (NOT aligned) ; 264 mod 16 = 8, so RSP mod 16 = 0 (aligned!) but offsets 0,16,32... work ; -; With alloc_stack 264: RSP is 16-byte aligned, XMM saves at 0, 16, 32, ... (multiples of 16) +; With alloc_stack 296: RSP is 16-byte aligned, XMM saves at 32, 48, 64, ... (multiples of 16) ; ; On exit, target contains the TransitionBlock pointer (CalleeSavedRegisters). PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target PUSH_CALLEE_SAVED_REGISTERS - ; Allocate space for: FP callee-saved (160) + float args (64) + shadow (32) + padding (8) = 264 bytes - ; This makes RSP 16-byte aligned (8 + 64 + 264 = 336, and original RSP - 336 is 16-byte aligned) - alloc_stack 264 + ; Allocate space for: shadow for call (32) + FP callee-saved (160) + float args (64) + arg regs (32) + padding (8) = 296 bytes + ; Shadow space at offset 0 is reserved for the call to IL_Throw_Impl etc. 
+ ; This makes RSP 16-byte aligned (8 + 64 + 296 = 368, and original RSP - 368 is 16-byte aligned) + alloc_stack 296 - ; Save argument registers to shadow space area at offset 224 - SAVE_ARGUMENT_REGISTERS 224 + ; Save argument registers at offset 256 (32 + 160 + 64) + SAVE_ARGUMENT_REGISTERS 256 - ; Save float argument registers at offset 160 - SAVE_FLOAT_ARGUMENT_REGISTERS 160 + ; Save float argument registers at offset 192 (32 + 160) + SAVE_FLOAT_ARGUMENT_REGISTERS 192 - ; Save FP callee-saved registers (xmm6-xmm15) at offset 0 - ; RSP is 16-byte aligned, so offset 0, 16, 32, ... are all 16-byte aligned + ; Save FP callee-saved registers (xmm6-xmm15) at offset 32 (after shadow space) + ; RSP is 16-byte aligned, so offset 32, 48, 64, ... are all 16-byte aligned ; AND these offsets are multiples of 16 as required by unwind codes - save_xmm128_postrsp xmm6, 0h - save_xmm128_postrsp xmm7, 10h - save_xmm128_postrsp xmm8, 20h - save_xmm128_postrsp xmm9, 30h - save_xmm128_postrsp xmm10, 40h - save_xmm128_postrsp xmm11, 50h - save_xmm128_postrsp xmm12, 60h - save_xmm128_postrsp xmm13, 70h - save_xmm128_postrsp xmm14, 80h - save_xmm128_postrsp xmm15, 90h + save_xmm128_postrsp xmm6, 20h + save_xmm128_postrsp xmm7, 30h + save_xmm128_postrsp xmm8, 40h + save_xmm128_postrsp xmm9, 50h + save_xmm128_postrsp xmm10, 60h + save_xmm128_postrsp xmm11, 70h + save_xmm128_postrsp xmm12, 80h + save_xmm128_postrsp xmm13, 90h + save_xmm128_postrsp xmm14, 0A0h + save_xmm128_postrsp xmm15, 0B0h END_PROLOGUE - ; TransitionBlock pointer points to CalleeSavedRegisters at rsp + 264 - lea target, [rsp + 264] + ; TransitionBlock pointer points to CalleeSavedRegisters at rsp + 296 + lea target, [rsp + 296] endm diff --git a/src/coreclr/vm/amd64/Context.asm b/src/coreclr/vm/amd64/Context.asm index 8edafc38f63d06..bdb6dbb3b3c684 100644 --- a/src/coreclr/vm/amd64/Context.asm +++ b/src/coreclr/vm/amd64/Context.asm @@ -27,6 +27,13 @@ NESTED_ENTRY ClrRestoreNonvolatileContextWorker, _TEXT test byte ptr 
[r10 + OFFSETOF__CONTEXT__ContextFlags], CONTEXT_FLOATING_POINT je Done_Restore_CONTEXT_FLOATING_POINT fxrstor [r10 + OFFSETOF__CONTEXT__FltSave] + ; Reset MXCSR to default value after restoring floating point context. + ; This is needed for contexts captured by Windows SEH (foreign thread/hardware exceptions), + ; which can have FltSave.MxCsr set to a bad value (e.g., 0x20 with FP exception masks cleared). + ; 0x1F80 = default MXCSR: all exception masks set, round to nearest, no exceptions pending + push 1F80h + ldmxcsr [rsp] + add rsp, 8 Done_Restore_CONTEXT_FLOATING_POINT: test byte ptr [r10 + OFFSETOF__CONTEXT__ContextFlags], CONTEXT_INTEGER diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index 044690933a0f55..e339ea68270898 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10777,8 +10777,8 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p // Read FP callee-saved registers (xmm6-xmm15) from the stack // They are stored at negative offsets from TransitionBlock: - // Layout: [xmm6-xmm15 (160 bytes)] [xmm0-xmm3 (64 bytes)] [shadow (32 bytes)] [padding (8 bytes)] [CalleeSavedRegs] [RetAddr] - // xmm6 is at sp+0, TransitionBlock is at sp+264, so xmm6 is at TransitionBlock - 264 + // Layout: [shadow (32)] [xmm6-xmm15 (160)] [xmm0-xmm3 (64)] [arg regs (32)] [padding (8)] [CalleeSavedRegs] [RetAddr] + // xmm6 is at sp+32, TransitionBlock is at sp+296, so xmm6 is at TransitionBlock - 264 M128A *pFpCalleeSaved = (M128A*)((BYTE*)pTransitionBlock - 264); m_Context.Xmm6 = pFpCalleeSaved[0]; From d39d981dff3f47cfffe4a0cf1be6d8708b890d03 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Thu, 15 Jan 2026 22:29:29 +0200 Subject: [PATCH 28/30] Remove MXCSR reset in Context.asm --- src/coreclr/vm/amd64/Context.asm | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/coreclr/vm/amd64/Context.asm b/src/coreclr/vm/amd64/Context.asm index 
bdb6dbb3b3c684..8edafc38f63d06 100644 --- a/src/coreclr/vm/amd64/Context.asm +++ b/src/coreclr/vm/amd64/Context.asm @@ -27,13 +27,6 @@ NESTED_ENTRY ClrRestoreNonvolatileContextWorker, _TEXT test byte ptr [r10 + OFFSETOF__CONTEXT__ContextFlags], CONTEXT_FLOATING_POINT je Done_Restore_CONTEXT_FLOATING_POINT fxrstor [r10 + OFFSETOF__CONTEXT__FltSave] - ; Reset MXCSR to default value after restoring floating point context. - ; This is needed for contexts captured by Windows SEH (foreign thread/hardware exceptions), - ; which can have FltSave.MxCsr set to a bad value (e.g., 0x20 with FP exception masks cleared). - ; 0x1F80 = default MXCSR: all exception masks set, round to nearest, no exceptions pending - push 1F80h - ldmxcsr [rsp] - add rsp, 8 Done_Restore_CONTEXT_FLOATING_POINT: test byte ptr [r10 + OFFSETOF__CONTEXT__ContextFlags], CONTEXT_INTEGER From 9275d5c2bdc2e7acf8d955afc4366f9db91bf378 Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Fri, 16 Jan 2026 01:28:25 +0200 Subject: [PATCH 29/30] Remove MXCSR reset before managed code invocation --- src/coreclr/vm/amd64/AsmHelpers.asm | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index 6996f11643e15a..dc8c5eeec42f91 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -523,14 +523,6 @@ NESTED_ENTRY CallEHFunclet, _TEXT ; Save the SP of this function. mov [r9], rsp - ; Reset MXCSR to default value before invoking managed code. - ; This is needed for foreign thread exceptions where CONTEXT is captured by Windows SEH, - ; which can leave MXCSR in a bad state (e.g., 0x20 with FP exception masks cleared). 
- ; 0x1F80 = default MXCSR: all exception masks set, round to nearest - push 1F80h - ldmxcsr [rsp] - add rsp, 8 - ; Invoke the funclet call rdx @@ -556,14 +548,6 @@ NESTED_ENTRY CallEHFilterFunclet, _TEXT ; Restore RBP to match main function RBP mov rbp, rdx - ; Reset MXCSR to default value before invoking managed code. - ; This is needed for foreign thread exceptions where CONTEXT is captured by Windows SEH, - ; which can leave MXCSR in a bad state (e.g., 0x20 with FP exception masks cleared). - ; 0x1F80 = default MXCSR: all exception masks set, round to nearest - push 1F80h - ldmxcsr [rsp] - add rsp, 8 - ; Invoke the filter funclet call r8 From a78797a237a338b91bebd5829c8b8f879a5afb9f Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Fri, 16 Jan 2026 07:00:29 +0000 Subject: [PATCH 30/30] Cleanups fb --- src/coreclr/vm/amd64/AsmMacros.inc | 16 +--------------- src/coreclr/vm/excep.cpp | 12 ------------ src/coreclr/vm/prestub.cpp | 4 ---- 3 files changed, 1 insertion(+), 31 deletions(-) diff --git a/src/coreclr/vm/amd64/AsmMacros.inc b/src/coreclr/vm/amd64/AsmMacros.inc index 470a346f20d6ab..07531371d6627d 100644 --- a/src/coreclr/vm/amd64/AsmMacros.inc +++ b/src/coreclr/vm/amd64/AsmMacros.inc @@ -495,23 +495,9 @@ POP_COOP_PINVOKE_FRAME macro ; Outgoing argument homes (32 bytes) ; FloatArgumentRegisters (xmm0-xmm3, 64 bytes) ; FP Callee-saved registers (xmm6-xmm15, 160 bytes) +; Shadow space for call (32 bytes) ; sp points here ; -; Stack alignment: After call (8) + callee-saved pushes (64) + alloc (256) = 328 bytes -; 328 mod 16 = 8, so RSP mod 16 = 8 after alloc - NOT 16-byte aligned. -; We need RSP to be 16-byte aligned for movaps AND unwind offsets must be multiples of 16. -; -; Solution: Use alloc_stack 272 (adds 16 bytes padding). 8 + 64 + 272 = 344, 344 mod 16 = 8. -; Wait, that's still not right. 
Let me recalculate: -; After call: RSP mod 16 = 8 (return addr pushed from 16-byte aligned stack) -; After 8 pushes (64 bytes): RSP mod 16 = (8 + 64) mod 16 = 72 mod 16 = 8 -; After alloc N: RSP mod 16 = (8 + N) mod 16 -; For RSP mod 16 = 0, need N mod 16 = 8 -; 256 mod 16 = 0, so RSP mod 16 = 8 (NOT aligned) -; 264 mod 16 = 8, so RSP mod 16 = 0 (aligned!) but offsets 0,16,32... work -; -; With alloc_stack 296: RSP is 16-byte aligned, XMM saves at 32, 48, 64, ... (multiples of 16) -; ; On exit, target contains the TransitionBlock pointer (CalleeSavedRegisters). PUSH_COOP_PINVOKE_FRAME_WITH_FLOATS macro target diff --git a/src/coreclr/vm/excep.cpp b/src/coreclr/vm/excep.cpp index e339ea68270898..8b670c6ea4911d 100644 --- a/src/coreclr/vm/excep.cpp +++ b/src/coreclr/vm/excep.cpp @@ -10768,13 +10768,6 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.SegSs = 0; m_Context.EFlags = 0; - // Argument registers are not saved in the transition block - m_Context.Rax = 0; - m_Context.Rcx = 0; - m_Context.Rdx = 0; - m_Context.R8 = 0; - m_Context.R9 = 0; - // Read FP callee-saved registers (xmm6-xmm15) from the stack // They are stored at negative offsets from TransitionBlock: // Layout: [shadow (32)] [xmm6-xmm15 (160)] [xmm0-xmm3 (64)] [arg regs (32)] [padding (8)] [CalleeSavedRegs] [RetAddr] @@ -10852,11 +10845,6 @@ void SoftwareExceptionFrame::UpdateContextFromTransitionBlock(TransitionBlock *p m_Context.D[8 + i] = pFpCalleeSaved[i]; } - // Initialize remaining D registers (D16-D31) to zero - these are caller-saved - for (int i = 16; i < 32; i++) - { - m_Context.D[i] = 0; - } // Initialize FP status/control register m_Context.Fpscr = 0; diff --git a/src/coreclr/vm/prestub.cpp b/src/coreclr/vm/prestub.cpp index 8056e88cdf899d..d5d4cc1daf8bba 100644 --- a/src/coreclr/vm/prestub.cpp +++ b/src/coreclr/vm/prestub.cpp @@ -43,10 +43,6 @@ #include "gdbjit.h" #endif // FEATURE_GDBJIT -#if defined(TARGET_AMD64) && defined(TARGET_WINDOWS) 
-#include -#endif - #ifndef DACCESS_COMPILE EXTERN_C void STDCALL ThePreStubPatch();