From 4620ee65e16866e64bc64ea1194b77f0b53a6fa8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 18:44:26 +0000 Subject: [PATCH 01/15] Initial plan From b8e515377659bf35ae9599c3c5de7c8d3bebf881 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 19:18:55 +0000 Subject: [PATCH 02/15] Save all volatile argument registers (RDX, R8, R9) in GC probe hijack frames On AMD64, when the GC hijacks a thread via RhpGcProbeHijack, the hijack stub creates a PInvokeTransitionFrame. Previously on Windows, only RAX and RCX were saved, and on Unix only RAX, RCX, and RDX were saved. If the GC info for the managed frame at the hijack point reports other volatile registers (RDX, R8, R9) as live GC references, the StackFrameIterator would find NULL save locations and crash (AV in SVR::GCHeap::Promote). Windows changes: - FixupHijackedCallstack now uses R10/R11 for thread pointer instead of RDX/R8, preserving all volatile argument registers (RAX, RCX, RDX, R8, R9) - PUSH_PROBE_FRAME/POP_PROBE_FRAME extended to save/restore RDX, R8, R9 - RhpGcProbeHijack flags include PTFF_SAVE_RDX + PTFF_SAVE_R8 + PTFF_SAVE_R9 - AsmMacros.inc gains PTFF_SAVE_RDX, PTFF_SAVE_R8, PTFF_SAVE_R9 definitions Unix changes: - FixupHijackedCallstack now also saves/restores R8 and R9 across GETTHREAD - Uses R10 instead of R8 for scratch in hijack fixup and bitmask passing - PUSH_PROBE_FRAME/POP_PROBE_FRAME extended to save/restore R8, R9 - RhpGcProbeHijack flags include PTFF_SAVE_R8 + PTFF_SAVE_R9 - unixasmmacrosamd64.inc gains PTFF_SAVE_R8, PTFF_SAVE_R9 definitions Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/411be3e7-43a1-4bc3-b896-e27d92fe37c3 Co-authored-by: mangod9 <61718172+mangod9@users.noreply.github.com> --- .../nativeaot/Runtime/amd64/AsmMacros.inc | 3 + src/coreclr/nativeaot/Runtime/amd64/GcProbe.S | 40 +++++++++----- .../nativeaot/Runtime/amd64/GcProbe.asm | 55 +++++++++++-------- .../Runtime/unix/unixasmmacrosamd64.inc | 2 + 4 files changed, 62 insertions(+), 38 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc b/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc index 37e1f6d000dfcb..41b05098f86e7c 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc +++ b/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc @@ -359,6 +359,9 @@ PTFF_SAVE_ALL_PRESERVED equ 000000F7h ;; NOTE: RBP is not included in this set PTFF_SAVE_RSP equ 00008000h PTFF_SAVE_RAX equ 00000100h ;; RAX is saved in hijack handler - in case it contains a GC ref PTFF_SAVE_RCX equ 00000200h ;; RCX is saved in hijack handler - in case it contains a GC ref +PTFF_SAVE_RDX equ 00000400h ;; RDX is saved in hijack handler - in case it contains a GC ref +PTFF_SAVE_R8 equ 00000800h ;; R8 is saved in hijack handler - in case it contains a GC ref +PTFF_SAVE_R9 equ 00001000h ;; R9 is saved in hijack handler - in case it contains a GC ref PTFF_SAVE_ALL_SCRATCH equ 00007F00h PTFF_THREAD_HIJACK equ 00100000h ;; indicates that this is a frame for a hijacked call diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S index 07f43434a8532d..d31420b6148703 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S @@ -6,12 +6,13 @@ #include // -// See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves RAX/RCX/RDX and accepts the register -// bitmask in R8 +// See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves volatile argument registers +// and accepts the register bitmask // // On entry: // - BITMASK: bitmask describing pushes, may be volatile register or constant value // - RAX: managed function return value, may be an object or byref +// - RCX, RDX, R8, R9: may contain objects or byrefs at the hijack point // - preserved regs: need to stay preserved, may contain objects or byrefs // // INVARIANTS @@ -19,10 +20,12 @@ // - All preserved registers remain unchanged from their values in managed code. // .macro PUSH_PROBE_FRAME threadReg, trashReg, BITMASK + push_register r9 // save R9, it might contain an objectref + push_register r8 // save R8, it might contain an objectref push_register rdx // save RDX, it might contain an objectref push_register rcx // save RCX, it might contain an objectref (async continuation) push_register rax // save RAX, it might contain an objectref - lea \trashReg, [rsp + 0x20] + lea \trashReg, [rsp + 0x30] push_register \trashReg // save caller`s RSP push_nonvol_reg r15 // save preserved registers push_nonvol_reg r14 // .. @@ -32,11 +35,11 @@ push_register \BITMASK // save the register bitmask passed in by caller push_register \threadReg // Thread * (unused by stackwalker) push_nonvol_reg rbp // save caller`s RBP - mov \trashReg, [rsp + 12*8] // Find the return address + mov \trashReg, [rsp + 14*8] // Find the return address push_register \trashReg // save m_RIP lea \trashReg, [rsp + 0] // trashReg == address of frame - // allocate space for xmm0, xmm1 and alignment + // allocate space for xmm0, xmm1 alloc_stack 0x20 + 0 // save xmm0 and xmm1 in case they are used as return values @@ -67,6 +70,8 @@ pop rax pop rcx pop rdx + pop r8 + pop r9 .endm // @@ -78,16 +83,20 @@ // // Register state on exit: // R11: thread pointer -// RAX, RCX, RDX preserved, other volatile regs trashed +// RAX, RCX, RDX, R8, R9 preserved, R10 trashed // .macro FixupHijackedCallstack - // preserve RAX, RDX as they may contain return values + // preserve volatile argument registers across INLINE_GETTHREAD push rax push rdx // preserve RCX as it may contain async continuation return value push rcx + // preserve R8 and R9 as they may contain GC refs + push r8 + push r9 + // align stack sub rsp, 0x8 @@ -97,19 +106,22 @@ add rsp, 0x8 + pop r9 + pop r8 + pop rcx pop rdx pop rax // Fix the stack by pushing the original return address - mov r8, [r11 + OFFSETOF__Thread__m_pvHijackedReturnAddress] - push r8 + mov r10, [r11 + OFFSETOF__Thread__m_pvHijackedReturnAddress] + push r10 // Clear hijack state - xor r8, r8 - mov [r11 + OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation], r8 - mov [r11 + OFFSETOF__Thread__m_pvHijackedReturnAddress], r8 + xor r10, r10 + mov [r11 + OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation], r10 + mov [r11 + OFFSETOF__Thread__m_pvHijackedReturnAddress], r10 .endm // @@ -124,12 +136,12 @@ NESTED_ENTRY RhpGcProbeHijack, _TEXT, NoHandler ret LOCAL_LABEL(WaitForGC): - mov r8d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_SAVE_RDX + PTFF_THREAD_HIJACK + mov r10d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_SAVE_RDX + PTFF_SAVE_R8 + PTFF_SAVE_R9 + PTFF_THREAD_HIJACK jmp C_FUNC(RhpWaitForGC) NESTED_END RhpGcProbeHijack, _TEXT NESTED_ENTRY RhpWaitForGC, _TEXT, NoHandler - PUSH_PROBE_FRAME r11, rax, r8 + PUSH_PROBE_FRAME r11, rax, r10 END_PROLOGUE mov rbx, r11 diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm index b66d950b1f7327..084f91da193fce 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm +++ b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm @@ -4,13 +4,14 @@ include AsmMacros.inc ;; -;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves RAX/RCX and accepts -;; the register bitmask +;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves volatile argument registers +;; and accepts the register bitmask ;; ;; On entry: ;; - BITMASK: bitmask describing pushes, a volatile register ;; - RAX: managed function return value, may be an object or byref ;; - RCX: managed function return value (async continuation), may be an object +;; - RDX, R8, R9: may contain objects or byrefs at the hijack point ;; - preserved regs: need to stay preserved, may contain objects or byrefs ;; ;; INVARIANTS @@ -19,9 +20,12 @@ include AsmMacros.inc ;; PUSH_PROBE_FRAME macro threadReg, trashReg, BITMASK + push_vol_reg r9 ; save R9, it might contain an objectref + push_vol_reg r8 ; save R8, it might contain an objectref + push_vol_reg rdx ; save RDX, it might contain an objectref push_vol_reg rcx ; save RCX, it might contain an objectref (async continuation) push_vol_reg rax ; save RAX, it might contain an objectref - lea trashReg, [rsp + 18h] + lea trashReg, [rsp + 30h] push_vol_reg trashReg ; save caller's RSP push_nonvol_reg r15 ; save preserved registers push_nonvol_reg r14 ; .. @@ -33,12 +37,12 @@ PUSH_PROBE_FRAME macro threadReg, trashReg, BITMASK push_vol_reg BITMASK ; save the register bitmask passed in by caller push_vol_reg threadReg ; Thread * (unused by stackwalker) push_nonvol_reg rbp ; save caller's RBP - mov trashReg, [rsp + 13*8] ; Find the return address + mov trashReg, [rsp + 16*8] ; Find the return address push_vol_reg trashReg ; save m_RIP lea trashReg, [rsp + 0] ; trashReg == address of frame - ;; allocate scratch space and any required alignment - alloc_stack 20h + 10h + 8 + ;; allocate scratch space + alloc_stack 20h + 10h ;; save xmm0 in case it's being used as a return value movdqa [rsp + 20h], xmm0 @@ -54,7 +58,7 @@ endm ;; POP_PROBE_FRAME macro movdqa xmm0, [rsp + 20h] - add rsp, 20h + 10h + 8 + 8 ; deallocate stack and discard saved m_RIP + add rsp, 20h + 10h + 8 ; deallocate scratch space and discard saved m_RIP pop rbp pop rax ; discard Thread* pop rax ; discard BITMASK @@ -68,6 +72,9 @@ POP_PROBE_FRAME macro pop rax ; discard caller RSP pop rax pop rcx + pop rdx + pop r8 + pop r9 endm ;; @@ -78,21 +85,21 @@ endm ;; All registers correct for return to the original return address. ;; ;; Register state on exit: -;; RDX: thread pointer -;; RAX/RCX: preserved, other volatile regs trashed +;; R10: thread pointer +;; RAX/RCX/RDX/R8/R9: preserved, R11 trashed ;; FixupHijackedCallstack macro - ;; rdx <- GetThread(), TRASHES r8 - INLINE_GETTHREAD rdx, r8 + ;; r10 <- GetThread(), TRASHES r11 + INLINE_GETTHREAD r10, r11 ;; Fix the stack by pushing the original return address - mov r8, [rdx + OFFSETOF__Thread__m_pvHijackedReturnAddress] - push r8 + mov r11, [r10 + OFFSETOF__Thread__m_pvHijackedReturnAddress] + push r11 ;; Clear hijack state - xor r8, r8 - mov [rdx + OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation], r8 - mov [rdx + OFFSETOF__Thread__m_pvHijackedReturnAddress], r8 + xor r11, r11 + mov [r10 + OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation], r11 + mov [r10 + OFFSETOF__Thread__m_pvHijackedReturnAddress], r11 endm ;; @@ -106,15 +113,15 @@ NESTED_ENTRY RhpGcProbeHijack, _TEXT jnz @f ret @@: - mov r8d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_THREAD_HIJACK + mov r11d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_SAVE_RDX + PTFF_SAVE_R8 + PTFF_SAVE_R9 + PTFF_THREAD_HIJACK jmp RhpWaitForGC NESTED_END RhpGcProbeHijack, _TEXT NESTED_ENTRY RhpWaitForGC, _TEXT - PUSH_PROBE_FRAME rdx, rax, r8 + PUSH_PROBE_FRAME r10, rax, r11 END_PROLOGUE - mov rbx, rdx + mov rbx, r10 mov rcx, [rbx + OFFSETOF__Thread__m_pDeferredTransitionFrame] call RhpWaitForGC2 @@ -147,7 +154,7 @@ ifdef FEATURE_GC_STRESS ;; LEAF_ENTRY RhpGcStressHijack, _TEXT FixupHijackedCallstack - mov r8d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + mov r11d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_SAVE_RDX + PTFF_SAVE_R8 + PTFF_SAVE_R9 jmp RhpGcStressProbe LEAF_END RhpGcStressHijack, _TEXT @@ -157,15 +164,15 @@ LEAF_END RhpGcStressHijack, _TEXT ;; This worker performs the GC Stress work and returns to the original return address. ;; ;; Register state on entry: -;; RDX: thread pointer -;; R8: register bitmask +;; R10: thread pointer +;; R11: register bitmask ;; ;; Register state on exit: -;; Scratch registers, except for RAX/RCX, have been trashed +;; Scratch registers, except for RAX/RCX/RDX/R8/R9, have been trashed ;; All other registers restored as they were when the hijack was first reached. ;; NESTED_ENTRY RhpGcStressProbe, _TEXT - PUSH_PROBE_FRAME rdx, rax, r8 + PUSH_PROBE_FRAME r10, rax, r11 END_PROLOGUE call RhpStressGc diff --git a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc index 7138bfc4326fea..fb0d14377a4895 100644 --- a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc +++ b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc @@ -245,6 +245,8 @@ C_FUNC(\Name): #define PTFF_SAVE_RAX 0x00000100 // RAX is saved in hijack handler - in case it contains a GC ref #define PTFF_SAVE_RCX 0x00000200 // RCX is saved in hijack handler - in case it contains a GC ref #define PTFF_SAVE_RDX 0x00000400 // RDX is saved in hijack handler - in case it contains a GC ref +#define PTFF_SAVE_R8 0x00000800 // R8 is saved in hijack handler - in case it contains a GC ref +#define PTFF_SAVE_R9 0x00001000 // R9 is saved in hijack handler - in case it contains a GC ref #define PTFF_SAVE_ALL_SCRATCH 0x00007F00 #define PTFF_THREAD_HIJACK 0x00100000 // indicates that this is a frame for a hijacked call From 4184db5460fd9ff655a20ba4b4e97fc656401bb9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 19:25:32 +0000 Subject: [PATCH 03/15] Improve alloc_stack comment to describe what is being allocated Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/411be3e7-43a1-4bc3-b896-e27d92fe37c3 Co-authored-by: mangod9 <61718172+mangod9@users.noreply.github.com> --- src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm index 084f91da193fce..e611bb8b42b5b1 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm +++ b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm @@ -41,7 +41,7 @@ PUSH_PROBE_FRAME macro threadReg, trashReg, BITMASK push_vol_reg trashReg ; save m_RIP lea trashReg, [rsp + 0] ; trashReg == address of frame - ;; allocate scratch space + ;; allocate scratch space (20h home space + 10h for xmm0) alloc_stack 20h + 10h ;; save xmm0 in case it's being used as a return value From d671e50d29df716a17cbeee86ebf441a2f0b5224 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 21:27:44 +0000 Subject: [PATCH 04/15] Also save RSI and RDI in Unix AMD64 GC probe hijack frames On Unix System V ABI, RSI and RDI are volatile argument-passing registers (1st and 2nd integer args) that could contain live GC references at hijack points. On Windows, they are already saved as callee-saved registers in PTFF_SAVE_ALL_PRESERVED. Unix changes: - FixupHijackedCallstack now saves/restores RSI and RDI across GETTHREAD - PUSH_PROBE_FRAME saves RSI and RDI between R12 and RBX, matching the StackFrameIterator's flag processing order (RBX, RSI, RDI, R12...) - POP_PROBE_FRAME restores RSI and RDI in correct order - RhpGcProbeHijack flags include PTFF_SAVE_RSI + PTFF_SAVE_RDI - unixasmmacrosamd64.inc gains PTFF_SAVE_RSI and PTFF_SAVE_RDI definitions Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/85ca9ec1-057b-4a3c-8182-e6da274d44b6 Co-authored-by: mangod9 <61718172+mangod9@users.noreply.github.com> --- src/coreclr/nativeaot/Runtime/amd64/GcProbe.S | 18 +++++++++++++----- .../Runtime/unix/unixasmmacrosamd64.inc | 2 ++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S index d31420b6148703..610b7df89312fe 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S @@ -12,7 +12,7 @@ // On entry: // - BITMASK: bitmask describing pushes, may be volatile register or constant value // - RAX: managed function return value, may be an object or byref -// - RCX, RDX, R8, R9: may contain objects or byrefs at the hijack point +// - RSI, RDI, RCX, RDX, R8, R9: may contain objects or byrefs at the hijack point // - preserved regs: need to stay preserved, may contain objects or byrefs // // INVARIANTS @@ -31,11 +31,13 @@ push_nonvol_reg r14 // .. push_nonvol_reg r13 // .. push_nonvol_reg r12 // .. + push_register rdi // save RDI, volatile on Unix, might contain an objectref + push_register rsi // save RSI, volatile on Unix, might contain an objectref push_nonvol_reg rbx // .. push_register \BITMASK // save the register bitmask passed in by caller push_register \threadReg // Thread * (unused by stackwalker) push_nonvol_reg rbp // save caller`s RBP - mov \trashReg, [rsp + 14*8] // Find the return address + mov \trashReg, [rsp + 16*8] // Find the return address push_register \trashReg // save m_RIP lea \trashReg, [rsp + 0] // trashReg == address of frame @@ -62,6 +64,8 @@ pop rax // discard Thread* pop rax // discard BITMASK pop rbx + pop rsi + pop rdi pop r12 pop r13 pop r14 @@ -83,7 +87,7 @@ // // Register state on exit: // R11: thread pointer -// RAX, RCX, RDX, R8, R9 preserved, R10 trashed +// RAX, RCX, RDX, RSI, RDI, R8, R9 preserved, R10 trashed // .macro FixupHijackedCallstack // preserve volatile argument registers across INLINE_GETTHREAD @@ -93,7 +97,9 @@ // preserve RCX as it may contain async continuation return value push rcx - // preserve R8 and R9 as they may contain GC refs + // preserve RSI, RDI, R8 and R9 as they may contain GC refs + push rsi + push rdi push r8 push r9 @@ -108,6 +114,8 @@ pop r9 pop r8 + pop rdi + pop rsi pop rcx @@ -136,7 +144,7 @@ NESTED_ENTRY RhpGcProbeHijack, _TEXT, NoHandler ret LOCAL_LABEL(WaitForGC): - mov r10d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_SAVE_RDX + PTFF_SAVE_R8 + PTFF_SAVE_R9 + PTFF_THREAD_HIJACK + mov r10d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RSI + PTFF_SAVE_RDI + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_SAVE_RDX + PTFF_SAVE_R8 + PTFF_SAVE_R9 + PTFF_THREAD_HIJACK jmp C_FUNC(RhpWaitForGC) NESTED_END RhpGcProbeHijack, _TEXT diff --git a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc index fb0d14377a4895..a7cc2943ebffa9 100644 --- a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc +++ b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc @@ -242,6 +242,8 @@ C_FUNC(\Name): #define PTFF_SAVE_R15 0x00000080 #define PTFF_SAVE_ALL_PRESERVED 0x000000F1 // NOTE: RBP is not included in this set! #define PTFF_SAVE_RSP 0x00008000 +#define PTFF_SAVE_RSI 0x00000002 // RSI is saved in hijack handler - in case it contains a GC ref +#define PTFF_SAVE_RDI 0x00000004 // RDI is saved in hijack handler - in case it contains a GC ref #define PTFF_SAVE_RAX 0x00000100 // RAX is saved in hijack handler - in case it contains a GC ref #define PTFF_SAVE_RCX 0x00000200 // RCX is saved in hijack handler - in case it contains a GC ref #define PTFF_SAVE_RDX 0x00000400 // RDX is saved in hijack handler - in case it contains a GC ref From c80100deaf4b2020186cbe56a5c932028437875b Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Mon, 13 Apr 2026 19:35:39 -0700 Subject: [PATCH 05/15] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/coreclr/nativeaot/Runtime/amd64/GcProbe.S | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S index 610b7df89312fe..205c126c72d83c 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S @@ -91,36 +91,36 @@ // .macro FixupHijackedCallstack // preserve volatile argument registers across INLINE_GETTHREAD - push rax - push rdx + push_register rax + push_register rdx // preserve RCX as it may contain async continuation return value - push rcx + push_register rcx // preserve RSI, RDI, R8 and R9 as they may contain GC refs - push rsi - push rdi - push r8 - push r9 + push_register rsi + push_register rdi + push_register r8 + push_register r9 // align stack - sub rsp, 0x8 + alloc_stack 0x8 // rax = GetThread(), makes nested calls INLINE_GETTHREAD mov r11, rax - add rsp, 0x8 + free_stack 0x8 - pop r9 - pop r8 - pop rdi - pop rsi + pop_register r9 + pop_register r8 + pop_register rdi + pop_register rsi - pop rcx + pop_register rcx - pop rdx - pop rax + pop_register rdx + pop_register rax // Fix the stack by pushing the original return address mov r10, [r11 + OFFSETOF__Thread__m_pvHijackedReturnAddress] From 87b210ec3d44b72e4967ea5210f88770711c2b5e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Apr 2026 03:02:53 +0000 Subject: [PATCH 06/15] Use pop_register/pop_nonvol_reg/free_stack in POP_PROBE_FRAME for CFI correctness Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/a7c68e06-600b-4b18-94f3-4405e5cac54c Co-authored-by: jkotas <6668460+jkotas@users.noreply.github.com> --- src/coreclr/nativeaot/Runtime/amd64/GcProbe.S | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S index 205c126c72d83c..f7585fdd6f4ba9 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S @@ -59,23 +59,23 @@ .macro POP_PROBE_FRAME movdqa xmm1, [rsp + 0] movdqa xmm0, [rsp + 0x10] - add rsp, 0x20 + 8 // skip xmm0, xmm1 and discard RIP - pop rbp - pop rax // discard Thread* - pop rax // discard BITMASK - pop rbx - pop rsi - pop rdi - pop r12 - pop r13 - pop r14 - pop r15 - pop rax // discard caller RSP - pop rax - pop rcx - pop rdx - pop r8 - pop r9 + free_stack 0x20 + 8 // skip xmm0, xmm1 and discard RIP + pop_nonvol_reg rbp + pop_register rax // discard Thread* + pop_register rax // discard BITMASK + pop_nonvol_reg rbx + pop_register rsi + pop_register rdi + pop_nonvol_reg r12 + pop_nonvol_reg r13 + pop_nonvol_reg r14 + pop_nonvol_reg r15 + pop_register rax // discard caller RSP + pop_register rax + pop_register rcx + pop_register rdx + pop_register r8 + pop_register r9 .endm // From a2a12bbf50095d6039e367a29527330b47dbaa8f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Apr 2026 03:29:15 +0000 Subject: [PATCH 07/15] Save volatile argument registers x3-x7 in GC probe hijack frames on ARM64 Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/9a42d894-2da9-4674-90a7-581df544b3ee Co-authored-by: mangod9 <61718172+mangod9@users.noreply.github.com> --- src/coreclr/nativeaot/Runtime/arm64/GcProbe.S | 76 +++++++++++-------- .../nativeaot/Runtime/arm64/GcProbe.asm | 61 ++++++++------- .../Runtime/unix/unixasmmacrosarm64.inc | 5 ++ 3 files changed, 83 insertions(+), 59 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S index 093a3ec84ecfbc..d7e628bff70128 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S @@ -4,13 +4,14 @@ #include #include "AsmOffsets.inc" -#define PROBE_FRAME_SIZE 0xD0 // 4 * 8 for fixed part of PInvokeTransitionFrame (fp, lr, m_pThread, m_Flags) + - // 10 * 8 for callee saved registers + - // 1 * 8 for caller SP + - // 3 * 8 for int returns (x0, x1, x2) + - // 4 * 16 for FP/HFA/HVA returns - -// See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return registers +#define PROBE_FRAME_SIZE 0x100 // 4 * 8 for fixed part of PInvokeTransitionFrame (fp, lr, m_pThread, m_Flags) + + // 10 * 8 for callee saved registers + + // 1 * 8 for caller SP + + // 8 * 8 for int return/argument regs (x0..x7) + + // 1 * 8 for padding (16-byte alignment for q regs) + + // 4 * 16 for FP/HFA/HVA returns + +// See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return/argument registers // and accepts the register bitmask // Call this macro first in the method (no further prolog instructions can be added after this). // @@ -37,13 +38,15 @@ // Slot at [sp, #0x70] is reserved for caller sp - // Save the integer return registers, x2 might contain an objectref (async continuation) + // Save the integer return/argument registers stp x0, x1, [sp, #0x78] - str x2, [sp, #0x88] + stp x2, x3, [sp, #0x88] + stp x4, x5, [sp, #0x98] + stp x6, x7, [sp, #0xA8] // Save the FP/HFA/HVA return registers - stp q0, q1, [sp, #0x90] - stp q2, q3, [sp, #0xB0] + stp q0, q1, [sp, #0xC0] + stp q2, q3, [sp, #0xE0] // Perform the rest of the PInvokeTransitionFrame initialization. // str \threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] // Thread * (unused by stackwalker) @@ -65,13 +68,15 @@ // .macro POP_PROBE_FRAME - // Restore the integer return registers + // Restore the integer return/argument registers ldp x0, x1, [sp, #0x78] - ldr x2, [sp, #0x88] + ldp x2, x3, [sp, #0x88] + ldp x4, x5, [sp, #0x98] + ldp x6, x7, [sp, #0xA8] // Restore the FP/HFA/HVA return registers - ldp q0, q1, [sp, #0x90] - ldp q2, q3, [sp, #0xB0] + ldp q0, q1, [sp, #0xC0] + ldp q2, q3, [sp, #0xE0] // Restore callee saved registers EPILOG_RESTORE_REG_PAIR x19, x20, 0x20 @@ -91,32 +96,43 @@ // All registers correct for return to the original return address. // // Register state on exit: -// x4: thread pointer -// x0, x1, x2: preserved +// x9: thread pointer +// x0-x7 preserved, x10 trashed // .macro FixupHijackedCallstack - // Store thread pointer temporarily in x3, then move it to x4 - // x4 <- GetThread() + // x9 <- GetThread() #ifdef FEATURE_EMULATED_TLS - GETTHREAD_ETLS_3 + // Save x0-x7 around RhpGetThread call (it trashes all scratch registers) + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -0x50 + stp x0, x1, [sp, #0x10] + stp x2, x3, [sp, #0x20] + stp x4, x5, [sp, #0x30] + stp x6, x7, [sp, #0x40] + + bl C_FUNC(RhpGetThread) + mov x9, x0 + + ldp x0, x1, [sp, #0x10] + ldp x2, x3, [sp, #0x20] + ldp x4, x5, [sp, #0x30] + ldp x6, x7, [sp, #0x40] + EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 0x50 #else - INLINE_GETTHREAD x3 + INLINE_GETTHREAD x9 #endif - mov x4, x3 - // // Fix the stack by restoring the original return address // - ldr lr, [x4, #OFFSETOF__Thread__m_pvHijackedReturnAddress] + ldr lr, [x9, #OFFSETOF__Thread__m_pvHijackedReturnAddress] // // Clear hijack state // // Clear m_ppvHijackedReturnAddressLocation and m_pvHijackedReturnAddress - stp xzr, xzr, [x4, #OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation] + stp xzr, xzr, [x9, #OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation] .endm // @@ -125,12 +141,12 @@ NESTED_ENTRY RhpGcProbeHijack, _TEXT, NoHandler FixupHijackedCallstack - PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 3 - tbnz x3, #TrapThreadsFlags_TrapThreads_Bit, LOCAL_LABEL(WaitForGC) + PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 10 + tbnz x10, #TrapThreadsFlags_TrapThreads_Bit, LOCAL_LABEL(WaitForGC) ret LOCAL_LABEL(WaitForGC): - mov x12, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2 + mov x12, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2 + PTFF_SAVE_X3 + PTFF_SAVE_X4 + PTFF_SAVE_X5 + PTFF_SAVE_X6 + PTFF_SAVE_X7 movk x12, PTFF_THREAD_HIJACK_HI, lsl #32 b C_FUNC(RhpWaitForGC) NESTED_END RhpGcProbeHijack @@ -138,9 +154,9 @@ NESTED_END RhpGcProbeHijack .global C_FUNC(RhpThrowHwEx) NESTED_ENTRY RhpWaitForGC, _TEXT, NoHandler - PUSH_PROBE_FRAME x4, x3, x12 + PUSH_PROBE_FRAME x9, x10, x12 - ldr x0, [x4, #OFFSETOF__Thread__m_pDeferredTransitionFrame] + ldr x0, [x9, #OFFSETOF__Thread__m_pDeferredTransitionFrame] bl C_FUNC(RhpWaitForGC2) POP_PROBE_FRAME diff --git a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm index 0ebcfe1c675d22..1ab7225dca9d60 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm +++ b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm @@ -14,11 +14,12 @@ field OFFSETOF__PInvokeTransitionFrame__m_PreservedRegs field 10 * 8 ; x19..x28 m_CallersSP field 8 ; SP at routine entry - field 3 * 8 ; x0..x2 + field 8 * 8 ; x0..x7 + field 8 ; padding for 16-byte alignment of q regs field 4 * 16; q0..q3 PROBE_FRAME_SIZE field 0 - ;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return registers + ;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return/argument registers ;; and accepts the register bitmask ;; Call this macro first in the method (no further prolog instructions can be added after this). ;; @@ -46,13 +47,15 @@ PROBE_FRAME_SIZE field 0 ;; Slot at [sp, #0x70] is reserved for caller sp - ;; Save the integer return registers, x2 might contain an objectref (async continuation) + ;; Save the integer return/argument registers PROLOG_NOP stp x0, x1, [sp, #0x78] - PROLOG_NOP str x2, [sp, #0x88] + PROLOG_NOP stp x2, x3, [sp, #0x88] + PROLOG_NOP stp x4, x5, [sp, #0x98] + PROLOG_NOP stp x6, x7, [sp, #0xA8] ;; Save the FP/HFA/HVA return registers - PROLOG_NOP stp q0, q1, [sp, #0x90] - PROLOG_NOP stp q2, q3, [sp, #0xB0] + PROLOG_NOP stp q0, q1, [sp, #0xC0] + PROLOG_NOP stp q2, q3, [sp, #0xE0] ;; Perform the rest of the PInvokeTransitionFrame initialization. ;; str $threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] ; Thread * (unused by stackwalker) @@ -76,13 +79,15 @@ PROBE_FRAME_SIZE field 0 MACRO POP_PROBE_FRAME - ;; Restore the integer return registers + ;; Restore the integer return/argument registers PROLOG_NOP ldp x0, x1, [sp, #0x78] - PROLOG_NOP ldr x2, [sp, #0x88] + PROLOG_NOP ldp x2, x3, [sp, #0x88] + PROLOG_NOP ldp x4, x5, [sp, #0x98] + PROLOG_NOP ldp x6, x7, [sp, #0xA8] ; Restore the FP/HFA/HVA return registers - EPILOG_NOP ldp q0, q1, [sp, #0x90] - EPILOG_NOP ldp q2, q3, [sp, #0xB0] + EPILOG_NOP ldp q0, q1, [sp, #0xC0] + EPILOG_NOP ldp q2, q3, [sp, #0xE0] ;; Restore callee saved registers EPILOG_RESTORE_REG_PAIR x19, x20, #0x20 @@ -102,26 +107,26 @@ PROBE_FRAME_SIZE field 0 ;; All registers correct for return to the original return address. ;; ;; Register state on exit: -;; x4: thread pointer -;; x3: trashed +;; x9: thread pointer +;; x0-x7 preserved, x10 trashed ;; MACRO FixupHijackedCallstack - ;; x4 <- GetThread(), TRASHES x3 - INLINE_GETTHREAD x4, x3 + ;; x9 <- GetThread(), TRASHES x10 + INLINE_GETTHREAD x9, x10 ;; ;; Fix the stack by restoring the original return address ;; - ldr lr, [x4, #OFFSETOF__Thread__m_pvHijackedReturnAddress] + ldr lr, [x9, #OFFSETOF__Thread__m_pvHijackedReturnAddress] ;; ;; Clear hijack state ;; ASSERT OFFSETOF__Thread__m_pvHijackedReturnAddress == (OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation + 8) ;; Clear m_ppvHijackedReturnAddressLocation and m_pvHijackedReturnAddress - stp xzr, xzr, [x4, #OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation] + stp xzr, xzr, [x9, #OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation] MEND MACRO @@ -147,13 +152,13 @@ PROBE_FRAME_SIZE field 0 LABELED_RETURN_ADDRESS RhpGcProbeHijack FixupHijackedCallstack - ldr x3, =RhpTrapThreads - ldr w3, [x3] - tbnz x3, #TrapThreadsFlags_TrapThreads_Bit, WaitForGC + ldr x10, =RhpTrapThreads + ldr w10, [x10] + tbnz x10, #TrapThreadsFlags_TrapThreads_Bit, WaitForGC ret WaitForGC - mov x12, #(DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2) + mov x12, #(DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2 + PTFF_SAVE_X3 + PTFF_SAVE_X4 + PTFF_SAVE_X5 + PTFF_SAVE_X6 + PTFF_SAVE_X7) movk x12, #PTFF_THREAD_HIJACK_HI, lsl #32 b RhpWaitForGC NESTED_END RhpGcProbeHijackWrapper @@ -161,9 +166,9 @@ WaitForGC EXTERN RhpThrowHwEx NESTED_ENTRY RhpWaitForGC - PUSH_PROBE_FRAME x4, x3, x12 + PUSH_PROBE_FRAME x9, x10, x12 - ldr x0, [x4, #OFFSETOF__Thread__m_pDeferredTransitionFrame] + ldr x0, [x9, #OFFSETOF__Thread__m_pDeferredTransitionFrame] bl RhpWaitForGC2 POP_PROBE_FRAME @@ -193,7 +198,7 @@ WaitForGC ;; LEAF_ENTRY RhpGcStressHijack FixupHijackedCallstack - mov x12, #(DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2) + mov x12, #(DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2 + PTFF_SAVE_X3 + PTFF_SAVE_X4 + PTFF_SAVE_X5 + PTFF_SAVE_X6 + PTFF_SAVE_X7) b RhpGcStressProbe LEAF_END RhpGcStressHijack ;; @@ -202,18 +207,16 @@ WaitForGC ;; This worker performs the GC Stress work and returns to the original return address. ;; ;; Register state on entry: -;; x0: hijacked function return value -;; x1: hijacked function return value -;; x2: hijacked function async continuation value -;; x4: thread pointer +;; x0-x7: hijacked function return/argument values +;; x9: thread pointer ;; w12: register bitmask ;; ;; Register state on exit: -;; Scratch registers, except for x0, x1, x2, have been trashed +;; Scratch registers, except for x0-x7, have been trashed ;; All other registers restored as they were when the hijack was first reached. ;; NESTED_ENTRY RhpGcStressProbe - PUSH_PROBE_FRAME x4, x3, x12 + PUSH_PROBE_FRAME x9, x10, x12 bl RhpStressGc diff --git a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc index 5bf15b8e8f6a79..3027a17d6c8fc1 100644 --- a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc +++ b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc @@ -331,6 +331,11 @@ C_FUNC(\Name): #define PTFF_SAVE_X0 0x00000800 #define PTFF_SAVE_X1 0x00001000 #define PTFF_SAVE_X2 0x00002000 +#define PTFF_SAVE_X3 0x00004000 +#define PTFF_SAVE_X4 0x00008000 +#define PTFF_SAVE_X5 0x00010000 +#define PTFF_SAVE_X6 0x00020000 +#define PTFF_SAVE_X7 0x00040000 #define PTFF_SAVE_ALL_PRESERVED 0x000003FF // NOTE: x19-x28 #define PTFF_THREAD_HIJACK_HI 0x00000001 // upper 32 bits of the PTFF_THREAD_HIJACK From b1486c363c16207aa0070f6edfbe31e8bc6b72eb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Apr 2026 15:14:53 +0000 Subject: [PATCH 08/15] Update RBM_INTERFACELOOKUP_FOR_SLOT_TRASH and save all FP argument registers in probe frames Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/2bbc56c9-47e8-42f8-903e-51a79bbb96c6 Co-authored-by: jakobbotsch <7887810+jakobbotsch@users.noreply.github.com> --- src/coreclr/jit/targetamd64.h | 2 +- src/coreclr/jit/targetarm64.h | 2 +- src/coreclr/nativeaot/Runtime/amd64/GcProbe.S | 56 +++++++++++++++---- .../nativeaot/Runtime/amd64/GcProbe.asm | 18 ++++-- src/coreclr/nativeaot/Runtime/arm64/GcProbe.S | 40 ++++++++++--- .../nativeaot/Runtime/arm64/GcProbe.asm | 10 +++- 6 files changed, 98 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index d0f8fe23831d78..9c144b61d2febe 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -540,7 +540,7 @@ // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. #define RBM_INIT_PINVOKE_FRAME_TRASH RBM_CALLEE_TRASH -#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_RAX | RBM_R10 | RBM_R11) +#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_INT_CALLEE_TRASH_INIT & ~RBM_ARG_REGS) #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH & ~(RBM_R10 | RBM_RCX)) #define RBM_VALIDATE_INDIRECT_CALL_TRASH_ALL (RBM_INT_CALLEE_TRASH_ALL & ~(RBM_R10 | RBM_RCX)) diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h index 3f26dfab09ee93..6719b524b69404 100644 --- a/src/coreclr/jit/targetarm64.h +++ b/src/coreclr/jit/targetarm64.h @@ -263,7 +263,7 @@ // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. #define RBM_INIT_PINVOKE_FRAME_TRASH RBM_CALLEE_TRASH -#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_R12 | RBM_R13 | RBM_R14 | RBM_R15) +#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_INT_CALLEE_TRASH & ~RBM_ARG_REGS) #define RBM_INTERFACELOOKUP_FOR_SLOT_RETURN RBM_R15 #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH & ~(RBM_R0 | RBM_R1 | RBM_R2 | RBM_R3 | RBM_R4 | RBM_R5 | RBM_R6 | RBM_R7 | RBM_R8 | RBM_R15)) #define REG_VALIDATE_INDIRECT_CALL_ADDR REG_R15 diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S index f7585fdd6f4ba9..de0d863c42fd14 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S @@ -41,12 +41,18 @@ push_register \trashReg // save m_RIP lea \trashReg, [rsp + 0] // trashReg == address of frame - // allocate space for xmm0, xmm1 - alloc_stack 0x20 + 0 - - // save xmm0 and xmm1 in case they are used as return values - movdqa [rsp + 0x10], xmm0 - movdqa [rsp + 0] , xmm1 + // allocate space for xmm0..xmm7 (FP argument registers) + alloc_stack 0x80 + 0 + + // save FP argument registers in case they contain live values at the hijack point + movdqa [rsp + 0x70], xmm0 + movdqa [rsp + 0x60], xmm1 + movdqa [rsp + 0x50], xmm2 + movdqa [rsp + 0x40], xmm3 + movdqa [rsp + 0x30], xmm4 + movdqa [rsp + 0x20], xmm5 + movdqa [rsp + 0x10], xmm6 + movdqa [rsp + 0x00], xmm7 // link the frame into the Thread mov [\threadReg + OFFSETOF__Thread__m_pDeferredTransitionFrame], \trashReg @@ -57,9 +63,15 @@ // registers and return value to their values from before the probe was called (while also updating any // object refs or byrefs). .macro POP_PROBE_FRAME - movdqa xmm1, [rsp + 0] - movdqa xmm0, [rsp + 0x10] - free_stack 0x20 + 8 // skip xmm0, xmm1 and discard RIP + movdqa xmm7, [rsp + 0x00] + movdqa xmm6, [rsp + 0x10] + movdqa xmm5, [rsp + 0x20] + movdqa xmm4, [rsp + 0x30] + movdqa xmm3, [rsp + 0x40] + movdqa xmm2, [rsp + 0x50] + movdqa xmm1, [rsp + 0x60] + movdqa xmm0, [rsp + 0x70] + free_stack 0x80 + 8 // skip xmm0..xmm7 and discard RIP pop_nonvol_reg rbp pop_register rax // discard Thread* pop_register rax // discard BITMASK @@ -103,14 +115,34 @@ push_register r8 push_register r9 - // align stack - alloc_stack 0x8 + // allocate space for xmm0..xmm7 + alignment (0x80 for xmm regs + 0x8 for 16-byte alignment) + alloc_stack 0x88 + + // save FP argument registers that would be clobbered by INLINE_GETTHREAD call + movdqa [rsp + 0x70], xmm0 + movdqa [rsp + 0x60], xmm1 + movdqa [rsp + 0x50], xmm2 + movdqa [rsp + 0x40], xmm3 + movdqa [rsp + 0x30], xmm4 + movdqa [rsp + 0x20], xmm5 + movdqa [rsp + 0x10], xmm6 + movdqa [rsp + 0x00], xmm7 // rax = GetThread(), makes nested calls INLINE_GETTHREAD mov r11, rax - free_stack 0x8 + // restore FP argument registers + movdqa xmm7, [rsp + 0x00] + movdqa xmm6, [rsp + 0x10] + movdqa xmm5, [rsp + 0x20] + movdqa xmm4, [rsp + 0x30] + movdqa xmm3, [rsp + 0x40] + movdqa xmm2, [rsp + 0x50] + movdqa xmm1, [rsp + 0x60] + movdqa xmm0, [rsp + 0x70] + + free_stack 0x88 pop_register r9 pop_register r8 diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm index e611bb8b42b5b1..ad4254edf586f3 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm +++ b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm @@ -41,11 +41,14 @@ PUSH_PROBE_FRAME macro threadReg, trashReg, BITMASK push_vol_reg trashReg ; save m_RIP lea trashReg, [rsp + 0] ; trashReg == address of frame - ;; allocate scratch space (20h home space + 10h for xmm0) - alloc_stack 20h + 10h + ;; allocate scratch space (20h home space + 40h for xmm0..xmm3) + alloc_stack 20h + 40h - ;; save xmm0 in case it's being used as a return value - movdqa [rsp + 20h], xmm0 + ;; save xmm argument registers in case they contain live values at the hijack point + movdqa [rsp + 20h + 00h], xmm0 + movdqa [rsp + 20h + 10h], xmm1 + movdqa [rsp + 20h + 20h], xmm2 + movdqa [rsp + 20h + 30h], xmm3 ;; link the frame into the Thread mov [threadReg + OFFSETOF__Thread__m_pDeferredTransitionFrame], trashReg @@ -57,8 +60,11 @@ endm ;; object refs or byrefs). ;; POP_PROBE_FRAME macro - movdqa xmm0, [rsp + 20h] - add rsp, 20h + 10h + 8 ; deallocate scratch space and discard saved m_RIP + movdqa xmm0, [rsp + 20h + 00h] + movdqa xmm1, [rsp + 20h + 10h] + movdqa xmm2, [rsp + 20h + 20h] + movdqa xmm3, [rsp + 20h + 30h] + add rsp, 20h + 40h + 8 ; deallocate scratch space and discard saved m_RIP pop rbp pop rax ; discard Thread* pop rax ; discard BITMASK diff --git a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S index d7e628bff70128..5e93141833bb06 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S @@ -4,12 +4,12 @@ #include #include "AsmOffsets.inc" -#define PROBE_FRAME_SIZE 0x100 // 4 * 8 for fixed part of PInvokeTransitionFrame (fp, lr, m_pThread, m_Flags) + +#define PROBE_FRAME_SIZE 0x140 // 4 * 8 for fixed part of PInvokeTransitionFrame (fp, lr, m_pThread, m_Flags) + // 10 * 8 for callee saved registers + // 1 * 8 for caller SP + // 8 * 8 for int return/argument regs (x0..x7) + // 1 * 8 for padding (16-byte alignment for q regs) + - // 4 * 16 for FP/HFA/HVA returns + // 8 * 16 for FP argument registers (q0..q7) // See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return/argument registers // and accepts the register bitmask @@ -44,9 +44,11 @@ stp x4, x5, [sp, #0x98] stp x6, x7, [sp, #0xA8] - // Save the FP/HFA/HVA return registers + // Save the FP argument registers stp q0, q1, [sp, #0xC0] stp q2, q3, [sp, #0xE0] + stp q4, q5, [sp, #0x100] + stp q6, q7, [sp, #0x120] // Perform the rest of the PInvokeTransitionFrame initialization. // str \threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] // Thread * (unused by stackwalker) @@ -74,9 +76,11 @@ ldp x4, x5, [sp, #0x98] ldp x6, x7, [sp, #0xA8] - // Restore the FP/HFA/HVA return registers + // Restore the FP argument registers ldp q0, q1, [sp, #0xC0] ldp q2, q3, [sp, #0xE0] + ldp q4, q5, [sp, #0x100] + ldp q6, q7, [sp, #0x120] // Restore callee saved registers EPILOG_RESTORE_REG_PAIR x19, x20, 0x20 @@ -104,23 +108,45 @@ // x9 <- GetThread() #ifdef FEATURE_EMULATED_TLS - // Save x0-x7 around RhpGetThread call (it trashes all scratch registers) - PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -0x50 + // Save x0-x7 and q0-q7 around RhpGetThread call (it trashes all scratch registers) + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -0xD0 stp x0, x1, [sp, #0x10] stp x2, x3, [sp, #0x20] stp x4, x5, [sp, #0x30] stp x6, x7, [sp, #0x40] + stp q0, q1, [sp, #0x50] + stp q2, q3, [sp, #0x70] + stp q4, q5, [sp, #0x90] + stp q6, q7, [sp, #0xB0] bl C_FUNC(RhpGetThread) mov x9, x0 + ldp q6, q7, [sp, #0xB0] + ldp q4, q5, [sp, #0x90] + ldp q2, q3, [sp, #0x70] + ldp q0, q1, [sp, #0x50] ldp x0, x1, [sp, #0x10] ldp x2, x3, [sp, #0x20] ldp x4, x5, [sp, #0x30] ldp x6, x7, [sp, #0x40] - EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 0x50 + EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 0xD0 #else + // INLINE_GETTHREAD makes blr calls that trash volatile FP registers. + // Save and restore q0-q7 around the call. + sub sp, sp, #0x80 + stp q0, q1, [sp, #0x00] + stp q2, q3, [sp, #0x20] + stp q4, q5, [sp, #0x40] + stp q6, q7, [sp, #0x60] + INLINE_GETTHREAD x9 + + ldp q6, q7, [sp, #0x60] + ldp q4, q5, [sp, #0x40] + ldp q2, q3, [sp, #0x20] + ldp q0, q1, [sp, #0x00] + add sp, sp, #0x80 #endif // diff --git a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm index 1ab7225dca9d60..bd44842a8a8be6 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm +++ b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm @@ -16,7 +16,7 @@ m_CallersSP field 8 ; SP at routine entry field 8 * 8 ; x0..x7 field 8 ; padding for 16-byte alignment of q regs - field 4 * 16; q0..q3 + field 8 * 16; q0..q7 PROBE_FRAME_SIZE field 0 ;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return/argument registers @@ -53,9 +53,11 @@ PROBE_FRAME_SIZE field 0 PROLOG_NOP stp x4, x5, [sp, #0x98] PROLOG_NOP stp x6, x7, [sp, #0xA8] - ;; Save the FP/HFA/HVA return registers + ;; Save the FP argument registers PROLOG_NOP stp q0, q1, [sp, #0xC0] PROLOG_NOP stp q2, q3, [sp, #0xE0] + PROLOG_NOP stp q4, q5, [sp, #0x100] + PROLOG_NOP stp q6, q7, [sp, #0x120] ;; Perform the rest of the PInvokeTransitionFrame initialization. ;; str $threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] ; Thread * (unused by stackwalker) @@ -85,9 +87,11 @@ PROBE_FRAME_SIZE field 0 PROLOG_NOP ldp x4, x5, [sp, #0x98] PROLOG_NOP ldp x6, x7, [sp, #0xA8] - ; Restore the FP/HFA/HVA return registers + ; Restore the FP argument registers EPILOG_NOP ldp q0, q1, [sp, #0xC0] EPILOG_NOP ldp q2, q3, [sp, #0xE0] + EPILOG_NOP ldp q4, q5, [sp, #0x100] + EPILOG_NOP ldp q6, q7, [sp, #0x120] ;; Restore callee saved registers EPILOG_RESTORE_REG_PAIR x19, x20, #0x20 From 246d39d5e0809622867e424309521f783dcbe697 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Apr 2026 15:18:15 +0000 Subject: [PATCH 09/15] Update FixupHijackedCallstack comments to include preserved FP registers Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/2bbc56c9-47e8-42f8-903e-51a79bbb96c6 Co-authored-by: jakobbotsch <7887810+jakobbotsch@users.noreply.github.com> --- src/coreclr/nativeaot/Runtime/amd64/GcProbe.S | 2 +- src/coreclr/nativeaot/Runtime/arm64/GcProbe.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S index de0d863c42fd14..06ba8656b44c6b 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S @@ -99,7 +99,7 @@ // // Register state on exit: // R11: thread pointer -// RAX, RCX, RDX, RSI, RDI, R8, R9 preserved, R10 trashed +// RAX, RCX, RDX, RSI, RDI, R8, R9, xmm0-xmm7 preserved, R10 trashed // .macro FixupHijackedCallstack // preserve volatile argument registers across INLINE_GETTHREAD diff --git a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S index 5e93141833bb06..c327e304b8e290 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S @@ -101,7 +101,7 @@ // // Register state on exit: // x9: thread pointer -// x0-x7 preserved, x10 trashed +// x0-x7, q0-q7 preserved, x10 trashed // .macro FixupHijackedCallstack From 10598c168cae1721217e58e0ba5faf4d887958a5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Apr 2026 15:43:55 +0000 Subject: [PATCH 10/15] Expand RBM_INTERFACELOOKUP_FOR_SLOT_TRASH to trash all non-argument registers including float Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/24a8bc8a-0f2f-49c9-b96a-268d92070834 Co-authored-by: jakobbotsch <7887810+jakobbotsch@users.noreply.github.com> --- src/coreclr/jit/targetamd64.h | 2 +- src/coreclr/jit/targetarm64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 9c144b61d2febe..4bdbd5d1cb02f3 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -540,7 +540,7 @@ // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. #define RBM_INIT_PINVOKE_FRAME_TRASH RBM_CALLEE_TRASH -#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_INT_CALLEE_TRASH_INIT & ~RBM_ARG_REGS) +#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_CALLEE_TRASH & ~(RBM_ARG_REGS | RBM_FLTARG_REGS)) #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH & ~(RBM_R10 | RBM_RCX)) #define RBM_VALIDATE_INDIRECT_CALL_TRASH_ALL (RBM_INT_CALLEE_TRASH_ALL & ~(RBM_R10 | RBM_RCX)) diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h index 6719b524b69404..f4edfbf7723b20 100644 --- a/src/coreclr/jit/targetarm64.h +++ b/src/coreclr/jit/targetarm64.h @@ -263,7 +263,7 @@ // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. #define RBM_INIT_PINVOKE_FRAME_TRASH RBM_CALLEE_TRASH -#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_INT_CALLEE_TRASH & ~RBM_ARG_REGS) +#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_CALLEE_TRASH & ~(RBM_ARG_REGS | RBM_FLTARG_REGS)) #define RBM_INTERFACELOOKUP_FOR_SLOT_RETURN RBM_R15 #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH & ~(RBM_R0 | RBM_R1 | RBM_R2 | RBM_R3 | RBM_R4 | RBM_R5 | RBM_R6 | RBM_R7 | RBM_R8 | RBM_R15)) #define REG_VALIDATE_INDIRECT_CALL_ADDR REG_R15 From cd43a53578537d451e34d60db25ca5f46a5e6173 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 00:37:13 +0000 Subject: [PATCH 11/15] Cherry-pick JIT PHI jump threading (PR #126812) with targeted fix for global PHI use bug (#126976) Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/1507ea1a-6b49-4f3d-977a-83f9c781da83 Co-authored-by: mangod9 <61718172+mangod9@users.noreply.github.com> --- src/coreclr/jit/compiler.h | 25 +- src/coreclr/jit/compmemkind.h | 1 + src/coreclr/jit/redundantbranchopts.cpp | 394 +++++++++++++++++- .../JIT/opt/RedundantBranch/JumpThreadPhi.cs | 31 ++ .../opt/RedundantBranch/JumpThreadPhi.csproj | 8 + 5 files changed, 431 insertions(+), 28 deletions(-) create mode 100644 src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.cs create mode 100644 src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.csproj diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index e0e751ee0ca24c..10e36182e61692 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7857,14 +7857,23 @@ class Compiler // Redundant branch opts // - PhaseStatus optRedundantBranches(); - bool optRedundantRelop(BasicBlock* const block); - bool optRedundantDominatingBranch(BasicBlock* const block); - bool optRedundantBranch(BasicBlock* const block); - bool optJumpThreadDom(BasicBlock* const block, BasicBlock* const domBlock, bool domIsSameRelop); - bool optJumpThreadPhi(BasicBlock* const block, GenTree* tree, ValueNum treeNormVN); - bool optJumpThreadCheck(BasicBlock* const block, BasicBlock* const domBlock); - bool optJumpThreadCore(JumpThreadInfo& jti); + enum class JumpThreadCheckResult + { + CannotThread, + CanThread, + NeedsPhiUseResolution, + }; + + PhaseStatus optRedundantBranches(); + bool optRedundantRelop(BasicBlock* const block); + bool optRedundantDominatingBranch(BasicBlock* const block); + bool optRedundantBranch(BasicBlock* const block); + bool optJumpThreadDom(BasicBlock* const block, BasicBlock* const domBlock, bool domIsSameRelop); + bool optJumpThreadPhi(BasicBlock* const block, GenTree* tree, ValueNum treeNormVN); + JumpThreadCheckResult optJumpThreadCheck(BasicBlock* const block, BasicBlock* const domBlock); + bool optFindPhiUsesInBlockAndSuccessors(BasicBlock* block, GenTreeLclVar* phiDef, JumpThreadInfo& jti); + bool optCanRewritePhiUses(JumpThreadInfo& jti); + bool optJumpThreadCore(JumpThreadInfo& jti); enum class ReachabilityResult { diff --git a/src/coreclr/jit/compmemkind.h b/src/coreclr/jit/compmemkind.h index 80da6edf76de91..f4583eb55b5570 100644 --- a/src/coreclr/jit/compmemkind.h +++ b/src/coreclr/jit/compmemkind.h @@ -28,6 +28,7 @@ CompMemKindMacro(LSRA) CompMemKindMacro(LSRA_Interval) CompMemKindMacro(LSRA_RefPosition) CompMemKindMacro(Reachability) +CompMemKindMacro(RedundantBranch) CompMemKindMacro(SSA) CompMemKindMacro(ValueNumber) CompMemKindMacro(LvaTable) diff --git a/src/coreclr/jit/redundantbranchopts.cpp b/src/coreclr/jit/redundantbranchopts.cpp index a9f305bfcaa490..e0ba665213ddba 100644 --- a/src/coreclr/jit/redundantbranchopts.cpp +++ b/src/coreclr/jit/redundantbranchopts.cpp @@ -1335,6 +1335,7 @@ struct JumpThreadInfo , traits(comp->m_dfsTree->PostOrderTraits()) , m_truePreds(BitVecOps::MakeEmpty(&traits)) , m_ambiguousPreds(BitVecOps::MakeEmpty(&traits)) + , m_phiUses(comp->getAllocator(CMK_RedundantBranch)) , m_numPreds(0) , m_numAmbiguousPreds(0) , m_numTruePreds(0) @@ -1359,6 +1360,20 @@ struct JumpThreadInfo // Pred blocks that can't be threaded or for which the predicate // value can't be determined BitVec m_ambiguousPreds; + // Uses of block-local phi defs that must be updated if threading bypasses the block. + struct PhiUse + { + PhiUse(BasicBlock* block, GenTreeLclVarCommon* use) + : m_block(block) + , m_use(use) + { + } + + BasicBlock* m_block; + GenTreeLclVarCommon* m_use; + unsigned m_replacementSsaNum = SsaConfig::RESERVED_SSA_NUM; + }; + ArrayStack m_phiUses; // Total number of predecessors int m_numPreds; // Number of predecessors that can't be threaded or for which the predicate @@ -1374,6 +1389,288 @@ struct JumpThreadInfo bool m_isPhiBased; }; +//------------------------------------------------------------------------ +// JumpThreadPhiUseVisitor: tree visitor that records uses of a specific SSA +// def in a block while preserving the containing block for each use. +// +class JumpThreadPhiUseVisitor final : public GenTreeVisitor +{ +public: + enum + { + DoPreOrder = true, + DoLclVarsOnly = true + }; + + JumpThreadPhiUseVisitor(Compiler* compiler, + unsigned lclNum, + unsigned ssaNum, + BasicBlock* block, + ArrayStack* uses) + : GenTreeVisitor(compiler) + , m_lclNum(lclNum) + , m_ssaNum(ssaNum) + , m_block(block) + , m_uses(uses) + { + } + + Compiler::fgWalkResult PreOrderVisit(GenTree** use, GenTree* user) + { + GenTree* const node = *use; + + if (node->OperIsLocalRead()) + { + GenTreeLclVarCommon* const lclUse = node->AsLclVarCommon(); + if ((lclUse->GetLclNum() == m_lclNum) && (lclUse->GetSsaNum() == m_ssaNum)) + { + m_uses->Emplace(m_block, lclUse); + } + } + + return Compiler::WALK_CONTINUE; + } + +private: + unsigned m_lclNum; + unsigned m_ssaNum; + BasicBlock* const m_block; + ArrayStack* m_uses; +}; + +//------------------------------------------------------------------------ +// optGetThreadedSsaNumForSuccessor: determine the replacement SSA number +// for uses in a successor once all non-ambiguous preds are threaded. +// +// Arguments: +// jti - threading classification for the block +// phiDef - phi definition in the threaded block +// successor - successor being considered for rewriting +// hasThreadedPreds - [OUT] true if one or more preds will be redirected here +// replacementSsaNum - [OUT] the common reaching SSA number for those preds +// +// Returns: +// True if all redirected preds reaching successor agree on one SSA number. +// +static bool optGetThreadedSsaNumForSuccessor(JumpThreadInfo& jti, + GenTreeLclVar* phiDef, + BasicBlock* successor, + bool* hasThreadedPreds, + unsigned* replacementSsaNum) +{ + assert(successor != nullptr); + + *hasThreadedPreds = false; + *replacementSsaNum = SsaConfig::RESERVED_SSA_NUM; + + bool foundReplacement = false; + unsigned replacementSsa = SsaConfig::RESERVED_SSA_NUM; + GenTreePhi* const phi = phiDef->Data()->AsPhi(); + + for (GenTreePhi::Use& use : phi->Uses()) + { + GenTreePhiArg* const phiArgNode = use.GetNode()->AsPhiArg(); + BasicBlock* const predBlock = phiArgNode->gtPredBB; + bool const isTruePred = BitVecOps::IsMember(&jti.traits, jti.m_truePreds, predBlock->bbPostorderNum); + bool const isAmbiguousPred = BitVecOps::IsMember(&jti.traits, jti.m_ambiguousPreds, predBlock->bbPostorderNum); + + if (isAmbiguousPred) + { + continue; + } + + BasicBlock* const predTarget = isTruePred ? jti.m_trueTarget : jti.m_falseTarget; + if (predTarget != successor) + { + continue; + } + + *hasThreadedPreds = true; + + if (!foundReplacement) + { + replacementSsa = phiArgNode->GetSsaNum(); + foundReplacement = true; + } + else if (replacementSsa != phiArgNode->GetSsaNum()) + { + return false; + } + } + + *replacementSsaNum = replacementSsa; + return foundReplacement; +} + +//------------------------------------------------------------------------ +// optFindPhiUsesInBlockAndSuccessors: look for all uses of a PHI SSA def in +// the candidate block and its immediate successors. +// +// Arguments: +// block - jump-threading block that contains the PHI +// phiDef - PHI definition to inspect +// jti - threading information for the block +// +// Returns: +// True if every recorded SSA use for the PHI def is found. +// +bool Compiler::optFindPhiUsesInBlockAndSuccessors(BasicBlock* block, GenTreeLclVar* phiDef, JumpThreadInfo& jti) +{ + unsigned const lclNum = phiDef->GetLclNum(); + unsigned const ssaNum = phiDef->GetSsaNum(); + + LclSsaVarDsc* const ssaVarDsc = lvaGetDesc(lclNum)->GetPerSsaData(ssaNum); + int const expectedUses = ssaVarDsc->GetNumUses(); + if (expectedUses == USHRT_MAX) + { + return false; + } + + int const initialUseCount = jti.m_phiUses.Height(); + + JumpThreadPhiUseVisitor blockVisitor(this, lclNum, ssaNum, block, &jti.m_phiUses); + for (Statement* const blockStmt : block->Statements()) + { + blockVisitor.WalkTree(blockStmt->GetRootNodePointer(), nullptr); + } + + for (BasicBlock* const successor : block->Succs()) + { + if (successor == block) + { + continue; + } + + JumpThreadPhiUseVisitor succVisitor(this, lclNum, ssaNum, successor, &jti.m_phiUses); + for (Statement* const succStmt : successor->Statements()) + { + succVisitor.WalkTree(succStmt->GetRootNodePointer(), nullptr); + } + } + + int const foundUses = jti.m_phiUses.Height() - initialUseCount; + + return foundUses == expectedUses; +} + +//------------------------------------------------------------------------ +// optCanRewritePhiUses: determine if PHIs with global uses can be handled +// safely for jump threading by rewriting successor uses. +// +// Arguments: +// jti - threading information for the block +// +// Returns: +// True if all PHI uses are accounted for and any successor uses can be +// rewritten to a unique reaching SSA def. +// +// Notes: +// We bail out in cases where a successor has a PHI or would need to have +// a PHI for any of the locals that have PHIs in block. +// +bool Compiler::optCanRewritePhiUses(JumpThreadInfo& jti) +{ + BasicBlock* const block = jti.m_block; + + // If any pred remains ambiguous, the block can still reach its successors + // after threading and we would need successor phis instead of simple rewrites. + // + if (jti.m_numAmbiguousPreds != 0) + { + return false; + } + + // Likewise if any successor is already a join, it may have or may need to + // have a PHI. + // + for (BasicBlock* const successor : block->Succs()) + { + if (successor->GetUniquePred(this) != block) + { + return false; + } + } + + // Now check if we can find all of the uses of the PHIs in block, + // either in block or in its successors. + // + for (Statement* const stmt : block->Statements()) + { + if (!stmt->IsPhiDefnStmt()) + { + break; + } + + GenTreeLclVar* const phiDef = stmt->GetRootNode()->AsLclVar(); + unsigned const lclNum = phiDef->GetLclNum(); + unsigned const ssaNum = phiDef->GetSsaNum(); + + LclSsaVarDsc* const ssaVarDsc = lvaGetDesc(lclNum)->GetPerSsaData(ssaNum); + if (!ssaVarDsc->HasGlobalUse()) + { + continue; + } + + if (!optFindPhiUsesInBlockAndSuccessors(block, phiDef, jti)) + { + return false; + } + + for (BasicBlock* const successor : block->Succs()) + { + bool hasSuccUse = false; + for (JumpThreadInfo::PhiUse& phiUse : jti.m_phiUses.BottomUpOrder()) + { + if ((phiUse.m_use->GetLclNum() != lclNum) || (phiUse.m_use->GetSsaNum() != ssaNum)) + { + continue; + } + + if (phiUse.m_block == successor) + { + hasSuccUse = true; + break; + } + } + + if (!hasSuccUse) + { + continue; + } + + // Verify that all preds that will be redirected to successor agree on the same SSA number + // so no phi is needed in successor, just a rewrite of the uses to the common SSA number. + // + bool hasThreadedPreds = false; + unsigned replacementSsa = SsaConfig::RESERVED_SSA_NUM; + if (!optGetThreadedSsaNumForSuccessor(jti, phiDef, successor, &hasThreadedPreds, &replacementSsa)) + { + return false; + } + + if (!hasThreadedPreds) + { + continue; + } + + for (JumpThreadInfo::PhiUse& phiUse : jti.m_phiUses.BottomUpOrder()) + { + if ((phiUse.m_use->GetLclNum() != lclNum) || (phiUse.m_use->GetSsaNum() != ssaNum)) + { + continue; + } + + if (phiUse.m_block == successor) + { + phiUse.m_replacementSsaNum = replacementSsa; + } + } + } + } + + return true; +} + //------------------------------------------------------------------------ // optJumpThreadCheck: see if block is suitable for jump threading. // @@ -1382,15 +1679,15 @@ struct JumpThreadInfo // domBlock - dom block used in inferencing (if any) // // Returns: -// True if the block is suitable for jump threading. +// Viability of jump threading: either CannotThread, CanThread, or NeedsPhiUseResolution. // -bool Compiler::optJumpThreadCheck(BasicBlock* const block, BasicBlock* const domBlock) +Compiler::JumpThreadCheckResult Compiler::optJumpThreadCheck(BasicBlock* const block, BasicBlock* const domBlock) { // If the block is the first block of try-region, then skip jump threading if (bbIsTryBeg(block)) { JITDUMP(FMT_BB " is first block of try-region; no threading\n", block->bbNum); - return false; + return JumpThreadCheckResult::CannotThread; } // Verify that dom block dominates all of block's predecessors. @@ -1406,7 +1703,7 @@ bool Compiler::optJumpThreadCheck(BasicBlock* const block, BasicBlock* const dom { JITDUMP("Dom " FMT_BB " is stale (does not dominate pred " FMT_BB "); no threading\n", domBlock->bbNum, predBlock->bbNum); - return false; + return JumpThreadCheckResult::CannotThread; } } } @@ -1417,23 +1714,26 @@ bool Compiler::optJumpThreadCheck(BasicBlock* const block, BasicBlock* const dom // For non-PHI RBO, we neglect PHI stores. This can leave SSA in // an incorrect state but so far it has not yet caused problems. // - // For PHI-based RBO we need to be more cautious and insist that - // any PHI is locally consumed, so that if we bypass the block we - // don't need to make SSA updates. + // For PHI-based RBO we need to be more cautious. We can now tolerate + // some non-local PHI uses, but only when all uses are found in the + // block or its immediate successors so the needed SSA/VN updates can be + // made. // // TODO: handle blocks with side effects. For those predecessors that are // favorable (ones that don't reach block via a critical edge), consider // duplicating block's IR into the predecessor. This is the jump threading // analog of the optimization we encourage via fgOptimizeUncondBranchToSimpleCond. // - Statement* const lastStmt = block->lastStmt(); - bool const isPhiRBO = (domBlock == nullptr); + Statement* const lastStmt = block->lastStmt(); + bool const isPhiRBO = (domBlock == nullptr); + bool hasGlobalPhiUses = false; for (Statement* const stmt : block->Statements()) { GenTree* const tree = stmt->GetRootNode(); - // If we are doing PHI-based RBO then all local PHIs must be locally consumed. + // If we are doing PHI-based RBO then each local PHI must either be + // locally consumed or have only tracked uses that we can safely rewrite. // if (stmt->IsPhiDefnStmt()) { @@ -1452,7 +1752,7 @@ bool Compiler::optJumpThreadCheck(BasicBlock* const block, BasicBlock* const dom { JITDUMP(FMT_BB " has phi for promoted field V%02u.%u; no phi-based threading\n", block->bbNum, lclNum, ssaNum); - return false; + return JumpThreadCheckResult::CannotThread; } LclSsaVarDsc* const ssaVarDsc = varDsc->GetPerSsaData(ssaNum); @@ -1462,9 +1762,9 @@ bool Compiler::optJumpThreadCheck(BasicBlock* const block, BasicBlock* const dom // if (ssaVarDsc->HasGlobalUse()) { - JITDUMP(FMT_BB " has global phi for V%02u.%u; no phi-based threading\n", block->bbNum, lclNum, - ssaNum); - return false; + JITDUMP(FMT_BB " has global phi for V%02u.%u; no phi-based threading\n", + block->bbNum, lclNum, ssaNum); + return JumpThreadCheckResult::CannotThread; } } @@ -1502,11 +1802,11 @@ bool Compiler::optJumpThreadCheck(BasicBlock* const block, BasicBlock* const dom } JITDUMP(FMT_BB " has side effects; no threading\n", block->bbNum); - return false; + return JumpThreadCheckResult::CannotThread; } } - return true; + return hasGlobalPhiUses ? JumpThreadCheckResult::NeedsPhiUseResolution : JumpThreadCheckResult::CanThread; } //------------------------------------------------------------------------ @@ -1591,9 +1891,15 @@ bool Compiler::optJumpThreadDom(BasicBlock* const block, BasicBlock* const domBl JITDUMP("Both successors of %sdom " FMT_BB " reach " FMT_BB " -- attempting jump threading\n", isIDom ? "i" : "", domBlock->bbNum, block->bbNum); - const bool check = optJumpThreadCheck(block, domBlock); - if (!check) + const JumpThreadCheckResult check = optJumpThreadCheck(block, domBlock); + if (check == JumpThreadCheckResult::CannotThread) + { + return false; + } + + if (check == JumpThreadCheckResult::NeedsPhiUseResolution) { + JITDUMP(FMT_BB " has global phi uses; no jump threading\n", block->bbNum); return false; } @@ -1719,8 +2025,8 @@ bool Compiler::optJumpThreadPhi(BasicBlock* block, GenTree* tree, ValueNum treeN { // First see if block is eligible for threading. // - const bool check = optJumpThreadCheck(block, /* domBlock*/ nullptr); - if (!check) + const JumpThreadCheckResult check = optJumpThreadCheck(block, /* domBlock*/ nullptr); + if (check == JumpThreadCheckResult::CannotThread) { return false; } @@ -1948,6 +2254,12 @@ bool Compiler::optJumpThreadPhi(BasicBlock* block, GenTree* tree, ValueNum treeN } } + if ((check == JumpThreadCheckResult::NeedsPhiUseResolution) && !optCanRewritePhiUses(jti)) + { + JITDUMP(FMT_BB " has global phi uses we cannot safely account for; no phi-based threading\n", block->bbNum); + return false; + } + // Do the optimization. // return optJumpThreadCore(jti); @@ -1982,6 +2294,48 @@ bool Compiler::optJumpThreadCore(JumpThreadInfo& jti) // JITDUMP("Optimizing via jump threading\n"); + for (JumpThreadInfo::PhiUse& phiUse : jti.m_phiUses.BottomUpOrder()) + { + if (phiUse.m_replacementSsaNum == SsaConfig::RESERVED_SSA_NUM) + { + continue; + } + + GenTreeLclVarCommon* const use = phiUse.m_use; + unsigned const oldSsaNum = use->GetSsaNum(); + + if (oldSsaNum == phiUse.m_replacementSsaNum) + { + continue; + } + + JITDUMP("Updating [%06u] in " FMT_BB " from u:%u to u:%u\n", dspTreeID(use), phiUse.m_block->bbNum, oldSsaNum, + phiUse.m_replacementSsaNum); + + LclSsaVarDsc* const replacementSsaDef = lvaGetDesc(use->GetLclNum())->GetPerSsaData(phiUse.m_replacementSsaNum); + + use->SetSsaNum(phiUse.m_replacementSsaNum); + + // Keep the use's value number in sync with the rewritten SSA def. + if (use->gtVNPair != replacementSsaDef->m_vnPair) + { + use->SetVNs(replacementSsaDef->m_vnPair); + GenTree* node = use; + GenTree* parent = node->gtGetParent(nullptr); + + while ((parent != nullptr) && parent->OperIs(GT_COMMA) && (parent->AsOp()->gtOp2 == node)) + { + JITDUMP(" Updating COMMA parent VN [%06u]\n", dspTreeID(parent)); + ValueNumPair op1Xvnp = vnStore->VNPExceptionSet(parent->AsOp()->gtOp1->gtVNPair); + parent->SetVNs(vnStore->VNPWithExc(parent->AsOp()->gtOp2->gtVNPair, op1Xvnp)); + node = parent; + parent = node->gtGetParent(nullptr); + } + } + + replacementSsaDef->AddUse(phiUse.m_block); + } + bool setNoCseIn = false; // If this is a phi-based threading, and the block we're bypassing has diff --git a/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.cs b/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.cs new file mode 100644 index 00000000000000..64fa4824bd23f9 --- /dev/null +++ b/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.cs @@ -0,0 +1,31 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Numerics; +using System.Runtime.CompilerServices; +using Xunit; + +public class JumpThreadPhi +{ + [MethodImpl(MethodImplOptions.NoInlining)] + private static int Phi_00(int value) + { + int idx = BitOperations.TrailingZeroCount(value); + idx = (idx != 32) ? idx : -1; + + if (idx != -1) + { + return idx + 100; + } + + return -1; + } + + [Fact] + public static void TestPhi00() + { + Assert.Equal(-1, Phi_00(0)); + Assert.Equal(100, Phi_00(1)); + Assert.Equal(103, Phi_00(8)); + } +} diff --git a/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.csproj b/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.csproj new file mode 100644 index 00000000000000..de6d5e08882e86 --- /dev/null +++ b/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.csproj @@ -0,0 +1,8 @@ + + + True + + + + + From 5a82a7de3e15bd6459ecfa91ce02d8358b813e1c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 00:54:04 +0000 Subject: [PATCH 12/15] Revert "Cherry-pick JIT PHI jump threading (PR #126812) with targeted fix for global PHI use bug (#126976)" This reverts commit cd43a53578537d451e34d60db25ca5f46a5e6173. Co-authored-by: mangod9 <61718172+mangod9@users.noreply.github.com> --- src/coreclr/jit/compiler.h | 25 +- src/coreclr/jit/compmemkind.h | 1 - src/coreclr/jit/redundantbranchopts.cpp | 394 +----------------- .../JIT/opt/RedundantBranch/JumpThreadPhi.cs | 31 -- .../opt/RedundantBranch/JumpThreadPhi.csproj | 8 - 5 files changed, 28 insertions(+), 431 deletions(-) delete mode 100644 src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.cs delete mode 100644 src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.csproj diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 10e36182e61692..e0e751ee0ca24c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7857,23 +7857,14 @@ class Compiler // Redundant branch opts // - enum class JumpThreadCheckResult - { - CannotThread, - CanThread, - NeedsPhiUseResolution, - }; - - PhaseStatus optRedundantBranches(); - bool optRedundantRelop(BasicBlock* const block); - bool optRedundantDominatingBranch(BasicBlock* const block); - bool optRedundantBranch(BasicBlock* const block); - bool optJumpThreadDom(BasicBlock* const block, BasicBlock* const domBlock, bool domIsSameRelop); - bool optJumpThreadPhi(BasicBlock* const block, GenTree* tree, ValueNum treeNormVN); - JumpThreadCheckResult optJumpThreadCheck(BasicBlock* const block, BasicBlock* const domBlock); - bool optFindPhiUsesInBlockAndSuccessors(BasicBlock* block, GenTreeLclVar* phiDef, JumpThreadInfo& jti); - bool optCanRewritePhiUses(JumpThreadInfo& jti); - bool optJumpThreadCore(JumpThreadInfo& jti); + PhaseStatus optRedundantBranches(); + bool optRedundantRelop(BasicBlock* const block); + bool optRedundantDominatingBranch(BasicBlock* const block); + bool optRedundantBranch(BasicBlock* const block); + bool optJumpThreadDom(BasicBlock* const block, BasicBlock* const domBlock, bool domIsSameRelop); + bool optJumpThreadPhi(BasicBlock* const block, GenTree* tree, ValueNum treeNormVN); + bool optJumpThreadCheck(BasicBlock* const block, BasicBlock* const domBlock); + bool optJumpThreadCore(JumpThreadInfo& jti); enum class ReachabilityResult { diff --git a/src/coreclr/jit/compmemkind.h b/src/coreclr/jit/compmemkind.h index f4583eb55b5570..80da6edf76de91 100644 --- a/src/coreclr/jit/compmemkind.h +++ b/src/coreclr/jit/compmemkind.h @@ -28,7 +28,6 @@ CompMemKindMacro(LSRA) CompMemKindMacro(LSRA_Interval) CompMemKindMacro(LSRA_RefPosition) CompMemKindMacro(Reachability) -CompMemKindMacro(RedundantBranch) CompMemKindMacro(SSA) CompMemKindMacro(ValueNumber) CompMemKindMacro(LvaTable) diff --git a/src/coreclr/jit/redundantbranchopts.cpp b/src/coreclr/jit/redundantbranchopts.cpp index e0ba665213ddba..a9f305bfcaa490 100644 --- a/src/coreclr/jit/redundantbranchopts.cpp +++ b/src/coreclr/jit/redundantbranchopts.cpp @@ -1335,7 +1335,6 @@ struct JumpThreadInfo , traits(comp->m_dfsTree->PostOrderTraits()) , m_truePreds(BitVecOps::MakeEmpty(&traits)) , m_ambiguousPreds(BitVecOps::MakeEmpty(&traits)) - , m_phiUses(comp->getAllocator(CMK_RedundantBranch)) , m_numPreds(0) , m_numAmbiguousPreds(0) , m_numTruePreds(0) @@ -1360,20 +1359,6 @@ struct JumpThreadInfo // Pred blocks that can't be threaded or for which the predicate // value can't be determined BitVec m_ambiguousPreds; - // Uses of block-local phi defs that must be updated if threading bypasses the block. - struct PhiUse - { - PhiUse(BasicBlock* block, GenTreeLclVarCommon* use) - : m_block(block) - , m_use(use) - { - } - - BasicBlock* m_block; - GenTreeLclVarCommon* m_use; - unsigned m_replacementSsaNum = SsaConfig::RESERVED_SSA_NUM; - }; - ArrayStack m_phiUses; // Total number of predecessors int m_numPreds; // Number of predecessors that can't be threaded or for which the predicate @@ -1389,288 +1374,6 @@ struct JumpThreadInfo bool m_isPhiBased; }; -//------------------------------------------------------------------------ -// JumpThreadPhiUseVisitor: tree visitor that records uses of a specific SSA -// def in a block while preserving the containing block for each use. -// -class JumpThreadPhiUseVisitor final : public GenTreeVisitor -{ -public: - enum - { - DoPreOrder = true, - DoLclVarsOnly = true - }; - - JumpThreadPhiUseVisitor(Compiler* compiler, - unsigned lclNum, - unsigned ssaNum, - BasicBlock* block, - ArrayStack* uses) - : GenTreeVisitor(compiler) - , m_lclNum(lclNum) - , m_ssaNum(ssaNum) - , m_block(block) - , m_uses(uses) - { - } - - Compiler::fgWalkResult PreOrderVisit(GenTree** use, GenTree* user) - { - GenTree* const node = *use; - - if (node->OperIsLocalRead()) - { - GenTreeLclVarCommon* const lclUse = node->AsLclVarCommon(); - if ((lclUse->GetLclNum() == m_lclNum) && (lclUse->GetSsaNum() == m_ssaNum)) - { - m_uses->Emplace(m_block, lclUse); - } - } - - return Compiler::WALK_CONTINUE; - } - -private: - unsigned m_lclNum; - unsigned m_ssaNum; - BasicBlock* const m_block; - ArrayStack* m_uses; -}; - -//------------------------------------------------------------------------ -// optGetThreadedSsaNumForSuccessor: determine the replacement SSA number -// for uses in a successor once all non-ambiguous preds are threaded. -// -// Arguments: -// jti - threading classification for the block -// phiDef - phi definition in the threaded block -// successor - successor being considered for rewriting -// hasThreadedPreds - [OUT] true if one or more preds will be redirected here -// replacementSsaNum - [OUT] the common reaching SSA number for those preds -// -// Returns: -// True if all redirected preds reaching successor agree on one SSA number. -// -static bool optGetThreadedSsaNumForSuccessor(JumpThreadInfo& jti, - GenTreeLclVar* phiDef, - BasicBlock* successor, - bool* hasThreadedPreds, - unsigned* replacementSsaNum) -{ - assert(successor != nullptr); - - *hasThreadedPreds = false; - *replacementSsaNum = SsaConfig::RESERVED_SSA_NUM; - - bool foundReplacement = false; - unsigned replacementSsa = SsaConfig::RESERVED_SSA_NUM; - GenTreePhi* const phi = phiDef->Data()->AsPhi(); - - for (GenTreePhi::Use& use : phi->Uses()) - { - GenTreePhiArg* const phiArgNode = use.GetNode()->AsPhiArg(); - BasicBlock* const predBlock = phiArgNode->gtPredBB; - bool const isTruePred = BitVecOps::IsMember(&jti.traits, jti.m_truePreds, predBlock->bbPostorderNum); - bool const isAmbiguousPred = BitVecOps::IsMember(&jti.traits, jti.m_ambiguousPreds, predBlock->bbPostorderNum); - - if (isAmbiguousPred) - { - continue; - } - - BasicBlock* const predTarget = isTruePred ? jti.m_trueTarget : jti.m_falseTarget; - if (predTarget != successor) - { - continue; - } - - *hasThreadedPreds = true; - - if (!foundReplacement) - { - replacementSsa = phiArgNode->GetSsaNum(); - foundReplacement = true; - } - else if (replacementSsa != phiArgNode->GetSsaNum()) - { - return false; - } - } - - *replacementSsaNum = replacementSsa; - return foundReplacement; -} - -//------------------------------------------------------------------------ -// optFindPhiUsesInBlockAndSuccessors: look for all uses of a PHI SSA def in -// the candidate block and its immediate successors. -// -// Arguments: -// block - jump-threading block that contains the PHI -// phiDef - PHI definition to inspect -// jti - threading information for the block -// -// Returns: -// True if every recorded SSA use for the PHI def is found. -// -bool Compiler::optFindPhiUsesInBlockAndSuccessors(BasicBlock* block, GenTreeLclVar* phiDef, JumpThreadInfo& jti) -{ - unsigned const lclNum = phiDef->GetLclNum(); - unsigned const ssaNum = phiDef->GetSsaNum(); - - LclSsaVarDsc* const ssaVarDsc = lvaGetDesc(lclNum)->GetPerSsaData(ssaNum); - int const expectedUses = ssaVarDsc->GetNumUses(); - if (expectedUses == USHRT_MAX) - { - return false; - } - - int const initialUseCount = jti.m_phiUses.Height(); - - JumpThreadPhiUseVisitor blockVisitor(this, lclNum, ssaNum, block, &jti.m_phiUses); - for (Statement* const blockStmt : block->Statements()) - { - blockVisitor.WalkTree(blockStmt->GetRootNodePointer(), nullptr); - } - - for (BasicBlock* const successor : block->Succs()) - { - if (successor == block) - { - continue; - } - - JumpThreadPhiUseVisitor succVisitor(this, lclNum, ssaNum, successor, &jti.m_phiUses); - for (Statement* const succStmt : successor->Statements()) - { - succVisitor.WalkTree(succStmt->GetRootNodePointer(), nullptr); - } - } - - int const foundUses = jti.m_phiUses.Height() - initialUseCount; - - return foundUses == expectedUses; -} - -//------------------------------------------------------------------------ -// optCanRewritePhiUses: determine if PHIs with global uses can be handled -// safely for jump threading by rewriting successor uses. -// -// Arguments: -// jti - threading information for the block -// -// Returns: -// True if all PHI uses are accounted for and any successor uses can be -// rewritten to a unique reaching SSA def. -// -// Notes: -// We bail out in cases where a successor has a PHI or would need to have -// a PHI for any of the locals that have PHIs in block. -// -bool Compiler::optCanRewritePhiUses(JumpThreadInfo& jti) -{ - BasicBlock* const block = jti.m_block; - - // If any pred remains ambiguous, the block can still reach its successors - // after threading and we would need successor phis instead of simple rewrites. - // - if (jti.m_numAmbiguousPreds != 0) - { - return false; - } - - // Likewise if any successor is already a join, it may have or may need to - // have a PHI. - // - for (BasicBlock* const successor : block->Succs()) - { - if (successor->GetUniquePred(this) != block) - { - return false; - } - } - - // Now check if we can find all of the uses of the PHIs in block, - // either in block or in its successors. - // - for (Statement* const stmt : block->Statements()) - { - if (!stmt->IsPhiDefnStmt()) - { - break; - } - - GenTreeLclVar* const phiDef = stmt->GetRootNode()->AsLclVar(); - unsigned const lclNum = phiDef->GetLclNum(); - unsigned const ssaNum = phiDef->GetSsaNum(); - - LclSsaVarDsc* const ssaVarDsc = lvaGetDesc(lclNum)->GetPerSsaData(ssaNum); - if (!ssaVarDsc->HasGlobalUse()) - { - continue; - } - - if (!optFindPhiUsesInBlockAndSuccessors(block, phiDef, jti)) - { - return false; - } - - for (BasicBlock* const successor : block->Succs()) - { - bool hasSuccUse = false; - for (JumpThreadInfo::PhiUse& phiUse : jti.m_phiUses.BottomUpOrder()) - { - if ((phiUse.m_use->GetLclNum() != lclNum) || (phiUse.m_use->GetSsaNum() != ssaNum)) - { - continue; - } - - if (phiUse.m_block == successor) - { - hasSuccUse = true; - break; - } - } - - if (!hasSuccUse) - { - continue; - } - - // Verify that all preds that will be redirected to successor agree on the same SSA number - // so no phi is needed in successor, just a rewrite of the uses to the common SSA number. - // - bool hasThreadedPreds = false; - unsigned replacementSsa = SsaConfig::RESERVED_SSA_NUM; - if (!optGetThreadedSsaNumForSuccessor(jti, phiDef, successor, &hasThreadedPreds, &replacementSsa)) - { - return false; - } - - if (!hasThreadedPreds) - { - continue; - } - - for (JumpThreadInfo::PhiUse& phiUse : jti.m_phiUses.BottomUpOrder()) - { - if ((phiUse.m_use->GetLclNum() != lclNum) || (phiUse.m_use->GetSsaNum() != ssaNum)) - { - continue; - } - - if (phiUse.m_block == successor) - { - phiUse.m_replacementSsaNum = replacementSsa; - } - } - } - } - - return true; -} - //------------------------------------------------------------------------ // optJumpThreadCheck: see if block is suitable for jump threading. // @@ -1679,15 +1382,15 @@ bool Compiler::optCanRewritePhiUses(JumpThreadInfo& jti) // domBlock - dom block used in inferencing (if any) // // Returns: -// Viability of jump threading: either CannotThread, CanThread, or NeedsPhiUseResolution. +// True if the block is suitable for jump threading. // -Compiler::JumpThreadCheckResult Compiler::optJumpThreadCheck(BasicBlock* const block, BasicBlock* const domBlock) +bool Compiler::optJumpThreadCheck(BasicBlock* const block, BasicBlock* const domBlock) { // If the block is the first block of try-region, then skip jump threading if (bbIsTryBeg(block)) { JITDUMP(FMT_BB " is first block of try-region; no threading\n", block->bbNum); - return JumpThreadCheckResult::CannotThread; + return false; } // Verify that dom block dominates all of block's predecessors. @@ -1703,7 +1406,7 @@ Compiler::JumpThreadCheckResult Compiler::optJumpThreadCheck(BasicBlock* const b { JITDUMP("Dom " FMT_BB " is stale (does not dominate pred " FMT_BB "); no threading\n", domBlock->bbNum, predBlock->bbNum); - return JumpThreadCheckResult::CannotThread; + return false; } } } @@ -1714,26 +1417,23 @@ Compiler::JumpThreadCheckResult Compiler::optJumpThreadCheck(BasicBlock* const b // For non-PHI RBO, we neglect PHI stores. This can leave SSA in // an incorrect state but so far it has not yet caused problems. // - // For PHI-based RBO we need to be more cautious. We can now tolerate - // some non-local PHI uses, but only when all uses are found in the - // block or its immediate successors so the needed SSA/VN updates can be - // made. + // For PHI-based RBO we need to be more cautious and insist that + // any PHI is locally consumed, so that if we bypass the block we + // don't need to make SSA updates. // // TODO: handle blocks with side effects. For those predecessors that are // favorable (ones that don't reach block via a critical edge), consider // duplicating block's IR into the predecessor. This is the jump threading // analog of the optimization we encourage via fgOptimizeUncondBranchToSimpleCond. // - Statement* const lastStmt = block->lastStmt(); - bool const isPhiRBO = (domBlock == nullptr); - bool hasGlobalPhiUses = false; + Statement* const lastStmt = block->lastStmt(); + bool const isPhiRBO = (domBlock == nullptr); for (Statement* const stmt : block->Statements()) { GenTree* const tree = stmt->GetRootNode(); - // If we are doing PHI-based RBO then each local PHI must either be - // locally consumed or have only tracked uses that we can safely rewrite. + // If we are doing PHI-based RBO then all local PHIs must be locally consumed. // if (stmt->IsPhiDefnStmt()) { @@ -1752,7 +1452,7 @@ Compiler::JumpThreadCheckResult Compiler::optJumpThreadCheck(BasicBlock* const b { JITDUMP(FMT_BB " has phi for promoted field V%02u.%u; no phi-based threading\n", block->bbNum, lclNum, ssaNum); - return JumpThreadCheckResult::CannotThread; + return false; } LclSsaVarDsc* const ssaVarDsc = varDsc->GetPerSsaData(ssaNum); @@ -1762,9 +1462,9 @@ Compiler::JumpThreadCheckResult Compiler::optJumpThreadCheck(BasicBlock* const b // if (ssaVarDsc->HasGlobalUse()) { - JITDUMP(FMT_BB " has global phi for V%02u.%u; no phi-based threading\n", - block->bbNum, lclNum, ssaNum); - return JumpThreadCheckResult::CannotThread; + JITDUMP(FMT_BB " has global phi for V%02u.%u; no phi-based threading\n", block->bbNum, lclNum, + ssaNum); + return false; } } @@ -1802,11 +1502,11 @@ Compiler::JumpThreadCheckResult Compiler::optJumpThreadCheck(BasicBlock* const b } JITDUMP(FMT_BB " has side effects; no threading\n", block->bbNum); - return JumpThreadCheckResult::CannotThread; + return false; } } - return hasGlobalPhiUses ? JumpThreadCheckResult::NeedsPhiUseResolution : JumpThreadCheckResult::CanThread; + return true; } //------------------------------------------------------------------------ @@ -1891,15 +1591,9 @@ bool Compiler::optJumpThreadDom(BasicBlock* const block, BasicBlock* const domBl JITDUMP("Both successors of %sdom " FMT_BB " reach " FMT_BB " -- attempting jump threading\n", isIDom ? "i" : "", domBlock->bbNum, block->bbNum); - const JumpThreadCheckResult check = optJumpThreadCheck(block, domBlock); - if (check == JumpThreadCheckResult::CannotThread) - { - return false; - } - - if (check == JumpThreadCheckResult::NeedsPhiUseResolution) + const bool check = optJumpThreadCheck(block, domBlock); + if (!check) { - JITDUMP(FMT_BB " has global phi uses; no jump threading\n", block->bbNum); return false; } @@ -2025,8 +1719,8 @@ bool Compiler::optJumpThreadPhi(BasicBlock* block, GenTree* tree, ValueNum treeN { // First see if block is eligible for threading. // - const JumpThreadCheckResult check = optJumpThreadCheck(block, /* domBlock*/ nullptr); - if (check == JumpThreadCheckResult::CannotThread) + const bool check = optJumpThreadCheck(block, /* domBlock*/ nullptr); + if (!check) { return false; } @@ -2254,12 +1948,6 @@ bool Compiler::optJumpThreadPhi(BasicBlock* block, GenTree* tree, ValueNum treeN } } - if ((check == JumpThreadCheckResult::NeedsPhiUseResolution) && !optCanRewritePhiUses(jti)) - { - JITDUMP(FMT_BB " has global phi uses we cannot safely account for; no phi-based threading\n", block->bbNum); - return false; - } - // Do the optimization. // return optJumpThreadCore(jti); @@ -2294,48 +1982,6 @@ bool Compiler::optJumpThreadCore(JumpThreadInfo& jti) // JITDUMP("Optimizing via jump threading\n"); - for (JumpThreadInfo::PhiUse& phiUse : jti.m_phiUses.BottomUpOrder()) - { - if (phiUse.m_replacementSsaNum == SsaConfig::RESERVED_SSA_NUM) - { - continue; - } - - GenTreeLclVarCommon* const use = phiUse.m_use; - unsigned const oldSsaNum = use->GetSsaNum(); - - if (oldSsaNum == phiUse.m_replacementSsaNum) - { - continue; - } - - JITDUMP("Updating [%06u] in " FMT_BB " from u:%u to u:%u\n", dspTreeID(use), phiUse.m_block->bbNum, oldSsaNum, - phiUse.m_replacementSsaNum); - - LclSsaVarDsc* const replacementSsaDef = lvaGetDesc(use->GetLclNum())->GetPerSsaData(phiUse.m_replacementSsaNum); - - use->SetSsaNum(phiUse.m_replacementSsaNum); - - // Keep the use's value number in sync with the rewritten SSA def. - if (use->gtVNPair != replacementSsaDef->m_vnPair) - { - use->SetVNs(replacementSsaDef->m_vnPair); - GenTree* node = use; - GenTree* parent = node->gtGetParent(nullptr); - - while ((parent != nullptr) && parent->OperIs(GT_COMMA) && (parent->AsOp()->gtOp2 == node)) - { - JITDUMP(" Updating COMMA parent VN [%06u]\n", dspTreeID(parent)); - ValueNumPair op1Xvnp = vnStore->VNPExceptionSet(parent->AsOp()->gtOp1->gtVNPair); - parent->SetVNs(vnStore->VNPWithExc(parent->AsOp()->gtOp2->gtVNPair, op1Xvnp)); - node = parent; - parent = node->gtGetParent(nullptr); - } - } - - replacementSsaDef->AddUse(phiUse.m_block); - } - bool setNoCseIn = false; // If this is a phi-based threading, and the block we're bypassing has diff --git a/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.cs b/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.cs deleted file mode 100644 index 64fa4824bd23f9..00000000000000 --- a/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.cs +++ /dev/null @@ -1,31 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Numerics; -using System.Runtime.CompilerServices; -using Xunit; - -public class JumpThreadPhi -{ - [MethodImpl(MethodImplOptions.NoInlining)] - private static int Phi_00(int value) - { - int idx = BitOperations.TrailingZeroCount(value); - idx = (idx != 32) ? idx : -1; - - if (idx != -1) - { - return idx + 100; - } - - return -1; - } - - [Fact] - public static void TestPhi00() - { - Assert.Equal(-1, Phi_00(0)); - Assert.Equal(100, Phi_00(1)); - Assert.Equal(103, Phi_00(8)); - } -} diff --git a/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.csproj b/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.csproj deleted file mode 100644 index de6d5e08882e86..00000000000000 --- a/src/tests/JIT/opt/RedundantBranch/JumpThreadPhi.csproj +++ /dev/null @@ -1,8 +0,0 @@ - - - True - - - - - From 7442a6d92d48f7358d0f03fb2647c2d16c8c36c0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 00:59:50 +0000 Subject: [PATCH 13/15] Merge main and apply targeted JIT PHI jump threading fix for #126976 Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/9c1cd1ef-faf7-466e-8a33-b76de1779dcf Co-authored-by: mangod9 <61718172+mangod9@users.noreply.github.com> --- src/coreclr/jit/redundantbranchopts.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/redundantbranchopts.cpp b/src/coreclr/jit/redundantbranchopts.cpp index e7d02b9635144e..e0ba665213ddba 100644 --- a/src/coreclr/jit/redundantbranchopts.cpp +++ b/src/coreclr/jit/redundantbranchopts.cpp @@ -1762,9 +1762,9 @@ Compiler::JumpThreadCheckResult Compiler::optJumpThreadCheck(BasicBlock* const b // if (ssaVarDsc->HasGlobalUse()) { - JITDUMP(FMT_BB " has global phi for V%02u.%u; deferring jump threading pending use analysis\n", + JITDUMP(FMT_BB " has global phi for V%02u.%u; no phi-based threading\n", block->bbNum, lclNum, ssaNum); - hasGlobalPhiUses = true; + return JumpThreadCheckResult::CannotThread; } } From 90a23b2ef8b1009aefb0581de8cb8a6dc01ec3ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Strehovsk=C3=BD?= Date: Thu, 16 Apr 2026 16:10:42 +0900 Subject: [PATCH 14/15] Add project exclusions for arm64 Windows tests Added project exclusions for LibraryImportGenerator and ComInterfaceGenerator tests on arm64 Windows. --- src/libraries/tests.proj | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/libraries/tests.proj b/src/libraries/tests.proj index 1f717aea046908..43615ce89c8823 100644 --- a/src/libraries/tests.proj +++ b/src/libraries/tests.proj @@ -430,6 +430,12 @@ + + + + From 4369c7657be325cd71e2b4ed496c309fe7b9560e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 14:14:59 +0000 Subject: [PATCH 15/15] Fix JITDUMP formatting in redundantbranchopts.cpp to satisfy jitformat checker Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/b06cc629-2471-44a5-aa16-52d70086c932 Co-authored-by: mangod9 <61718172+mangod9@users.noreply.github.com> --- src/coreclr/jit/redundantbranchopts.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/redundantbranchopts.cpp b/src/coreclr/jit/redundantbranchopts.cpp index e0ba665213ddba..fbe16664ab7609 100644 --- a/src/coreclr/jit/redundantbranchopts.cpp +++ b/src/coreclr/jit/redundantbranchopts.cpp @@ -1762,8 +1762,8 @@ Compiler::JumpThreadCheckResult Compiler::optJumpThreadCheck(BasicBlock* const b // if (ssaVarDsc->HasGlobalUse()) { - JITDUMP(FMT_BB " has global phi for V%02u.%u; no phi-based threading\n", - block->bbNum, lclNum, ssaNum); + JITDUMP(FMT_BB " has global phi for V%02u.%u; no phi-based threading\n", block->bbNum, lclNum, + ssaNum); return JumpThreadCheckResult::CannotThread; } }