From 67d1f728d2ba7d2bdcecb64405bc6b337d546be6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 16:56:42 +0000 Subject: [PATCH 1/9] Initial plan From 785de0b73486c116882cfa59e4545417c1289d24 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 17:02:07 +0000 Subject: [PATCH 2/9] Reduce Vector256/512 Sum to single V128 reduction on x64 Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/f98b46df-b011-4898-8d7f-4edea22a5662 Co-authored-by: tannergooding <10487869+tannergooding@users.noreply.github.com> --- src/coreclr/jit/gentree.cpp | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 826fba0d2ce604..b4459a05e498a4 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -28185,6 +28185,12 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd #if defined(TARGET_XARCH) + // For larger vectors, reduce down to a single V128 by adding the upper + // and lower halves together. This avoids duplicating the (relatively + // expensive) V128 horizontal-reduction sequence for each half and keeps + // the per-call-site code size small. The final V128 reduction below + // produces the scalar result. + if (simdSize == 64) { GenTree* op1Dup = fgMakeMultiUse(&op1); @@ -28192,18 +28198,6 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd op1 = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseType, simdSize); op1Dup = gtNewSimdGetUpperNode(TYP_SIMD32, op1Dup, simdBaseType, simdSize); - if (varTypeIsFloating(simdBaseType)) - { - // We need to ensure deterministic results which requires - // consistently adding values together. Since many operations - // end up operating on 128-bit lanes, we break sum the same way. - - op1 = gtNewSimdSumNode(type, op1, simdBaseType, 32); - op1Dup = gtNewSimdSumNode(type, op1Dup, simdBaseType, 32); - - return gtNewOperNode(GT_ADD, type, op1, op1Dup); - } - simdSize = 32; op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD32, op1, op1Dup, simdBaseType, 32); } @@ -28215,18 +28209,6 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd op1 = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseType, simdSize); op1Dup = gtNewSimdGetUpperNode(TYP_SIMD16, op1Dup, simdBaseType, simdSize); - if (varTypeIsFloating(simdBaseType)) - { - // We need to ensure deterministic results which requires - // consistently adding values together. Since many operations - // end up operating on 128-bit lanes, we break sum the same way. 
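        // A standalone sketch (plain AVX2 intrinsics from <immintrin.h>, not
        // JIT IR; the helper name is illustrative) of the halves-combine the
        // comment above describes, for a 256-bit vector of int32: one extract
        // plus one element-wise add replaces an entire second V128
        // horizontal-reduction sequence.
        //
        //     #include <immintrin.h>
        //
        //     __m128i FoldTo128(__m256i v)
        //     {
        //         __m128i lower = _mm256_castsi256_si128(v);      // GetLower (a no-op)
        //         __m128i upper = _mm256_extracti128_si256(v, 1); // GetUpper
        //         return _mm_add_epi32(lower, upper);             // element-wise add
        //     }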
- - op1 = gtNewSimdSumNode(type, op1, simdBaseType, 16); - op1Dup = gtNewSimdSumNode(type, op1Dup, simdBaseType, 16); - - return gtNewOperNode(GT_ADD, type, op1, op1Dup); - } - simdSize = 16; op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1, op1Dup, simdBaseType, 16); } From 633a2848b1e8419d41b3f03a6d58d3d6777d5136 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 17:56:40 +0000 Subject: [PATCH 3/9] Preserve FP sum ordering: reduce at full width, then combine halves Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/7dd925c6-f17d-475f-a448-008418ad164a Co-authored-by: tannergooding <10487869+tannergooding@users.noreply.github.com> --- src/coreclr/jit/gentree.cpp | 153 +++++++++++++++++++++++++----------- 1 file changed, 106 insertions(+), 47 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index b4459a05e498a4..ebd750965e0b3c 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -28185,56 +28185,45 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd #if defined(TARGET_XARCH) - // For larger vectors, reduce down to a single V128 by adding the upper - // and lower halves together. This avoids duplicating the (relatively - // expensive) V128 horizontal-reduction sequence for each half and keeps - // the per-call-site code size small. The final V128 reduction below - // produces the scalar result. - - if (simdSize == 64) - { - GenTree* op1Dup = fgMakeMultiUse(&op1); - - op1 = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseType, simdSize); - op1Dup = gtNewSimdGetUpperNode(TYP_SIMD32, op1Dup, simdBaseType, simdSize); - - simdSize = 32; - op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD32, op1, op1Dup, simdBaseType, 32); - } - - if (simdSize == 32) - { - GenTree* op1Dup = fgMakeMultiUse(&op1); - - op1 = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseType, simdSize); - op1Dup = gtNewSimdGetUpperNode(TYP_SIMD16, op1Dup, simdBaseType, simdSize); - - simdSize = 16; - op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1, op1Dup, simdBaseType, 16); - } - - assert(simdSize == 16); - if (varTypeIsFloating(simdBaseType)) { + // For floating-point we first run the horizontal permute+add sequence + // at the full simd width. vpermilps/vpermilpd permute WITHIN each + // 128-bit lane, so this is effectively 2x (V256) or 4x (V512) V128 + // reductions running in parallel with no duplicated work. + // + // After that, each 128-bit lane of op1 holds the sum of its elements + // broadcast across the lane. We then reduce the lanes to a single + // scalar by combining halves in Sum(lower) + Sum(upper) order, which + // is the order used by the managed software fallback. That order is + // preserved here because floating-point addition is not associative. 
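        // A standalone illustration (plain C++, not JIT IR) of why that order
        // matters: for a 4-element vector split into 2-element halves, the
        // Sum(lower) + Sum(upper) shape and the fold-halves-first shape used
        // for integers group the additions differently, and float addition is
        // not associative.
        //
        //     #include <cstdio>
        //
        //     int main()
        //     {
        //         float e[4] = {1e8f, 1.0f, -1e8f, 1.0f};
        //
        //         float halves = (e[0] + e[1]) + (e[2] + e[3]); // Sum(lower) + Sum(upper)
        //         float folded = (e[0] + e[2]) + (e[1] + e[3]); // fold halves, then reduce
        //
        //         printf("%g vs %g\n", halves, folded);         // prints "0 vs 2"
        //         return 0;
        //     }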
+ if (simdBaseType == TYP_FLOAT) { GenTree* op1Shuffled = fgMakeMultiUse(&op1); - if (compOpportunisticallyDependsOn(InstructionSet_AVX)) + NamedIntrinsic permIntrinsic = NI_AVX_Permute; + if (simdSize == 64) + { + // vpermilps above 256-bit requires AVX-512 encoding + permIntrinsic = NI_AVX512_Permute4x32; + } + + if ((simdSize > 16) || compOpportunisticallyDependsOn(InstructionSet_AVX)) { - // The permute below gives us [0, 1, 2, 3] -> [1, 0, 3, 2] - op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtNewIconNode((int)0b10110001, TYP_INT), NI_AVX_Permute, + // Per lane, the permute below gives us [0, 1, 2, 3] -> [1, 0, 3, 2] + op1 = gtNewSimdHWIntrinsicNode(simdType, op1, gtNewIconNode((int)0b10110001, TYP_INT), permIntrinsic, simdBaseType, simdSize); - // The add below now results in [0 + 1, 1 + 0, 2 + 3, 3 + 2] - op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1, op1Shuffled, simdBaseType, simdSize); + // Per lane, the add below now results in [0 + 1, 1 + 0, 2 + 3, 3 + 2] + op1 = gtNewSimdBinOpNode(GT_ADD, simdType, op1, op1Shuffled, simdBaseType, simdSize); op1Shuffled = fgMakeMultiUse(&op1); - // The permute below gives us [0 + 1, 1 + 0, 2 + 3, 3 + 2] -> [2 + 3, 3 + 2, 0 + 1, 1 + 0] - op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtNewIconNode((int)0b01001110, TYP_INT), NI_AVX_Permute, + // Per lane, the permute below gives us [0 + 1, 1 + 0, 2 + 3, 3 + 2] -> [2 + 3, 3 + 2, 0 + 1, 1 + 0] + op1 = gtNewSimdHWIntrinsicNode(simdType, op1, gtNewIconNode((int)0b01001110, TYP_INT), permIntrinsic, simdBaseType, simdSize); } else { + assert(simdSize == 16); // The shuffle below gives us [0, 1, 2, 3] -> [1, 0, 3, 2] op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op1Shuffled, gtNewIconNode((int)0b10110001, TYP_INT), NI_X86Base_Shuffle, simdBaseType, simdSize); @@ -28247,34 +28236,104 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd NI_X86Base_Shuffle, simdBaseType, simdSize); op1Shuffled = fgMakeMultiUse(&op1Shuffled); } - // Finally adding the results gets us [(0 + 1) + (2 + 3), (1 + 0) + (3 + 2), (2 + 3) + (0 + 1), (3 + 2) + (1 - // + 0)] - op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1, op1Shuffled, simdBaseType, simdSize); - return gtNewSimdToScalarNode(type, op1, simdBaseType, simdSize); + // Per lane, adding the results gets us [(0 + 1) + (2 + 3), (1 + 0) + (3 + 2), (2 + 3) + (0 + 1), + // (3 + 2) + (1 + 0)] + op1 = gtNewSimdBinOpNode(GT_ADD, simdType, op1, op1Shuffled, simdBaseType, simdSize); } else { GenTree* op1Shuffled = fgMakeMultiUse(&op1); - if (compOpportunisticallyDependsOn(InstructionSet_AVX)) + NamedIntrinsic permIntrinsic = NI_AVX_Permute; + if (simdSize == 64) + { + // vpermilpd above 256-bit requires AVX-512 encoding + permIntrinsic = NI_AVX512_Permute2x64; + } + + if ((simdSize > 16) || compOpportunisticallyDependsOn(InstructionSet_AVX)) { - // The permute below gives us [0, 1] -> [1, 0] - op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtNewIconNode((int)0b0001, TYP_INT), NI_AVX_Permute, + // Per lane, the permute below gives us [0, 1] -> [1, 0] + op1 = gtNewSimdHWIntrinsicNode(simdType, op1, gtNewIconNode((int)0b0001, TYP_INT), permIntrinsic, simdBaseType, simdSize); } else { + assert(simdSize == 16); // The shuffle below gives us [0, 1] -> [1, 0] op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op1Shuffled, gtNewIconNode((int)0b0001, TYP_INT), NI_X86Base_Shuffle, simdBaseType, simdSize); op1Shuffled = fgMakeMultiUse(&op1Shuffled); } - // Finally adding the results gets us [0 + 1, 1 + 0] - op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, 
op1, op1Shuffled, simdBaseType, simdSize); - return gtNewSimdToScalarNode(type, op1, simdBaseType, simdSize); + // Per lane, adding the results gets us [0 + 1, 1 + 0] + op1 = gtNewSimdBinOpNode(GT_ADD, simdType, op1, op1Shuffled, simdBaseType, simdSize); } + + // At this point every 128-bit lane of op1 contains that lane's reduced + // sum broadcast across the lane. Combine the lanes into a single V128 + // while preserving the Sum(lower) + Sum(upper) order used by the + // managed fallback. + + if (simdSize == 64) + { + GenTree* op1Dup = fgMakeMultiUse(&op1); + + GenTree* lower = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseType, 64); + GenTree* upper = gtNewSimdGetUpperNode(TYP_SIMD32, op1Dup, simdBaseType, 64); + + GenTree* lowerDup = fgMakeMultiUse(&lower); + GenTree* lowerLo = gtNewSimdGetLowerNode(TYP_SIMD16, lower, simdBaseType, 32); + GenTree* lowerHi = gtNewSimdGetUpperNode(TYP_SIMD16, lowerDup, simdBaseType, 32); + lower = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, lowerLo, lowerHi, simdBaseType, 16); + + GenTree* upperDup = fgMakeMultiUse(&upper); + GenTree* upperLo = gtNewSimdGetLowerNode(TYP_SIMD16, upper, simdBaseType, 32); + GenTree* upperHi = gtNewSimdGetUpperNode(TYP_SIMD16, upperDup, simdBaseType, 32); + upper = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, upperLo, upperHi, simdBaseType, 16); + + op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, lower, upper, simdBaseType, 16); + } + else if (simdSize == 32) + { + GenTree* op1Dup = fgMakeMultiUse(&op1); + + GenTree* lower = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseType, 32); + GenTree* upper = gtNewSimdGetUpperNode(TYP_SIMD16, op1Dup, simdBaseType, 32); + + op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, lower, upper, simdBaseType, 16); + } + + return gtNewSimdToScalarNode(type, op1, simdBaseType, 16); } + // Integer: integer addition is associative, so we can safely reduce the + // upper/lower halves element-wise down to a single V128 before running + // the V128 reduction. 
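    // A standalone sketch (SSE2 intrinsics; one plausible reading of the
    // genLog2-driven sequence built below, not the JIT's exact output) of the
    // final V128 integer reduction: shiftCount = log2(vectorLength) rounds,
    // each shifting the vector right by (typeSize << i) bytes and adding it
    // onto itself, leaving the total in element 0. Helper name illustrative.
    //
    //     #include <immintrin.h>
    //     #include <cstdint>
    //
    //     int32_t SumV128Int32(__m128i v)
    //     {
    //         // vectorLength = 4, typeSize = 4, shiftCount = log2(4) = 2
    //         v = _mm_add_epi32(v, _mm_srli_si128(v, 8)); // i = 1: shift 4 << 1 bytes
    //         v = _mm_add_epi32(v, _mm_srli_si128(v, 4)); // i = 0: shift 4 << 0 bytes
    //         return _mm_cvtsi128_si32(v);                // element 0 holds the sum
    //     }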
+ + if (simdSize == 64) + { + GenTree* op1Dup = fgMakeMultiUse(&op1); + + op1 = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseType, simdSize); + op1Dup = gtNewSimdGetUpperNode(TYP_SIMD32, op1Dup, simdBaseType, simdSize); + + simdSize = 32; + op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD32, op1, op1Dup, simdBaseType, 32); + } + + if (simdSize == 32) + { + GenTree* op1Dup = fgMakeMultiUse(&op1); + + op1 = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseType, simdSize); + op1Dup = gtNewSimdGetUpperNode(TYP_SIMD16, op1Dup, simdBaseType, simdSize); + + simdSize = 16; + op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1, op1Dup, simdBaseType, 16); + } + + assert(simdSize == 16); + unsigned vectorLength = getSIMDVectorLength(simdSize, simdBaseType); int shiftCount = genLog2(vectorLength); int typeSize = genTypeSize(simdBaseType); From 30ff4d15ab12a7eb93637de096fd708dee624a52 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 18:35:58 +0000 Subject: [PATCH 4/9] Simplify ternary and fall through V512->V256->V128 halves-combine Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/873ded98-dc23-47f4-9027-6bfb940355ab Co-authored-by: tannergooding <10487869+tannergooding@users.noreply.github.com> --- src/coreclr/jit/gentree.cpp | 64 +++++++++++++++---------------------- 1 file changed, 25 insertions(+), 39 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ebd750965e0b3c..2717716928b598 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -28193,21 +28193,16 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd // reductions running in parallel with no duplicated work. // // After that, each 128-bit lane of op1 holds the sum of its elements - // broadcast across the lane. We then reduce the lanes to a single - // scalar by combining halves in Sum(lower) + Sum(upper) order, which - // is the order used by the managed software fallback. That order is - // preserved here because floating-point addition is not associative. + // broadcast across the lane. We then reduce the lanes by combining + // upper/lower halves step-by-step down to a single V128. Floating- + // point addition is not associative, so the exact IR shape of the + // half-combine is deliberately preserved and matches the integer + // path below. if (simdBaseType == TYP_FLOAT) { - GenTree* op1Shuffled = fgMakeMultiUse(&op1); - - NamedIntrinsic permIntrinsic = NI_AVX_Permute; - if (simdSize == 64) - { - // vpermilps above 256-bit requires AVX-512 encoding - permIntrinsic = NI_AVX512_Permute4x32; - } + GenTree* op1Shuffled = fgMakeMultiUse(&op1); + NamedIntrinsic permIntrinsic = (simdSize == 64) ? NI_AVX512_Permute4x32 : NI_AVX_Permute; if ((simdSize > 16) || compOpportunisticallyDependsOn(InstructionSet_AVX)) { @@ -28242,14 +28237,8 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd } else { - GenTree* op1Shuffled = fgMakeMultiUse(&op1); - - NamedIntrinsic permIntrinsic = NI_AVX_Permute; - if (simdSize == 64) - { - // vpermilpd above 256-bit requires AVX-512 encoding - permIntrinsic = NI_AVX512_Permute2x64; - } + GenTree* op1Shuffled = fgMakeMultiUse(&op1); + NamedIntrinsic permIntrinsic = (simdSize == 64) ? 
NI_AVX512_Permute2x64 : NI_AVX_Permute; if ((simdSize > 16) || compOpportunisticallyDependsOn(InstructionSet_AVX)) { @@ -28271,38 +28260,35 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd // At this point every 128-bit lane of op1 contains that lane's reduced // sum broadcast across the lane. Combine the lanes into a single V128 - // while preserving the Sum(lower) + Sum(upper) order used by the - // managed fallback. + // by combining upper/lower halves step-by-step, mirroring the integer + // path below: + // vector256 = vector512.GetLower() + vector512.GetUpper() + // vector128 = vector256.GetLower() + vector256.GetUpper() + // return vector128.ToScalar() if (simdSize == 64) { GenTree* op1Dup = fgMakeMultiUse(&op1); - GenTree* lower = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseType, 64); - GenTree* upper = gtNewSimdGetUpperNode(TYP_SIMD32, op1Dup, simdBaseType, 64); + op1 = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseType, 64); + op1Dup = gtNewSimdGetUpperNode(TYP_SIMD32, op1Dup, simdBaseType, 64); - GenTree* lowerDup = fgMakeMultiUse(&lower); - GenTree* lowerLo = gtNewSimdGetLowerNode(TYP_SIMD16, lower, simdBaseType, 32); - GenTree* lowerHi = gtNewSimdGetUpperNode(TYP_SIMD16, lowerDup, simdBaseType, 32); - lower = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, lowerLo, lowerHi, simdBaseType, 16); - - GenTree* upperDup = fgMakeMultiUse(&upper); - GenTree* upperLo = gtNewSimdGetLowerNode(TYP_SIMD16, upper, simdBaseType, 32); - GenTree* upperHi = gtNewSimdGetUpperNode(TYP_SIMD16, upperDup, simdBaseType, 32); - upper = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, upperLo, upperHi, simdBaseType, 16); - - op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, lower, upper, simdBaseType, 16); + simdSize = 32; + op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD32, op1, op1Dup, simdBaseType, 32); } - else if (simdSize == 32) + + if (simdSize == 32) { GenTree* op1Dup = fgMakeMultiUse(&op1); - GenTree* lower = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseType, 32); - GenTree* upper = gtNewSimdGetUpperNode(TYP_SIMD16, op1Dup, simdBaseType, 32); + op1 = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseType, 32); + op1Dup = gtNewSimdGetUpperNode(TYP_SIMD16, op1Dup, simdBaseType, 32); - op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, lower, upper, simdBaseType, 16); + simdSize = 16; + op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1, op1Dup, simdBaseType, 16); } + assert(simdSize == 16); return gtNewSimdToScalarNode(type, op1, simdBaseType, 16); } From ed85ea4b331bc364c3a8d681fe075e8d7f8f1286 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:11:56 +0000 Subject: [PATCH 5/9] Align = on op1Shuffled/permIntrinsic declarations Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/5ae7cd14-a6cd-4a9c-a559-dae9287ab791 Co-authored-by: tannergooding <10487869+tannergooding@users.noreply.github.com> --- src/coreclr/jit/gentree.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 2717716928b598..90a85bf50f297f 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -28201,7 +28201,7 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd if (simdBaseType == TYP_FLOAT) { - GenTree* op1Shuffled = fgMakeMultiUse(&op1); + GenTree* op1Shuffled = fgMakeMultiUse(&op1); NamedIntrinsic permIntrinsic = (simdSize == 64) ? 
NI_AVX512_Permute4x32 : NI_AVX_Permute; if ((simdSize > 16) || compOpportunisticallyDependsOn(InstructionSet_AVX)) @@ -28237,7 +28237,7 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd } else { - GenTree* op1Shuffled = fgMakeMultiUse(&op1); + GenTree* op1Shuffled = fgMakeMultiUse(&op1); NamedIntrinsic permIntrinsic = (simdSize == 64) ? NI_AVX512_Permute2x64 : NI_AVX_Permute; if ((simdSize > 16) || compOpportunisticallyDependsOn(InstructionSet_AVX)) From 3a5a082cdcb6fa7ee95c61416696b3716937a4c4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:51:30 +0000 Subject: [PATCH 6/9] V512 float Sum: preserve (s0+s1)+(s2+s3) halve-combine grouping Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/e62caef7-2b87-4b6c-bc83-401d7672fdfc Co-authored-by: tannergooding <10487869+tannergooding@users.noreply.github.com> --- src/coreclr/jit/gentree.cpp | 45 ++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 90a85bf50f297f..8a0c4b8b67356f 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -28195,9 +28195,9 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd // After that, each 128-bit lane of op1 holds the sum of its elements // broadcast across the lane. We then reduce the lanes by combining // upper/lower halves step-by-step down to a single V128. Floating- - // point addition is not associative, so the exact IR shape of the - // half-combine is deliberately preserved and matches the integer - // path below. + // point addition is not associative, so the halve-combine grouping + // below deliberately preserves the prior recursive + // `Sum(lower) + Sum(upper)` shape. if (simdBaseType == TYP_FLOAT) { @@ -28260,24 +28260,39 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd // At this point every 128-bit lane of op1 contains that lane's reduced // sum broadcast across the lane. Combine the lanes into a single V128 - // by combining upper/lower halves step-by-step, mirroring the integer - // path below: - // vector256 = vector512.GetLower() + vector512.GetUpper() - // vector128 = vector256.GetLower() + vector256.GetUpper() - // return vector128.ToScalar() + // by reducing upper/lower halves step-by-step. Floating-point addition + // is not associative, so the grouping used here deliberately matches + // the prior recursive shape: + // V512: Sum = Sum(v512.GetLower()) + Sum(v512.GetUpper()) + // V256: Sum = (v256.GetLower() + v256.GetUpper()).ToScalar() + // V128: Sum = v128.ToScalar() if (simdSize == 64) { - GenTree* op1Dup = fgMakeMultiUse(&op1); + // Split v512 into its two V256 halves and reduce each to a V128 + // independently (GetLower + GetUpper + Add), then add the two + // V128 results. This preserves `Sum(lower256) + Sum(upper256)` + // grouping, i.e. `(s0 + s1) + (s2 + s3)` where s_i is the sum + // of the i-th 128-bit lane. 
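            // A standalone sketch (AVX plus the AVX-512DQ _mm512_extractf32x8_ps
            // intrinsic; helper name illustrative) of the halve-combine built
            // below, assuming each 128-bit lane of v already holds its lane sum
            // s0..s3 broadcast across the lane:
            //
            //     #include <immintrin.h>
            //
            //     __m128 CombineLanes(__m512 v)
            //     {
            //         __m256 lower = _mm512_castps512_ps256(v);    // lanes 0,1 (GetLower)
            //         __m256 upper = _mm512_extractf32x8_ps(v, 1); // lanes 2,3 (GetUpper)
            //
            //         __m128 s01 = _mm_add_ps(_mm256_castps256_ps128(lower),
            //                                 _mm256_extractf128_ps(lower, 1)); // s0 + s1
            //         __m128 s23 = _mm_add_ps(_mm256_castps256_ps128(upper),
            //                                 _mm256_extractf128_ps(upper, 1)); // s2 + s3
            //
            //         return _mm_add_ps(s01, s23); // (s0 + s1) + (s2 + s3)
            //     }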
+ GenTree* op1Upper = fgMakeMultiUse(&op1); - op1 = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseType, 64); - op1Dup = gtNewSimdGetUpperNode(TYP_SIMD32, op1Dup, simdBaseType, 64); + op1 = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseType, 64); + op1Upper = gtNewSimdGetUpperNode(TYP_SIMD32, op1Upper, simdBaseType, 64); - simdSize = 32; - op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD32, op1, op1Dup, simdBaseType, 32); - } + GenTree* tmpDup = fgMakeMultiUse(&op1); + op1 = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseType, 32); + tmpDup = gtNewSimdGetUpperNode(TYP_SIMD16, tmpDup, simdBaseType, 32); + op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1, tmpDup, simdBaseType, 16); - if (simdSize == 32) + tmpDup = fgMakeMultiUse(&op1Upper); + op1Upper = gtNewSimdGetLowerNode(TYP_SIMD16, op1Upper, simdBaseType, 32); + tmpDup = gtNewSimdGetUpperNode(TYP_SIMD16, tmpDup, simdBaseType, 32); + op1Upper = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1Upper, tmpDup, simdBaseType, 16); + + simdSize = 16; + op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1, op1Upper, simdBaseType, 16); + } + else if (simdSize == 32) { GenTree* op1Dup = fgMakeMultiUse(&op1); From 98cc1c938fc535ef61a640ccc25d54d7e7943e08 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 20:19:42 +0000 Subject: [PATCH 7/9] V512 FP Sum: extract 4 lanes directly via GetLower128/ExtractVector128 Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/7dae7a9d-0582-4e67-b63d-83e84b191141 Co-authored-by: tannergooding <10487869+tannergooding@users.noreply.github.com> --- src/coreclr/jit/gentree.cpp | 40 ++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 8a0c4b8b67356f..57625367183bab 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -28269,28 +28269,28 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd if (simdSize == 64) { - // Split v512 into its two V256 halves and reduce each to a V128 - // independently (GetLower + GetUpper + Add), then add the two - // V128 results. This preserves `Sum(lower256) + Sum(upper256)` - // grouping, i.e. `(s0 + s1) + (s2 + s3)` where s_i is the sum - // of the i-th 128-bit lane. - GenTree* op1Upper = fgMakeMultiUse(&op1); - - op1 = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseType, 64); - op1Upper = gtNewSimdGetUpperNode(TYP_SIMD32, op1Upper, simdBaseType, 64); - - GenTree* tmpDup = fgMakeMultiUse(&op1); - op1 = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseType, 32); - tmpDup = gtNewSimdGetUpperNode(TYP_SIMD16, tmpDup, simdBaseType, 32); - op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1, tmpDup, simdBaseType, 16); - - tmpDup = fgMakeMultiUse(&op1Upper); - op1Upper = gtNewSimdGetLowerNode(TYP_SIMD16, op1Upper, simdBaseType, 32); - tmpDup = gtNewSimdGetUpperNode(TYP_SIMD16, tmpDup, simdBaseType, 32); - op1Upper = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1Upper, tmpDup, simdBaseType, 16); + // Extract each of the four 128-bit lanes directly from the V512 + // using GetLower128 (lane 0) and AVX512 ExtractVector128 (lanes + // 1-3), then combine as `(s0 + s1) + (s2 + s3)` to preserve the + // prior recursive `Sum(lower256) + Sum(upper256)` grouping. 
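            // The same combine, sketched with the direct extraction this patch
            // switches to (AVX-512F intrinsics; helper name illustrative): all
            // four 128-bit lanes come straight out of the V512, with no V256
            // temporaries, and the adds keep the same grouping.
            //
            //     #include <immintrin.h>
            //
            //     __m128 CombineLanesDirect(__m512 v)
            //     {
            //         __m128 s0 = _mm512_castps512_ps128(v);    // lane 0 (GetLower128)
            //         __m128 s1 = _mm512_extractf32x4_ps(v, 1); // lane 1
            //         __m128 s2 = _mm512_extractf32x4_ps(v, 2); // lane 2
            //         __m128 s3 = _mm512_extractf32x4_ps(v, 3); // lane 3
            //
            //         return _mm_add_ps(_mm_add_ps(s0, s1),     // (s0 + s1)
            //                           _mm_add_ps(s2, s3));    //           + (s2 + s3)
            //     }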
+ GenTree* op1Lane1 = fgMakeMultiUse(&op1); + GenTree* op1Lane2 = fgMakeMultiUse(&op1); + GenTree* op1Lane3 = fgMakeMultiUse(&op1); + + GenTree* op1Lane0 = + gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector512_GetLower128, simdBaseType, 64); + op1Lane1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Lane1, gtNewIconNode(1), NI_AVX512_ExtractVector128, + simdBaseType, 64); + op1Lane2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Lane2, gtNewIconNode(2), NI_AVX512_ExtractVector128, + simdBaseType, 64); + op1Lane3 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Lane3, gtNewIconNode(3), NI_AVX512_ExtractVector128, + simdBaseType, 64); + + GenTree* lowerSum = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1Lane0, op1Lane1, simdBaseType, 16); + GenTree* upperSum = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1Lane2, op1Lane3, simdBaseType, 16); simdSize = 16; - op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, op1, op1Upper, simdBaseType, 16); + op1 = gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, lowerSum, upperSum, simdBaseType, 16); } else if (simdSize == 32) { From 4935bf7e53a691005943cd05d15099c66c3fd63d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 12:49:57 +0000 Subject: [PATCH 8/9] JIT: Use 0b01010101 for TYP_DOUBLE vpermilpd imm to cover all lanes Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/0fd8d936-8f2e-4c10-a124-9a16a5998182 Co-authored-by: tannergooding <10487869+tannergooding@users.noreply.github.com> --- src/coreclr/jit/gentree.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 57625367183bab..504b887d1f7612 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -28243,7 +28243,9 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd if ((simdSize > 16) || compOpportunisticallyDependsOn(InstructionSet_AVX)) { // Per lane, the permute below gives us [0, 1] -> [1, 0] - op1 = gtNewSimdHWIntrinsicNode(simdType, op1, gtNewIconNode((int)0b0001, TYP_INT), permIntrinsic, + // vpermilpd uses one imm bit per double element (2 for V128, 4 for V256, + // 8 for V512); 0b01010101 swaps within each 128-bit lane at all widths. 
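            // A standalone check (AVX intrinsics; helper name illustrative) of
            // that immediate encoding: each selector bit picks one of the two
            // doubles within its own 128-bit lane, so the per-lane swap pattern
            // is 0b01, repeated once per lane.
            //
            //     #include <immintrin.h>
            //
            //     __m256d SwapWithinLanes(__m256d v)
            //     {
            //         // imm 0b0101 gives [1, 0] in lane 0 and [3, 2] in lane 1;
            //         // the V512 form repeats the same two bits twice more,
            //         // hence 0b01010101.
            //         return _mm256_permute_pd(v, 0b0101);
            //     }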
+ op1 = gtNewSimdHWIntrinsicNode(simdType, op1, gtNewIconNode((int)0b01010101, TYP_INT), permIntrinsic, simdBaseType, simdSize); } else From 08cbe42d4ffa1f1b6ecba1ceef741d895e33fd6f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 21:08:49 +0000 Subject: [PATCH 9/9] JIT: Put V512 GetLower128 call on a single line to satisfy jit-format Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/9e0c910c-c854-4c8e-8436-95f44c4c8071 Co-authored-by: tannergooding <10487869+tannergooding@users.noreply.github.com> --- src/coreclr/jit/gentree.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 504b887d1f7612..d74d6d1449168d 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -28279,8 +28279,7 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, var_types simd GenTree* op1Lane2 = fgMakeMultiUse(&op1); GenTree* op1Lane3 = fgMakeMultiUse(&op1); - GenTree* op1Lane0 = - gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector512_GetLower128, simdBaseType, 64); + GenTree* op1Lane0 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector512_GetLower128, simdBaseType, 64); op1Lane1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Lane1, gtNewIconNode(1), NI_AVX512_ExtractVector128, simdBaseType, 64); op1Lane2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Lane2, gtNewIconNode(2), NI_AVX512_ExtractVector128,
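
Taken together, the series leaves the float V512 Sum path with the following
shape — a standalone sketch in AVX-512 intrinsics, not the JIT's IR, with an
illustrative helper name: permute+add at full width so every 128-bit lane ends
up holding its own sum in each element, then extract the four lanes and combine
them as (s0 + s1) + (s2 + s3).

    #include <immintrin.h>

    float SumV512Float(__m512 v)
    {
        // Per lane: [0, 1, 2, 3] -> [1, 0, 3, 2], then add.
        v = _mm512_add_ps(v, _mm512_permute_ps(v, 0b10110001));
        // Per lane: [2, 3, 0, 1], then add; every element now holds its lane sum.
        v = _mm512_add_ps(v, _mm512_permute_ps(v, 0b01001110));

        // Combine lanes as (s0 + s1) + (s2 + s3) and take element 0.
        __m128 s01 = _mm_add_ps(_mm512_castps512_ps128(v),
                                _mm512_extractf32x4_ps(v, 1));
        __m128 s23 = _mm_add_ps(_mm512_extractf32x4_ps(v, 2),
                                _mm512_extractf32x4_ps(v, 3));
        return _mm_cvtss_f32(_mm_add_ps(s01, s23));
    }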