From c7c7c22b541a48980203f503376868d44bceeb07 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 25 Apr 2026 19:35:11 +0200 Subject: [PATCH 1/4] Use safe Span.Slice loop pattern in Enumerable.SumSignedIntegersVectorized Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../System.Linq/src/System/Linq/Sum.cs | 35 +++++++------------ 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/src/libraries/System.Linq/src/System/Linq/Sum.cs b/src/libraries/System.Linq/src/System/Linq/Sum.cs index 481997b8f82633..203555bed43ed4 100644 --- a/src/libraries/System.Linq/src/System/Linq/Sum.cs +++ b/src/libraries/System.Linq/src/System/Linq/Sum.cs @@ -5,7 +5,6 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; namespace System.Linq { @@ -84,9 +83,6 @@ private static T SumSignedIntegersVectorized(ReadOnlySpan span) Debug.Assert(Vector.Count > 2); Debug.Assert(Vector.IsHardwareAccelerated); - ref T ptr = ref MemoryMarshal.GetReference(span); - nuint length = (nuint)span.Length; - // Overflow testing for vectors is based on setting the sign bit of the overflowTracking // vector for an element if the following are all true: // - The two elements being summed have the same sign bit. If one element is positive @@ -120,24 +116,22 @@ private static T SumSignedIntegersVectorized(ReadOnlySpan span) // Unroll the loop to sum 4 vectors per iteration. This reduces range check // and overflow check frequency, allows us to eliminate move operations swapping // accumulators, and may have pipelining benefits. 
- nuint index = 0; - nuint limit = length - (nuint)Vector.Count * 4; - do + while (span.Length >= Vector.Count * 4) { // Switch accumulators with each step to avoid an additional move operation - Vector data = Vector.LoadUnsafe(ref ptr, index); + Vector data = Vector.Create(span); Vector accumulator2 = accumulator + data; Vector overflowTracking = (accumulator2 ^ accumulator) & (accumulator2 ^ data); - data = Vector.LoadUnsafe(ref ptr, index + (nuint)Vector.Count); + data = Vector.Create(span.Slice(Vector.Count)); accumulator = accumulator2 + data; overflowTracking |= (accumulator ^ accumulator2) & (accumulator ^ data); - data = Vector.LoadUnsafe(ref ptr, index + (nuint)Vector.Count * 2); + data = Vector.Create(span.Slice(Vector.Count * 2)); accumulator2 = accumulator + data; overflowTracking |= (accumulator2 ^ accumulator) & (accumulator2 ^ data); - data = Vector.LoadUnsafe(ref ptr, index + (nuint)Vector.Count * 3); + data = Vector.Create(span.Slice(Vector.Count * 3)); accumulator = accumulator2 + data; overflowTracking |= (accumulator ^ accumulator2) & (accumulator ^ data); @@ -146,24 +140,23 @@ private static T SumSignedIntegersVectorized(ReadOnlySpan span) ThrowHelper.ThrowOverflowException(); } - index += (nuint)Vector.Count * 4; - } while (index < limit); + span = span.Slice(Vector.Count * 4); + } // Process remaining vectors, if any, without unrolling - limit = length - (nuint)Vector.Count; - if (index < limit) + if (span.Length >= Vector.Count) { Vector overflowTracking = Vector.Zero; do { - Vector data = Vector.LoadUnsafe(ref ptr, index); + Vector data = Vector.Create(span); Vector accumulator2 = accumulator + data; overflowTracking |= (accumulator2 ^ accumulator) & (accumulator2 ^ data); accumulator = accumulator2; - index += (nuint)Vector.Count; - } while (index < limit); + span = span.Slice(Vector.Count); + } while (span.Length >= Vector.Count); if ((overflowTracking & overflowTestVector) != Vector.Zero) { @@ -180,11 +173,9 @@ private static T 
SumSignedIntegersVectorized(ReadOnlySpan span) } // Add any remaining elements - while (index < length) + foreach (T value in span) { - checked { result += Unsafe.Add(ref ptr, index); } - - index++; + checked { result += value; } } return result; From dd072a0e484f9f89288a9d5a8793c4648bc0531c Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 26 Apr 2026 02:39:51 +0200 Subject: [PATCH 2/4] Defer overflow-vector check to end of loop Hoist `overflowTracking` out of the unrolled main loop and the tail loop so a single `vptest + jne` runs once after all data is processed instead of every 4 vectors. The sign-bit overflow trick is unchanged: the bits accumulated across the whole input still land in the same lanes, so a single final test is sufficient. Also unifies the previous separate `if (... >= Count) { Vector overflowTracking = Zero; do { ... } while (...); if (...) Throw; }` tail block into the same shared `overflowTracking` chain as the main loop, which lets us drop the `do..while` and the second sign-bit test entirely. Microbenchmark on Ryzen 9 7950X (AVX-512, `int[].Sum`): ``` | N | Original | This PR | Ratio | |------- |---------:|---------:|------:| | 50 | 4.85 ns | 4.71 ns | 0.97x | | 100 | 6.11 ns | 5.87 ns | 0.96x | | 100000 | 4.10 us | 3.77 us | 0.92x | ``` Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../System.Linq/src/System/Linq/Sum.cs | 45 ++++++++----------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/src/libraries/System.Linq/src/System/Linq/Sum.cs b/src/libraries/System.Linq/src/System/Linq/Sum.cs index 203555bed43ed4..a9187f9ebb40b6 100644 --- a/src/libraries/System.Linq/src/System/Linq/Sum.cs +++ b/src/libraries/System.Linq/src/System/Linq/Sum.cs @@ -100,28 +100,29 @@ private static T SumSignedIntegersVectorized(ReadOnlySpan span) // Thus, if we had a sign swap compared to both inputs, then signof(input1) == signof(input2) and // we must have overflowed. 
// - // By bitwise or-ing the overflowTracking vector for each step we can save cycles by testing - // the sign bits less often. If any iteration has the sign bit set in any element it indicates - // there was an overflow. + // By bitwise or-ing the overflowTracking vector throughout the entire loop and + // only testing it once at the end we save the cost of an in-loop test+branch per + // iteration. If any accumulation across the whole input has the sign bit set in + // any element it indicates there was an overflow. // // Note: The overflow checking in this algorithm is only correct for signed integers. // If support is ever added for unsigned integers then the overflow check should be: // overflowTracking |= (input1 & input2) | Vector.AndNot(input1 | input2, result); Vector accumulator = Vector.Zero; + Vector overflowTracking = Vector.Zero; // Build a test vector with only the sign bit set in each element. Vector overflowTestVector = new(T.MinValue); - // Unroll the loop to sum 4 vectors per iteration. This reduces range check - // and overflow check frequency, allows us to eliminate move operations swapping - // accumulators, and may have pipelining benefits. + // Unroll the loop to sum 4 vectors per iteration. This allows us to eliminate + // move operations swapping accumulators, and may have pipelining benefits. 
while (span.Length >= Vector.Count * 4) { // Switch accumulators with each step to avoid an additional move operation Vector data = Vector.Create(span); Vector accumulator2 = accumulator + data; - Vector overflowTracking = (accumulator2 ^ accumulator) & (accumulator2 ^ data); + overflowTracking |= (accumulator2 ^ accumulator) & (accumulator2 ^ data); data = Vector.Create(span.Slice(Vector.Count)); accumulator = accumulator2 + data; @@ -135,33 +136,23 @@ private static T SumSignedIntegersVectorized(ReadOnlySpan span) accumulator = accumulator2 + data; overflowTracking |= (accumulator ^ accumulator2) & (accumulator ^ data); - if ((overflowTracking & overflowTestVector) != Vector.Zero) - { - ThrowHelper.ThrowOverflowException(); - } - span = span.Slice(Vector.Count * 4); } // Process remaining vectors, if any, without unrolling - if (span.Length >= Vector.Count) + while (span.Length >= Vector.Count) { - Vector overflowTracking = Vector.Zero; - - do - { - Vector data = Vector.Create(span); - Vector accumulator2 = accumulator + data; - overflowTracking |= (accumulator2 ^ accumulator) & (accumulator2 ^ data); - accumulator = accumulator2; + Vector data = Vector.Create(span); + Vector accumulator2 = accumulator + data; + overflowTracking |= (accumulator2 ^ accumulator) & (accumulator2 ^ data); + accumulator = accumulator2; - span = span.Slice(Vector.Count); - } while (span.Length >= Vector.Count); + span = span.Slice(Vector.Count); + } - if ((overflowTracking & overflowTestVector) != Vector.Zero) - { - ThrowHelper.ThrowOverflowException(); - } + if ((overflowTracking & overflowTestVector) != Vector.Zero) + { + ThrowHelper.ThrowOverflowException(); } // Add the elements in the vector horizontally. 
From 2261c361d046e62297417d6c2444a1cb0baba0be Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 26 Apr 2026 22:44:19 +0200 Subject: [PATCH 3/4] test --- src/coreclr/jit/inline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/inline.h b/src/coreclr/jit/inline.h index 83d74587789366..36bbd9946b1d87 100644 --- a/src/coreclr/jit/inline.h +++ b/src/coreclr/jit/inline.h @@ -989,7 +989,7 @@ class InlineStrategy // Maximum number of over-budget [Intrinsic]-type inlines allowed per root method. enum { - MAX_OVER_BUDGET_INTRINSIC_INLINES = 50 + MAX_OVER_BUDGET_INTRINSIC_INLINES = 100 }; // Number of over-budget inlines admitted because the callee was on an [Intrinsic] type. From f72c7b8800415319585a1c4be1291beef55f2d42 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sun, 26 Apr 2026 22:45:42 +0200 Subject: [PATCH 4/4] Update inline.h --- src/coreclr/jit/inline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/inline.h b/src/coreclr/jit/inline.h index 36bbd9946b1d87..83d74587789366 100644 --- a/src/coreclr/jit/inline.h +++ b/src/coreclr/jit/inline.h @@ -989,7 +989,7 @@ class InlineStrategy // Maximum number of over-budget [Intrinsic]-type inlines allowed per root method. enum { - MAX_OVER_BUDGET_INTRINSIC_INLINES = 100 + MAX_OVER_BUDGET_INTRINSIC_INLINES = 50 }; // Number of over-budget inlines admitted because the callee was on an [Intrinsic] type.