From d70bfc1361b51ad66bb6145f98a29c02f7fdba81 Mon Sep 17 00:00:00 2001
From: Clinton Ingram <clinton.ingram@outlook.com>
Date: Wed, 4 Mar 2026 13:27:01 -0800
Subject: [PATCH 1/6] improve Adler32 vectorization

---
 .../src/System.IO.Hashing.csproj              |   1 +
 .../src/System/IO/Hashing/Adler32.cs          | 268 +----------
 .../src/System/IO/Hashing/Adler32Simd.cs      | 415 ++++++++++++++++++
 .../System.IO.Hashing/tests/Adler32Tests.cs   |  54 ++-
 4 files changed, 445 insertions(+), 293 deletions(-)
 create mode 100644 src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
diff --git a/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj b/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj
index 06555e0e92fc41..29cc99349682d0 100644
--- a/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj
+++ b/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj
@@ -35,6 +35,7 @@ System.IO.Hashing.XxHash32</PackageDescription>
   </ItemGroup>
 
   <ItemGroup Condition="'$(TargetFrameworkIdentifier)' == '.NETCoreApp'">
+    <Compile Include="System\IO\Hashing\Adler32Simd.cs" />
     <Compile Include="System\IO\Hashing\Crc32.Arm.cs" />
     <Compile Include="System\IO\Hashing\Crc32.Vectorized.cs" />
     <Compile Include="System\IO\Hashing\Crc64.Vectorized.cs" />
diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs
index da54cdb3372b19..e6e9956d56c190 100644
--- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs
@@ -5,11 +5,6 @@
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-#if NET
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.Arm;
-using System.Runtime.Intrinsics.X86;
-#endif
 
 namespace System.IO.Hashing
 {
@@ -30,7 +25,7 @@ public sealed partial class Adler32 : NonCryptographicHashAlgorithm
         private uint _adler = InitialState;
 
         /// <summary>Largest prime smaller than 65536.</summary>
-        private const uint ModBase = 65521;
+        internal const uint ModBase = 65521;
         /// <summary>NMax is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) &lt;= 2^32-1</summary>
         private const int NMax = 5552;
 
@@ -192,21 +187,9 @@ private static uint Update(uint adler, ReadOnlySpan<byte> source)
             }
 
 #if NET
-            if (BitConverter.IsLittleEndian &&
-                Vector128.IsHardwareAccelerated &&
-                source.Length >= Vector128<byte>.Count * 2)
+            if (IsVectorizable(source))
             {
-                if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && source.Length >= Vector512<byte>.Count)
-                {
-                    return UpdateVector512(adler, source);
-                }
-
-                if (Vector256.IsHardwareAccelerated && Avx2.IsSupported && source.Length >= Vector256<byte>.Count)
-                {
-                    return UpdateVector256(adler, source);
-                }
-
-                return UpdateVector128(adler, source);
+                return UpdateVectorized(adler, source);
             }
 #endif
 
@@ -236,250 +219,5 @@ private static uint UpdateScalar(uint adler, ReadOnlySpan<byte> source)
 
             return (s2 << 16) | s1;
         }
-
-#if NET
-        [MethodImpl(MethodImplOptions.NoInlining)]
-        private static uint UpdateVector128(uint adler, ReadOnlySpan<byte> source)
-        {
-            Debug.Assert(source.Length >= Vector128<byte>.Count * 2);
-
-            const int BlockSize = 32; // two Vector128<byte> loads
-
-            uint s1 = adler & 0xFFFF;
-            uint s2 = (adler >> 16) & 0xFFFF;
-
-            ref byte sourceRef = ref MemoryMarshal.GetReference(source);
-            int length = source.Length;
-
-            Vector128<sbyte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
-            Vector128<sbyte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
-
-            do
-            {
-                int n = Math.Min(length, NMax);
-                int blocks = n / BlockSize;
-                n = blocks * BlockSize;
-                length -= n;
-
-                Vector128<uint> vs1 = Vector128<uint>.Zero;
-                Vector128<uint> vs2 = Vector128.CreateScalar(s2);
-                Vector128<uint> vps = Vector128.CreateScalar(s1 * (uint)blocks);
-
-                do
-                {
-                    Vector128<byte> bytes1 = Vector128.LoadUnsafe(ref sourceRef);
-                    Vector128<byte> bytes2 = Vector128.LoadUnsafe(ref sourceRef, 16);
-                    sourceRef = ref Unsafe.Add(ref sourceRef, BlockSize);
-
-                    vps += vs1;
-
-                    if (Ssse3.IsSupported)
-                    {
-                        vs1 += Sse2.SumAbsoluteDifferences(bytes1, Vector128<byte>.Zero).AsUInt32();
-                        vs1 += Sse2.SumAbsoluteDifferences(bytes2, Vector128<byte>.Zero).AsUInt32();
-
-                        vs2 += Sse2.MultiplyAddAdjacent(Ssse3.MultiplyAddAdjacent(bytes1, tap1), Vector128<short>.One).AsUInt32();
-                        vs2 += Sse2.MultiplyAddAdjacent(Ssse3.MultiplyAddAdjacent(bytes2, tap2), Vector128<short>.One).AsUInt32();
-                    }
-                    else if (AdvSimd.IsSupported)
-                    {
-                        // Widening byte sum (equivalent of SumAbsoluteDifferences against zero)
-                        vs1 = AdvSimd.AddPairwiseWideningAndAdd(
-                            vs1,
-                            AdvSimd.AddPairwiseWideningAndAdd(
-                                AdvSimd.AddPairwiseWidening(bytes1),
-                                bytes2));
-
-                        // Widening multiply + horizontal add (equivalent of MultiplyAddAdjacent chain).
-                        // Because weights are all positive (1-32), unsigned byte * unsigned byte multiply is valid.
-                        Vector128<ushort> wprod1 = AdvSimd.MultiplyWideningLower(bytes1.GetLower(), tap1.AsByte().GetLower());
-                        wprod1 = AdvSimd.MultiplyWideningUpperAndAdd(wprod1, bytes1, tap1.AsByte());
-                        vs2 = AdvSimd.AddPairwiseWideningAndAdd(vs2, wprod1);
-
-                        Vector128<ushort> wprod2 = AdvSimd.MultiplyWideningLower(bytes2.GetLower(), tap2.AsByte().GetLower());
-                        wprod2 = AdvSimd.MultiplyWideningUpperAndAdd(wprod2, bytes2, tap2.AsByte());
-                        vs2 = AdvSimd.AddPairwiseWideningAndAdd(vs2, wprod2);
-                    }
-                    else
-                    {
-                        (Vector128<ushort> lo1, Vector128<ushort> hi1) = Vector128.Widen(bytes1);
-                        (Vector128<ushort> lo2, Vector128<ushort> hi2) = Vector128.Widen(bytes2);
-                        (Vector128<uint> sumLo, Vector128<uint> sumHi) = Vector128.Widen(lo1 + hi1 + lo2 + hi2);
-                        vs1 += sumLo + sumHi;
-                        vs2 += WeightedSumWidening128(bytes1, tap1) + WeightedSumWidening128(bytes2, tap2);
-
-                        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-                        static Vector128<uint> WeightedSumWidening128(Vector128<byte> data, Vector128<sbyte> weights)
-                        {
-                            (Vector128<ushort> dLo, Vector128<ushort> dHi) = Vector128.Widen(data);
-                            (Vector128<short> wLo, Vector128<short> wHi) = Vector128.Widen(weights);
-
-                            (Vector128<int> pLo1, Vector128<int> pHi1) = Vector128.Widen(dLo.AsInt16() * wLo);
-                            (Vector128<int> pLo2, Vector128<int> pHi2) = Vector128.Widen(dHi.AsInt16() * wHi);
-
-                            return (pLo1 + pHi1 + pLo2 + pHi2).AsUInt32();
-                        }
-                    }
-                }
-                while (--blocks > 0);
-
-                vs2 += vps << 5;
-
-                s1 += Vector128.Sum(vs1);
-                s2 = Vector128.Sum(vs2);
-
-                s1 %= ModBase;
-                s2 %= ModBase;
-            }
-            while (length >= BlockSize);
-
-            if (length > 0)
-            {
-                UpdateScalarTail(ref sourceRef, length, ref s1, ref s2);
-            }
-
-            return (s2 << 16) | s1;
-        }
-
-        [MethodImpl(MethodImplOptions.NoInlining)]
-        private static uint UpdateVector256(uint adler, ReadOnlySpan<byte> source)
-        {
-            Debug.Assert(source.Length >= Vector256<byte>.Count);
-
-            const int BlockSize = 32;
-
-            uint s1 = adler & 0xFFFF;
-            uint s2 = (adler >> 16) & 0xFFFF;
-
-            ref byte sourceRef = ref MemoryMarshal.GetReference(source);
-            int length = source.Length;
-
-            Vector256<sbyte> weights = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
-
-            do
-            {
-                int n = Math.Min(length, NMax);
-                int blocks = n / BlockSize;
-                n = blocks * BlockSize;
-                length -= n;
-
-                Vector256<uint> vs1 = Vector256.CreateScalar(s1);
-                Vector256<uint> vs2 = Vector256.CreateScalar(s2);
-                Vector256<uint> vs3 = Vector256<uint>.Zero;
-
-                do
-                {
-                    Vector256<byte> data = Vector256.LoadUnsafe(ref sourceRef);
-                    sourceRef = ref Unsafe.Add(ref sourceRef, BlockSize);
-
-                    Vector256<uint> vs1_0 = vs1;
-                    vs1 += Avx2.SumAbsoluteDifferences(data, Vector256<byte>.Zero).AsUInt32();
-                    vs3 += vs1_0;
-
-                    Vector256<short> mad = Avx2.MultiplyAddAdjacent(data, weights);
-                    vs2 += Avx2.MultiplyAddAdjacent(mad, Vector256<short>.One).AsUInt32();
-                }
-                while (--blocks > 0);
-
-                vs3 <<= 5;
-                vs2 += vs3;
-
-                s1 = (uint)Vector256.Sum(vs1.AsUInt64()); // SumAbsoluteDifferences stores the results in the even lanes
-                s2 = Vector256.Sum(vs2);
-
-                s1 %= ModBase;
-                s2 %= ModBase;
-            }
-            while (length >= BlockSize);
-
-            if (length > 0)
-            {
-                UpdateScalarTail(ref sourceRef, length, ref s1, ref s2);
-            }
-
-            return (s2 << 16) | s1;
-        }
-
-        [MethodImpl(MethodImplOptions.NoInlining)]
-        private static uint UpdateVector512(uint adler, ReadOnlySpan<byte> source)
-        {
-            Debug.Assert(source.Length >= Vector512<byte>.Count);
-
-            const int BlockSize = 64;
-
-            uint s1 = adler & 0xFFFF;
-            uint s2 = (adler >> 16) & 0xFFFF;
-
-            ref byte sourceRef = ref MemoryMarshal.GetReference(source);
-            int length = source.Length;
-
-            Vector512<sbyte> weights = Vector512.Create(
-                32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
-                32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
-
-            do
-            {
-                int n = Math.Min(length, NMax);
-                int blocks = n / BlockSize;
-                n = blocks * BlockSize;
-                length -= n;
-
-                Vector512<uint> vs1 = Vector512.CreateScalar(s1);
-                Vector512<uint> vs2 = Vector512.CreateScalar(s2);
-                Vector512<uint> vs3 = Vector512<uint>.Zero;
-
-                do
-                {
-                    Vector512<byte> data = Vector512.LoadUnsafe(ref sourceRef);
-                    sourceRef = ref Unsafe.Add(ref sourceRef, BlockSize);
-
-                    Vector512<uint> vs1_0 = vs1;
-                    vs1 += Avx512BW.SumAbsoluteDifferences(data, Vector512<byte>.Zero).AsUInt32();
-                    vs3 += vs1_0;
-                    vs2 += Avx512BW.MultiplyAddAdjacent(Avx512BW.MultiplyAddAdjacent(data, weights), Vector512<short>.One).AsUInt32();
-
-                    Vector256<uint> sumLo = Avx2.SumAbsoluteDifferences(data.GetLower(), Vector256<byte>.Zero).AsUInt32();
-                    vs2 += Vector512.Create(sumLo << 5, Vector256<uint>.Zero);
-                }
-                while (--blocks > 0);
-
-                vs3 <<= 6;
-                vs2 += vs3;
-
-                s1 = (uint)Vector512.Sum(vs1.AsUInt64());
-                s2 = Vector512.Sum(vs2);
-
-                s1 %= ModBase;
-                s2 %= ModBase;
-            }
-            while (length >= BlockSize);
-
-            if (length >= Vector256<byte>.Count)
-            {
-                return UpdateVector256((s2 << 16) | s1, MemoryMarshal.CreateReadOnlySpan(ref sourceRef, length));
-            }
-
-            if (length > 0)
-            {
-                UpdateScalarTail(ref sourceRef, length, ref s1, ref s2);
-            }
-
-            return (s2 << 16) | s1;
-        }
-
-        private static void UpdateScalarTail(ref byte sourceRef, int length, ref uint s1, ref uint s2)
-        {
-            Debug.Assert(length is > 0 and < NMax);
-
-            foreach (byte b in MemoryMarshal.CreateReadOnlySpan(ref sourceRef, length))
-            {
-                s1 += b;
-                s2 += s1;
-            }
-
-            s1 %= ModBase;
-            s2 %= ModBase;
-        }
-#endif
     }
 }
diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
new file mode 100644
index 00000000000000..b96ee4404497bb
--- /dev/null
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
@@ -0,0 +1,415 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
+
+namespace System.IO.Hashing;
+
+public sealed partial class Adler32 // SIMD entry points; this file is only compiled for .NETCoreApp targets (see the csproj item group)
+{
+    private static bool IsVectorizable(ReadOnlySpan<byte> source) // true when SIMD is accelerated and at least one full 16-byte vector is present
+        => Vector128.IsHardwareAccelerated && source.Length >= Vector128<byte>.Count;
+
+    private static uint UpdateVectorized(uint adler, ReadOnlySpan<byte> source) // thin forwarder to the file-local implementation
+        => Adler32Simd.UpdateVectorized(adler, source);
+}
+
+file static class Adler32Simd // picks an ISA-specific strategy and runs the shared vectorized Adler-32 update
+{
+    // VMax is the maximum number of 16-byte vectors we process before reducing mod 65521.
+    // It is analogous to NMax in the scalar code; however, because the accumulated values
+    // are distributed across vector elements, more bytes can be processed before any
+    // individual element can overflow. For this implementation the true limit is 460
+    // vectors, but we choose 448 because it divides evenly by any reasonable block size.
+    public const uint VMax = 448;
+
+    private static ReadOnlySpan<byte> MaskBytes => [ // 16 zero bytes then 16 0xff bytes; a load at offset len yields (16 - len) zero lanes followed by len set lanes
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+    ];
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static uint UpdateVectorized(uint adler, ReadOnlySpan<byte> source) // dispatches on ISA support; each branch instantiates UpdateCore with matching helpers
+    {
+        if (Vector256.IsHardwareAccelerated && Avx2.IsSupported)
+        {
+            return UpdateCore<AdlerVector256, AccumulateX86, DotProductX86>(adler, source); // only combination that uses the 256-bit loop
+        }
+
+        if (Ssse3.IsSupported)
+        {
+            return UpdateCore<AdlerVector128, AccumulateX86, DotProductX86>(adler, source);
+        }
+
+        if (AdvSimd.Arm64.IsSupported)
+        {
+            if (Dp.IsSupported)
+            {
+                return UpdateCore<AdlerVector128, AccumulateArm64, DotProductArm64Dp>(adler, source);
+            }
+
+            return UpdateCore<AdlerVector128, AccumulateArm64, DotProductArm64>(adler, source);
+        }
+
+        return UpdateCore<AdlerVector128, AccumulateXplat, DotProductXplat>(adler, source); // portable fallback built on Vector128.Widen
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    private static uint UpdateCore<TSimdStrategy, TAccumulate, TDotProduct>(uint adler, ReadOnlySpan<byte> source)
+        where TSimdStrategy : struct, ISimdStrategy
+        where TAccumulate : struct, ISimdAccumulate
+        where TDotProduct : struct, ISimdDotProduct
+    {
+        Debug.Assert(source.Length >= Vector128<byte>.Count); // the tail below re-reads the final 16 bytes, so at least 16 must exist
+
+        uint s1 = (ushort)adler; // low 16 bits of the checksum
+        uint s2 = adler >>> 16; // high 16 bits of the checksum
+
+        ref byte bufRef = ref MemoryMarshal.GetReference(source);
+        uint len = (uint)source.Length;
+
+        uint vectors = len / (uint)Vector128<byte>.Count; // number of whole 16-byte vectors
+        uint loopVectors = vectors & ~1u; // even count handed to the strategy loop
+        len -= vectors * (uint)Vector128<byte>.Count; // from here on, len is the leftover byte count (< 16)
+
+        Vector128<uint> vs1 = Vector128.CreateScalar(s1); // incoming state goes in lane 0, remaining lanes are zero
+        Vector128<uint> vs2 = Vector128.CreateScalar(s2);
+
+        if (loopVectors != 0)
+        {
+            (vs1, vs2) = TSimdStrategy.VectorLoop<TAccumulate, TDotProduct>(vs1, vs2, ref bufRef, loopVectors);
+            bufRef = ref Unsafe.Add(ref bufRef, loopVectors * (uint)Vector128<byte>.Count); // the callee only rebinds its own ref parameter, so advance here
+        }
+
+        Vector128<byte> weights = Vector128.CreateSequence((byte)16, unchecked((byte)-1)); // 16, 15, ..., 1 (step of -1 expressed as a byte)
+        Vector128<uint> vps;
+
+        if ((vectors & 1) != 0) // one odd 16-byte vector remains after the paired loop
+        {
+            Vector128<byte> bytes = Vector128.LoadUnsafe(ref bufRef);
+            bufRef = ref Unsafe.Add(ref bufRef, (uint)Vector128<byte>.Count);
+
+            vps = vs1; // s1 as it was before this vector
+
+            vs1 = TAccumulate.Accumulate(vs1, bytes);
+            vs2 = TDotProduct.DotProduct(vs2, bytes, weights);
+
+            vs2 += vps << 4; // each of the 16 bytes adds the previous s1 to s2 once
+        }
+
+        if (len != 0)
+        {
+            Debug.Assert(len < (uint)Vector128<byte>.Count);
+
+            Vector128<byte> bytes = Vector128.LoadUnsafe(ref Unsafe.Subtract(ref bufRef, (uint)Vector128<byte>.Count - len)); // back up so the load ends exactly at the end of the input
+            bytes &= Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(MaskBytes), len); // zero the leading lanes that overlap already-processed bytes
+
+            vps = vs1;
+
+            vs1 = TAccumulate.Accumulate(vs1, bytes);
+            vs2 = TDotProduct.DotProduct(vs2, bytes, weights); // surviving lanes sit at the end, so they line up with weights len..1
+
+            vs2 += vps * Vector128.Create(len); // only len real bytes were added, so the previous s1 counts len times
+        }
+
+        s1 = Vector128.Sum(vs1) % Adler32.ModBase; // the only full modulo: fold lanes, then reduce
+        s2 = Vector128.Sum(vs2) % Adler32.ModBase;
+
+        return s1 | (s2 << 16);
+    }
+}
+
+file struct AdlerVector128 : ISimdStrategy // 128-bit main loop: two 16-byte vectors (32 bytes) per iteration
+{
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    private static Vector128<uint> QuickModBase(Vector128<uint> values)
+    {
+        // Calculating the residual mod 65521 is impractical in SIMD; however, we can reduce by
+        // enough to prevent overflow without changing the final result of a modulo performed later.
+        // Essentially, the high word of the accumulator counts how many multiples of 65536
+        // the value contains.
+        //
+        // 65536 % 65521 = 15, which is what each such multiple carries forward into the low word.
+        // We can simply multiply the high word by 15 and add that to the low word to perform
+        // the reduction, resulting in a maximum possible residual of 0xFFFF0.
+        //
+        // This is further optimized to: `high * 16 - high + low`
+        // and implemented as: `(high << 4) - high + low`.
+
+        Vector128<uint> vlo = values & (Vector128<uint>.AllBitsSet >>> 16); // mask 0x0000FFFF per lane
+        Vector128<uint> vhi = values >>> 16;
+        return (vhi << 4) - vhi + vlo;
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate, TDotProduct>(Vector128<uint> vs1, Vector128<uint> vs2, ref byte sourceRef, uint vectors)
+        where TAccumulate : struct, ISimdAccumulate
+        where TDotProduct : struct, ISimdDotProduct
+    {
+        const uint blockSize = 2; // 16-byte vectors consumed per inner iteration
+
+        Vector128<byte> weights1 = Vector128.CreateSequence((byte)32, unchecked((byte)-1)); // 32, 31, ..., 17 for the first vector of a block
+        Vector128<byte> weights2 = Vector128.CreateSequence((byte)16, unchecked((byte)-1)); // 16, 15, ..., 1 for the second vector
+
+        while (vectors >= blockSize)
+        {
+            Vector128<uint> vs3 = default; // separate accumulator for the second vector's dot products
+            Vector128<uint> vps = default; // running sum of vs1 as seen before each block
+
+            uint blocks = uint.Min(vectors, Adler32Simd.VMax) / blockSize; // at most VMax vectors between reductions
+            vectors -= blocks * blockSize;
+
+            do
+            {
+                Vector128<byte> bytes1 = Vector128.LoadUnsafe(ref sourceRef);
+                Vector128<byte> bytes2 = Vector128.LoadUnsafe(ref sourceRef, (uint)Vector128<byte>.Count);
+                sourceRef = ref Unsafe.Add(ref sourceRef, (uint)Vector128<byte>.Count * 2); // rebinds this method's ref only; the caller advances its own
+
+                vps += vs1; // must read vs1 before this block's bytes are added
+
+                vs1 = TAccumulate.Accumulate(vs1, bytes1, bytes2);
+
+                vs2 = TDotProduct.DotProduct(vs2, bytes1, weights1);
+                vs3 = TDotProduct.DotProduct(vs3, bytes2, weights2);
+            }
+            while (--blocks != 0);
+
+            vs2 += vs3;
+            vs2 += vps << 5; // each block of 32 bytes adds its pre-block s1 to s2 32 times
+
+            vs1 = QuickModBase(vs1); // partial reduction keeps lanes from overflowing in the next pass
+            vs2 = QuickModBase(vs2);
+        }
+
+        return (vs1, vs2);
+    }
+}
+
+file struct AdlerVector256 : ISimdStrategy // 256-bit main loop; UpdateVectorized pairs it only with the x86 (AVX2) helpers
+{
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<uint> QuickModBase(Vector256<uint> values) // partial reduction: high * 15 + low is congruent mod 65521 because 65536 % 65521 == 15
+    {
+        Vector256<uint> vlo = values & (Vector256<uint>.AllBitsSet >>> 16); // mask 0x0000FFFF per lane
+        Vector256<uint> vhi = values >>> 16;
+        return (vhi << 4) - vhi + vlo; // (high << 4) - high == high * 15
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate, TDotProduct>(Vector128<uint> vs1, Vector128<uint> vs2, ref byte sourceRef, uint vectors)
+        where TAccumulate : struct, ISimdAccumulate
+        where TDotProduct : struct, ISimdDotProduct
+    {
+        const uint blockSize = 4; // counted in 16-byte vectors: two 32-byte loads per inner iteration
+
+        Vector256<byte> weights1 = Vector256.CreateSequence((byte)64, unchecked((byte)-1)); // 64, 63, ..., 33 for the first 32 bytes of a block
+        Vector256<byte> weights2 = Vector256.CreateSequence((byte)32, unchecked((byte)-1)); // 32, 31, ..., 1 for the second 32 bytes
+
+        Vector256<uint> ws1 = vs1.ToVector256(); // zero-extend: the upper lanes are accumulated and folded back in below, so they must start at 0
+        Vector256<uint> ws2 = vs2.ToVector256(); // ToVector256Unsafe would leave the upper 128 bits undefined and could corrupt the sums
+
+        while (vectors >= blockSize)
+        {
+            Vector256<uint> ws3 = default; // separate accumulator for the second load's dot products
+            Vector256<uint> wps = default; // running sum of ws1 as seen before each block
+
+            uint blocks = uint.Min(vectors, Adler32Simd.VMax) / blockSize; // at most VMax 16-byte vectors between reductions
+            vectors -= blocks * blockSize;
+
+            do
+            {
+                Vector256<byte> bytes1 = Vector256.LoadUnsafe(ref sourceRef);
+                Vector256<byte> bytes2 = Vector256.LoadUnsafe(ref sourceRef, (uint)Vector256<byte>.Count);
+                sourceRef = ref Unsafe.Add(ref sourceRef, (uint)Vector256<byte>.Count * 2); // rebinds this method's ref only; the caller advances its own
+
+                wps += ws1; // must read ws1 before this block's bytes are added
+
+                ws1 = TAccumulate.Accumulate(ws1, bytes1, bytes2);
+
+                ws2 = TDotProduct.DotProduct(ws2, bytes1, weights1);
+                ws3 = TDotProduct.DotProduct(ws3, bytes2, weights2);
+            }
+            while (--blocks != 0);
+
+            ws2 += ws3;
+            ws2 += wps << 6; // each block of 64 bytes adds its pre-block s1 to s2 64 times
+
+            ws1 = QuickModBase(ws1); // partial reduction keeps lanes from overflowing in the next pass
+            ws2 = QuickModBase(ws2);
+        }
+
+        if ((vectors & 2) != 0) // the caller passes an even count, so the only possible remainder is one 32-byte load
+        {
+            Vector256<byte> bytes = Vector256.LoadUnsafe(ref sourceRef);
+            sourceRef = ref Unsafe.Add(ref sourceRef, (uint)Vector256<byte>.Count);
+
+            Vector256<uint> w_ps = ws1;
+
+            ws1 = TAccumulate.Accumulate(ws1, bytes);
+            ws2 = TDotProduct.DotProduct(ws2, bytes, weights2);
+
+            ws2 += w_ps << 5; // 32 bytes: the previous s1 counts 32 times
+        }
+
+        vs1 = ws1.GetLower() + ws1.GetUpper(); // fold the 256-bit accumulators back to 128 bits for the caller
+        vs2 = ws2.GetLower() + ws2.GetUpper();
+
+        return (vs1, vs2);
+    }
+}
+
+file struct AccumulateX86 : ISimdAccumulate // byte-sum accumulation for SSE2/AVX2 using sum-of-absolute-differences against zero
+{
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes)
+        => Sse2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums; // SAD against zero acts as a widening byte sum
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes)
+        => Avx2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums; // 256-bit form of the same byte sum
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes1, Vector128<byte> bytes2) // adds the byte sums of both inputs to sums
+    {
+        Vector128<byte> zero = default;
+        Vector128<uint> sad = Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32();
+        return sad + Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums;
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2) // 256-bit form of the two-input overload
+    {
+        Vector256<byte> zero = default;
+        Vector256<uint> sad = Avx2.SumAbsoluteDifferences(bytes1, zero).AsUInt32();
+        return sad + Avx2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums;
+    }
+}
+
+file struct AccumulateArm64 : ISimdAccumulate // byte-sum accumulation using AdvSimd widening adds
+{
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes)
+        => AdvSimd.Arm64.AddAcrossWidening(bytes).AsUInt32().ToVector128() + sums; // zero-extend: ToVector128Unsafe leaves the upper 64 bits undefined, which could corrupt lanes 2-3 of sums
+
+    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes) // never reached: UpdateVectorized uses the 256-bit loop only with AccumulateX86
+        => throw new NotImplementedException();
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes1, Vector128<byte> bytes2) // pairwise-widen bytes1, add bytes2 pairwise, then widen-add into sums
+        => AdvSimd.AddPairwiseWideningAndAdd(sums, AdvSimd.AddPairwiseWideningAndAdd(AdvSimd.AddPairwiseWidening(bytes1), bytes2));
+
+    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2) // never reached, see the single-input 256-bit overload
+        => throw new NotImplementedException();
+}
+
+file struct AccumulateXplat : ISimdAccumulate // portable byte-sum accumulation built on Vector128.Widen
+{
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes)
+    {
+        (Vector128<ushort> bl, Vector128<ushort> bh) = Vector128.Widen(bytes); // bytes -> 16-bit lanes
+        (Vector128<uint> sl, Vector128<uint> sh) = Vector128.Widen(bl + bh); // add the halves at 16 bits, then widen to 32 bits
+        return sums + sl + sh;
+    }
+
+    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes) // never reached: UpdateVectorized uses the 256-bit loop only with AccumulateX86
+        => throw new NotImplementedException();
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes1, Vector128<byte> bytes2)
+    {
+        (Vector128<ushort> b1l, Vector128<ushort> b1h) = Vector128.Widen(bytes1);
+        (Vector128<ushort> b2l, Vector128<ushort> b2h) = Vector128.Widen(bytes2);
+        (Vector128<uint> sl, Vector128<uint> sh) = Vector128.Widen(b1l + b1h + b2l + b2h); // four bytes per 16-bit lane (at most 4 * 255), so no 16-bit overflow
+        return sums + sl + sh;
+    }
+
+    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2) // never reached, see the single-input 256-bit overload
+        => throw new NotImplementedException();
+}
+
+file struct DotProductX86 : ISimdDotProduct // weighted byte sum via two chained MultiplyAddAdjacent steps
+{
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte> left, Vector128<byte> right)
+    {
+        Vector128<short> mad = Ssse3.MultiplyAddAdjacent(left, right.AsSByte()); // byte * weight products, adjacent pairs summed into 16-bit lanes
+        return Sse2.MultiplyAddAdjacent(mad, Vector128<short>.One).AsUInt32() + addend; // multiply by 1 to sum adjacent 16-bit lanes into 32-bit lanes
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right)
+    {
+        Vector256<short> mad = Avx2.MultiplyAddAdjacent(left, right.AsSByte()); // 256-bit form of the same two-step reduction
+        return Avx2.MultiplyAddAdjacent(mad, Vector256<short>.One).AsUInt32() + addend;
+    }
+}
+
+file struct DotProductArm64 : ISimdDotProduct // weighted byte sum via AdvSimd widening multiplies (used when Dp is not supported)
+{
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte> left, Vector128<byte> right)
+    {
+        Vector128<ushort> mad = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower()); // 16-bit products of the lower 8 byte pairs
+        mad = AdvSimd.MultiplyWideningUpperAndAdd(mad, left, right); // add the 16-bit products of the upper 8 byte pairs
+        return AdvSimd.AddPairwiseWideningAndAdd(addend, mad); // sum adjacent 16-bit lanes into the 32-bit accumulator
+    }
+
+    public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right) // never reached: UpdateVectorized uses the 256-bit loop only with DotProductX86
+        => throw new NotImplementedException();
+}
+
+file struct DotProductArm64Dp : ISimdDotProduct // weighted byte sum via the Arm Dp intrinsic; chosen only when Dp.IsSupported
+{
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte> left, Vector128<byte> right) // delegates the multiply-and-accumulate to Dp.DotProduct
+        => Dp.DotProduct(addend, left, right);
+
+    public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right) // never reached: UpdateVectorized uses the 256-bit loop only with DotProductX86
+        => throw new NotImplementedException();
+}
+
+file struct DotProductXplat : ISimdDotProduct // portable weighted byte sum built on Vector128.Widen
+{
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte> left, Vector128<byte> right)
+    {
+        (Vector128<ushort> ll, Vector128<ushort> lh) = Vector128.Widen(left); // data bytes -> 16-bit lanes
+        (Vector128<ushort> rl, Vector128<ushort> rh) = Vector128.Widen(right); // weights -> 16-bit lanes
+        (Vector128<uint> ml, Vector128<uint> mh) = Vector128.Widen(ll * rl + lh * rh); // multiply and add at 16 bits (fits while weights are <= 32, as used by the 128-bit callers), then widen
+        return addend + ml + mh;
+    }
+
+    public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right) // never reached: UpdateVectorized uses the 256-bit loop only with DotProductX86
+        => throw new NotImplementedException();
+}
+
+file interface ISimdAccumulate // static-abstract contract: add the bytes of the input vector(s) into the running s1 accumulator
+{
+    static abstract Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes); // one 16-byte vector
+
+    static abstract Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes); // one 32-byte vector; only the x86 implementation provides a real body
+
+    static abstract Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes1, Vector128<byte> bytes2); // two 16-byte vectors at once
+
+    static abstract Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2); // two 32-byte vectors; only the x86 implementation provides a real body
+}
+
+file interface ISimdDotProduct // static-abstract contract: add the products of left (data) and right (weights) into addend
+{
+    static abstract Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte> left, Vector128<byte> right); // 128-bit form, implemented by every strategy
+
+    static abstract Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right); // 256-bit form; only the x86 implementation provides a real body
+}
+
+file interface ISimdStrategy // static-abstract contract for the main loop; state enters and leaves as 128-bit accumulators
+{
+    static abstract (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate, TDotProduct>(Vector128<uint> vs1, Vector128<uint> vs2, ref byte sourceRef, uint vectors) // vectors counts 16-byte units; UpdateCore passes an even, non-zero count
+        where TAccumulate : struct, ISimdAccumulate
+        where TDotProduct : struct, ISimdDotProduct;
+}
diff --git a/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs b/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs
index 12ec92693d3d51..3e905139b09db8 100644
--- a/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs
+++ b/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs
@@ -152,19 +152,31 @@ public void VerifyHashToUInt32(TestCase testCase)
         }
 
         [Theory]
-        [InlineData(5553, 0xAA40476Bu)]
-        [InlineData(11104, 0xA2778E87u)]
+        [InlineData(5553, 0x62C69C89u)]
+        [InlineData(11104, 0xA8AE3724u)]
         public void LargeInput_ExceedsNMax(int length, uint expected)
         {
-            byte[] data = new byte[length];
-            for (int i = 0; i < data.Length; i++)
-            {
-                data[i] = (byte)('a' + (i % 26));
-            }
-
-            Assert.Equal(expected, Adler32.HashToUInt32(data));
+            // This test ensures that Adler32 optimizations involving delayed modulo
+            // do not overflow a 32-bit intermediate at any point.
 
             var alg = new Adler32();
+
+            // The maximum possible value of an Adler32 checksum is 0xFFF0FFF0,
+            // which has both components just below the modulo value (0xFFF0 == 65520).
+            // A sequence of 65519 ones will generate this value.
+
+            byte[] primer = new byte[65519];
+            primer.AsSpan().Fill(1);
+
+            alg.Append(primer);
+            Assert.Equal(0xFFF0FFF0, alg.GetCurrentHashAsUInt32());
+
+            // Starting from an already-maxed checksum, a stream of 5553 max value
+            // bytes will overflow if not reduced by mod 65521 before the last byte.
+
+            byte[] data = new byte[length];
+            data.AsSpan().Fill(byte.MaxValue);
+
             alg.Append(data);
             Assert.Equal(expected, alg.GetCurrentHashAsUInt32());
         }
@@ -177,9 +189,14 @@ public void LargeInput_ExceedsNMax(int length, uint expected)
         [InlineData(1)]
         [InlineData(2)]
         [InlineData(7)]
+        [InlineData(8)]
+        [InlineData(9)]
         [InlineData(15)]
         [InlineData(16)]
         [InlineData(17)]
+        [InlineData(23)]
+        [InlineData(24)]
+        [InlineData(25)]
         [InlineData(31)]
         [InlineData(32)]
         [InlineData(33)]
@@ -223,25 +240,6 @@ public void VariousLengths_MatchesReference(int length)
             Assert.Equal(expected, alg.GetCurrentHashAsUInt32());
         }
 
-        /// <summary>
-        /// Tests with all-0xFF bytes, which maximizes accumulator values and stresses
-        /// overflow-safe behavior in the vectorized paths.
-        /// </summary>
-        [Theory]
-        [InlineData(32)]
-        [InlineData(64)]
-        [InlineData(128)]
-        [InlineData(256)]
-        [InlineData(5552)]
-        [InlineData(5553)]
-        public void AllMaxBytes_MatchesReference(int length)
-        {
-            byte[] data = new byte[length];
-            data.AsSpan().Fill(0xFF);
-
-            Assert.Equal(ReferenceAdler32(data), Adler32.HashToUInt32(data));
-        }
-
         /// <summary>
         /// Tests incremental appending with various chunk sizes to verify that the
         /// vectorized paths produce the same result regardless of how data is fed in.

From 9735c7e1e9847d2770d01359e8eee0feafe7d581 Mon Sep 17 00:00:00 2001
From: Clinton Ingram <clinton.ingram@outlook.com>
Date: Wed, 4 Mar 2026 13:48:01 -0800
Subject: [PATCH 2/6] Address feedback: unconditional VectorLoop call, reference-based test expectations

---
 .../src/System/IO/Hashing/Adler32Simd.cs         |  9 +++------
 .../System.IO.Hashing/tests/Adler32Tests.cs      | 16 ++++++++++------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
index b96ee4404497bb..4fd61b5c8d6fed 100644
--- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
@@ -80,11 +80,8 @@ private static uint UpdateCore<TSimdStrategy, TAccumulate, TDotProduct>(uint adl
         Vector128<uint> vs1 = Vector128.CreateScalar(s1);
         Vector128<uint> vs2 = Vector128.CreateScalar(s2);
 
-        if (loopVectors != 0)
-        {
-            (vs1, vs2) = TSimdStrategy.VectorLoop<TAccumulate, TDotProduct>(vs1, vs2, ref bufRef, loopVectors);
-            bufRef = ref Unsafe.Add(ref bufRef, loopVectors * (uint)Vector128<byte>.Count);
-        }
+        (vs1, vs2) = TSimdStrategy.VectorLoop<TAccumulate, TDotProduct>(vs1, vs2, ref bufRef, loopVectors);
+        bufRef = ref Unsafe.Add(ref bufRef, loopVectors * (uint)Vector128<byte>.Count);
 
         Vector128<byte> weights = Vector128.CreateSequence((byte)16, unchecked((byte)-1));
         Vector128<uint> vps;
@@ -131,9 +128,9 @@ private static Vector128<uint> QuickModBase(Vector128<uint> values)
     {
         // Calculating the residual mod 65521 is impractical in SIMD, however we can reduce by
         // enough to prevent overflow without changing the final result of a modulo performed later.
+        //
         // Essentially, the high word of the accumulator represents the number of times it has
         // wrapped to 65536.
-        //
         // 65536 % 65521 = 15, which is what would be carried forward from the high word.
         // We can simply multiply the high word by 15 and add that to the low word to perform
         // the reduction, resulting in a maximum possible residual of 0xFFFF0.
diff --git a/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs b/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs
index 3e905139b09db8..bc365c9b2ffeee 100644
--- a/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs
+++ b/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs
@@ -152,9 +152,10 @@ public void VerifyHashToUInt32(TestCase testCase)
         }
 
         [Theory]
-        [InlineData(5553, 0x62C69C89u)]
-        [InlineData(11104, 0xA8AE3724u)]
-        public void LargeInput_ExceedsNMax(int length, uint expected)
+        [InlineData(5553)]
+        [InlineData(11104)]
+        [InlineData(65536)]
+        public void LargeInput_ExceedsNMax(int length)
         {
             // This test ensures that Adler32 optimizations involving delayed modulo
             // do not overflow a 32-bit intermediate at any point.
@@ -167,22 +168,25 @@ public void LargeInput_ExceedsNMax(int length, uint expected)
 
             byte[] primer = new byte[65519];
             primer.AsSpan().Fill(1);
-
             alg.Append(primer);
+
             Assert.Equal(0xFFF0FFF0, alg.GetCurrentHashAsUInt32());
 
             // Starting from an already-maxed checksum, a stream of 5553 max value
             // bytes will overflow if not reduced by mod 65521 before the last byte.
+            // Of course, once overflowed, the result will be incorrect for any larger
+            // input as well.
 
             byte[] data = new byte[length];
             data.AsSpan().Fill(byte.MaxValue);
-
             alg.Append(data);
+
+            uint expected = ReferenceAdler32(data, 0xFFF0FFF0);
             Assert.Equal(expected, alg.GetCurrentHashAsUInt32());
         }
 
         /// <summary>
-        /// Tests a wide variety of lengths to exercise scalar, Vector128, Vector256, and Vector512
+        /// Tests a wide variety of lengths to exercise scalar, Vector128, and Vector256
         /// code paths as well as their transitions and tail handling.
         /// </summary>
         [Theory]

From e379f9f19977ee3667ae01eb8800292c6a3f7568 Mon Sep 17 00:00:00 2001
From: Clinton Ingram <clinton.ingram@outlook.com>
Date: Wed, 4 Mar 2026 19:33:57 -0800
Subject: [PATCH 3/6] Tidy Adler32Simd: scope vps locals, throw UnreachableException in Vector256 stubs

---
 .../src/System/IO/Hashing/Adler32Simd.cs      | 25 ++++++++-----------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
index 4fd61b5c8d6fed..69b142c3d6d786 100644
--- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
@@ -84,14 +84,13 @@ private static uint UpdateCore<TSimdStrategy, TAccumulate, TDotProduct>(uint adl
         bufRef = ref Unsafe.Add(ref bufRef, loopVectors * (uint)Vector128<byte>.Count);
 
         Vector128<byte> weights = Vector128.CreateSequence((byte)16, unchecked((byte)-1));
-        Vector128<uint> vps;
 
         if ((vectors & 1) != 0)
         {
             Vector128<byte> bytes = Vector128.LoadUnsafe(ref bufRef);
             bufRef = ref Unsafe.Add(ref bufRef, (uint)Vector128<byte>.Count);
 
-            vps = vs1;
+            Vector128<uint> vps = vs1;
 
             vs1 = TAccumulate.Accumulate(vs1, bytes);
             vs2 = TDotProduct.DotProduct(vs2, bytes, weights);
@@ -106,7 +105,7 @@ private static uint UpdateCore<TSimdStrategy, TAccumulate, TDotProduct>(uint adl
             Vector128<byte> bytes = Vector128.LoadUnsafe(ref Unsafe.Subtract(ref bufRef, (uint)Vector128<byte>.Count - len));
             bytes &= Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(MaskBytes), len);
 
-            vps = vs1;
+            Vector128<uint> vps = vs1;
 
             vs1 = TAccumulate.Accumulate(vs1, bytes);
             vs2 = TDotProduct.DotProduct(vs2, bytes, weights);
@@ -170,14 +169,13 @@ public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate,
                 vps += vs1;
 
                 vs1 = TAccumulate.Accumulate(vs1, bytes1, bytes2);
-
                 vs2 = TDotProduct.DotProduct(vs2, bytes1, weights1);
                 vs3 = TDotProduct.DotProduct(vs3, bytes2, weights2);
             }
             while (--blocks != 0);
 
-            vs2 += vs3;
             vs2 += vps << 5;
+            vs2 += vs3;
 
             vs1 = QuickModBase(vs1);
             vs2 = QuickModBase(vs2);
@@ -227,14 +225,13 @@ public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate,
                 wps += ws1;
 
                 ws1 = TAccumulate.Accumulate(ws1, bytes1, bytes2);
-
                 ws2 = TDotProduct.DotProduct(ws2, bytes1, weights1);
                 ws3 = TDotProduct.DotProduct(ws3, bytes2, weights2);
             }
             while (--blocks != 0);
 
-            ws2 += ws3;
             ws2 += wps << 6;
+            ws2 += ws3;
 
             ws1 = QuickModBase(ws1);
             ws2 = QuickModBase(ws2);
@@ -294,14 +291,14 @@ public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> b
         => AdvSimd.Arm64.AddAcrossWidening(bytes).AsUInt32().ToVector128Unsafe() + sums;
 
     public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes)
-        => throw new NotImplementedException();
+        => throw new UnreachableException();
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes1, Vector128<byte> bytes2)
         => AdvSimd.AddPairwiseWideningAndAdd(sums, AdvSimd.AddPairwiseWideningAndAdd(AdvSimd.AddPairwiseWidening(bytes1), bytes2));
 
     public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2)
-        => throw new NotImplementedException();
+        => throw new UnreachableException();
 }
 
 file struct AccumulateXplat : ISimdAccumulate
@@ -315,7 +312,7 @@ public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> b
     }
 
     public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes)
-        => throw new NotImplementedException();
+        => throw new UnreachableException();
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes1, Vector128<byte> bytes2)
@@ -327,7 +324,7 @@ public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> b
     }
 
     public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2)
-        => throw new NotImplementedException();
+        => throw new UnreachableException();
 }
 
 file struct DotProductX86 : ISimdDotProduct
@@ -358,7 +355,7 @@ public static Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte>
     }
 
     public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right)
-        => throw new NotImplementedException();
+        => throw new UnreachableException();
 }
 
 file struct DotProductArm64Dp : ISimdDotProduct
@@ -368,7 +365,7 @@ public static Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte>
         => Dp.DotProduct(addend, left, right);
 
     public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right)
-        => throw new NotImplementedException();
+        => throw new UnreachableException();
 }
 
 file struct DotProductXplat : ISimdDotProduct
@@ -383,7 +380,7 @@ public static Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte>
     }
 
     public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right)
-        => throw new NotImplementedException();
+        => throw new UnreachableException();
 }
 
 file interface ISimdAccumulate

From f35db7827522a30f2121b6769b3720cae6a16d54 Mon Sep 17 00:00:00 2001
From: Clinton Ingram <clinton.ingram@outlook.com>
Date: Thu, 5 Mar 2026 09:28:35 -0800
Subject: [PATCH 4/6] Rename w_ps local to wps for consistency

---
 .../System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
index 69b142c3d6d786..a654503cd839d7 100644
--- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
@@ -242,12 +242,12 @@ public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate,
             Vector256<byte> bytes = Vector256.LoadUnsafe(ref sourceRef);
             sourceRef = ref Unsafe.Add(ref sourceRef, (uint)Vector256<byte>.Count);
 
-            Vector256<uint> w_ps = ws1;
+            Vector256<uint> wps = ws1;
 
             ws1 = TAccumulate.Accumulate(ws1, bytes);
             ws2 = TDotProduct.DotProduct(ws2, bytes, weights2);
 
-            ws2 += w_ps << 5;
+            ws2 += wps << 5;
         }
 
         vs1 = ws1.GetLower() + ws1.GetUpper();

From d4110329973d16749fa1b4dd75c1150463862215 Mon Sep 17 00:00:00 2001
From: Clinton Ingram <clinton.ingram@outlook.com>
Date: Fri, 6 Mar 2026 10:47:23 -0800
Subject: [PATCH 5/6] Rename to Adler32.Vectorized.cs, drop Vector256 interface members, add more length asserts

---
 .../src/System.IO.Hashing.csproj              |   4 +-
 .../{Adler32Simd.cs => Adler32.Vectorized.cs} | 114 ++++++++----------
 2 files changed, 49 insertions(+), 69 deletions(-)
 rename src/libraries/System.IO.Hashing/src/System/IO/Hashing/{Adler32Simd.cs => Adler32.Vectorized.cs} (86%)

diff --git a/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj b/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj
index 29cc99349682d0..81d97845da26b6 100644
--- a/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj
+++ b/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj
@@ -1,4 +1,4 @@
-<Project Sdk="Microsoft.NET.Sdk">
+﻿<Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
     <TargetFrameworks>$(NetCoreAppCurrent);$(NetCoreAppPrevious);$(NetCoreAppMinimum);netstandard2.0;$(NetFrameworkMinimum)</TargetFrameworks>
@@ -35,7 +35,7 @@ System.IO.Hashing.XxHash32</PackageDescription>
   </ItemGroup>
 
   <ItemGroup Condition="'$(TargetFrameworkIdentifier)' == '.NETCoreApp'">
-    <Compile Include="System\IO\Hashing\Adler32Simd.cs" />
+    <Compile Include="System\IO\Hashing\Adler32.Vectorized.cs" />
     <Compile Include="System\IO\Hashing\Crc32.Arm.cs" />
     <Compile Include="System\IO\Hashing\Crc32.Vectorized.cs" />
     <Compile Include="System\IO\Hashing\Crc64.Vectorized.cs" />
diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs
similarity index 86%
rename from src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
rename to src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs
index a654503cd839d7..119d27fcd9b1cc 100644
--- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs
@@ -67,15 +67,16 @@ private static uint UpdateCore<TSimdStrategy, TAccumulate, TDotProduct>(uint adl
     {
         Debug.Assert(source.Length >= Vector128<byte>.Count);
 
-        uint s1 = (ushort)adler;
-        uint s2 = adler >>> 16;
-
         ref byte bufRef = ref MemoryMarshal.GetReference(source);
-        uint len = (uint)source.Length;
+        uint totalLength = (uint)source.Length;
+        uint totalVectors = totalLength / (uint)Vector128<byte>.Count;
 
-        uint vectors = len / (uint)Vector128<byte>.Count;
-        uint loopVectors = vectors & ~1u;
-        len -= vectors * (uint)Vector128<byte>.Count;
+        uint loopVectors = totalVectors & ~1u;
+        uint tailVectors = totalVectors - loopVectors;
+        uint tailLength = totalLength - totalVectors * (uint)Vector128<byte>.Count;
+
+        uint s1 = (ushort)adler;
+        uint s2 = adler >>> 16;
 
         Vector128<uint> vs1 = Vector128.CreateScalar(s1);
         Vector128<uint> vs2 = Vector128.CreateScalar(s2);
@@ -85,8 +86,10 @@ private static uint UpdateCore<TSimdStrategy, TAccumulate, TDotProduct>(uint adl
 
         Vector128<byte> weights = Vector128.CreateSequence((byte)16, unchecked((byte)-1));
 
-        if ((vectors & 1) != 0)
+        if (tailVectors != 0)
         {
+            Debug.Assert(tailVectors == 1);
+
             Vector128<byte> bytes = Vector128.LoadUnsafe(ref bufRef);
             bufRef = ref Unsafe.Add(ref bufRef, (uint)Vector128<byte>.Count);
 
@@ -98,19 +101,19 @@ private static uint UpdateCore<TSimdStrategy, TAccumulate, TDotProduct>(uint adl
             vs2 += vps << 4;
         }
 
-        if (len != 0)
+        if (tailLength != 0)
         {
-            Debug.Assert(len < (uint)Vector128<byte>.Count);
+            Debug.Assert(tailLength < (uint)Vector128<byte>.Count);
 
-            Vector128<byte> bytes = Vector128.LoadUnsafe(ref Unsafe.Subtract(ref bufRef, (uint)Vector128<byte>.Count - len));
-            bytes &= Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(MaskBytes), len);
+            Vector128<byte> bytes = Vector128.LoadUnsafe(ref Unsafe.Subtract(ref bufRef, (uint)Vector128<byte>.Count - tailLength));
+            bytes &= Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(MaskBytes), tailLength);
 
             Vector128<uint> vps = vs1;
 
             vs1 = TAccumulate.Accumulate(vs1, bytes);
             vs2 = TDotProduct.DotProduct(vs2, bytes, weights);
 
-            vs2 += vps * Vector128.Create(len);
+            vs2 += vps * Vector128.Create(tailLength);
         }
 
         s1 = Vector128.Sum(vs1) % Adler32.ModBase;
@@ -147,6 +150,8 @@ public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate,
         where TAccumulate : struct, ISimdAccumulate
         where TDotProduct : struct, ISimdDotProduct
     {
+        Debug.Assert(uint.IsEvenInteger(vectors));
+
         const uint blockSize = 2;
 
         Vector128<byte> weights1 = Vector128.CreateSequence((byte)32, unchecked((byte)-1));
@@ -195,11 +200,32 @@ public static Vector256<uint> QuickModBase(Vector256<uint> values)
         return (vhi << 4) - vhi + vlo;
     }
 
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes)
+        => Avx2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums;
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2)
+    {
+        Vector256<byte> zero = default;
+        Vector256<uint> sad = Avx2.SumAbsoluteDifferences(bytes1, zero).AsUInt32();
+        return sad + Avx2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums;
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right)
+    {
+        Vector256<short> mad = Avx2.MultiplyAddAdjacent(left, right.AsSByte());
+        return Avx2.MultiplyAddAdjacent(mad, Vector256<short>.One).AsUInt32() + addend;
+    }
+
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate, TDotProduct>(Vector128<uint> vs1, Vector128<uint> vs2, ref byte sourceRef, uint vectors)
         where TAccumulate : struct, ISimdAccumulate
         where TDotProduct : struct, ISimdDotProduct
     {
+        Debug.Assert(uint.IsEvenInteger(vectors));
+
         const uint blockSize = 4;
 
         Vector256<byte> weights1 = Vector256.CreateSequence((byte)64, unchecked((byte)-1));
@@ -224,9 +250,9 @@ public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate,
 
                 wps += ws1;
 
-                ws1 = TAccumulate.Accumulate(ws1, bytes1, bytes2);
-                ws2 = TDotProduct.DotProduct(ws2, bytes1, weights1);
-                ws3 = TDotProduct.DotProduct(ws3, bytes2, weights2);
+                ws1 = Accumulate(ws1, bytes1, bytes2);
+                ws2 = DotProduct(ws2, bytes1, weights1);
+                ws3 = DotProduct(ws3, bytes2, weights2);
             }
             while (--blocks != 0);
 
@@ -237,15 +263,15 @@ public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate,
             ws2 = QuickModBase(ws2);
         }
 
-        if ((vectors & 2) != 0)
+        if (vectors != 0)
         {
-            Vector256<byte> bytes = Vector256.LoadUnsafe(ref sourceRef);
-            sourceRef = ref Unsafe.Add(ref sourceRef, (uint)Vector256<byte>.Count);
+            Debug.Assert(vectors == 2);
 
+            Vector256<byte> bytes = Vector256.LoadUnsafe(ref sourceRef);
             Vector256<uint> wps = ws1;
 
-            ws1 = TAccumulate.Accumulate(ws1, bytes);
-            ws2 = TDotProduct.DotProduct(ws2, bytes, weights2);
+            ws1 = Accumulate(ws1, bytes);
+            ws2 = DotProduct(ws2, bytes, weights2);
 
             ws2 += wps << 5;
         }
@@ -263,10 +289,6 @@ public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate,
     public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes)
         => Sse2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums;
 
-    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes)
-        => Avx2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums;
-
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes1, Vector128<byte> bytes2)
     {
@@ -274,14 +296,6 @@ public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> b
         Vector128<uint> sad = Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32();
         return sad + Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums;
     }
-
-    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2)
-    {
-        Vector256<byte> zero = default;
-        Vector256<uint> sad = Avx2.SumAbsoluteDifferences(bytes1, zero).AsUInt32();
-        return sad + Avx2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums;
-    }
 }
 
 file struct AccumulateArm64 : ISimdAccumulate
@@ -290,15 +304,9 @@ public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> b
     public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes)
         => AdvSimd.Arm64.AddAcrossWidening(bytes).AsUInt32().ToVector128Unsafe() + sums;
 
-    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes)
-        => throw new UnreachableException();
-
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes1, Vector128<byte> bytes2)
         => AdvSimd.AddPairwiseWideningAndAdd(sums, AdvSimd.AddPairwiseWideningAndAdd(AdvSimd.AddPairwiseWidening(bytes1), bytes2));
-
-    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2)
-        => throw new UnreachableException();
 }
 
 file struct AccumulateXplat : ISimdAccumulate
@@ -311,9 +319,6 @@ public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> b
         return sums + sl + sh;
     }
 
-    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes)
-        => throw new UnreachableException();
-
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes1, Vector128<byte> bytes2)
     {
@@ -322,9 +327,6 @@ public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> b
         (Vector128<uint> sl, Vector128<uint> sh) = Vector128.Widen(b1l + b1h + b2l + b2h);
         return sums + sl + sh;
     }
-
-    public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2)
-        => throw new UnreachableException();
 }
 
 file struct DotProductX86 : ISimdDotProduct
@@ -335,13 +337,6 @@ public static Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte>
         Vector128<short> mad = Ssse3.MultiplyAddAdjacent(left, right.AsSByte());
         return Sse2.MultiplyAddAdjacent(mad, Vector128<short>.One).AsUInt32() + addend;
     }
-
-    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right)
-    {
-        Vector256<short> mad = Avx2.MultiplyAddAdjacent(left, right.AsSByte());
-        return Avx2.MultiplyAddAdjacent(mad, Vector256<short>.One).AsUInt32() + addend;
-    }
 }
 
 file struct DotProductArm64 : ISimdDotProduct
@@ -353,9 +348,6 @@ public static Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte>
         mad = AdvSimd.MultiplyWideningUpperAndAdd(mad, left, right);
         return AdvSimd.AddPairwiseWideningAndAdd(addend, mad);
     }
-
-    public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right)
-        => throw new UnreachableException();
 }
 
 file struct DotProductArm64Dp : ISimdDotProduct
@@ -363,9 +355,6 @@ public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte> left, Vector128<byte> right)
         => Dp.DotProduct(addend, left, right);
-
-    public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right)
-        => throw new UnreachableException();
 }
 
 file struct DotProductXplat : ISimdDotProduct
@@ -378,27 +367,18 @@ public static Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte>
         (Vector128<uint> ml, Vector128<uint> mh) = Vector128.Widen(ll * rl + lh * rh);
         return addend + ml + mh;
     }
-
-    public static Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right)
-        => throw new UnreachableException();
 }
 
 file interface ISimdAccumulate
 {
     static abstract Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes);
 
-    static abstract Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes);
-
     static abstract Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes1, Vector128<byte> bytes2);
-
-    static abstract Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2);
 }
 
 file interface ISimdDotProduct
 {
     static abstract Vector128<uint> DotProduct(Vector128<uint> addend, Vector128<byte> left, Vector128<byte> right);
-
-    static abstract Vector256<uint> DotProduct(Vector256<uint> addend, Vector256<byte> left, Vector256<byte> right);
 }
 
 file interface ISimdStrategy

From 2a9af33d3d0b077f1dc8299ea869d39f07a6f92c Mon Sep 17 00:00:00 2001
From: Clinton Ingram <clinton.ingram@outlook.com>
Date: Mon, 9 Mar 2026 17:13:56 -0700
Subject: [PATCH 6/6] Address feedback: replace default with explicit Zero vectors, regroup arithmetic

---
 .../System/IO/Hashing/Adler32.Vectorized.cs   | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs
index 119d27fcd9b1cc..88d064912d9d41 100644
--- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs
@@ -138,11 +138,11 @@ private static Vector128<uint> QuickModBase(Vector128<uint> values)
         // the reduction, resulting in a maximum possible residual of 0xFFFF0.
         //
         // This is further optimized to: `high * 16 - high + low`
-        // and implemented as: `(high << 4) - high + low`.
+        // and implemented as: `(high << 4) + (low - high)`.
 
         Vector128<uint> vlo = values & (Vector128<uint>.AllBitsSet >>> 16);
         Vector128<uint> vhi = values >>> 16;
-        return (vhi << 4) - vhi + vlo;
+        return (vhi << 4) + (vlo - vhi);
     }
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -159,8 +159,8 @@ public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate,
 
         while (vectors >= blockSize)
         {
-            Vector128<uint> vs3 = default;
-            Vector128<uint> vps = default;
+            Vector128<uint> vs3 = Vector128<uint>.Zero;
+            Vector128<uint> vps = Vector128<uint>.Zero;
 
             uint blocks = uint.Min(vectors, Adler32Simd.VMax) / blockSize;
             vectors -= blocks * blockSize;
@@ -197,17 +197,17 @@ public static Vector256<uint> QuickModBase(Vector256<uint> values)
     {
         Vector256<uint> vlo = values & (Vector256<uint>.AllBitsSet >>> 16);
         Vector256<uint> vhi = values >>> 16;
-        return (vhi << 4) - vhi + vlo;
+        return (vhi << 4) + (vlo - vhi);
     }
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes)
-        => Avx2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums;
+        => Avx2.SumAbsoluteDifferences(bytes, Vector256<byte>.Zero).AsUInt32() + sums;
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static Vector256<uint> Accumulate(Vector256<uint> sums, Vector256<byte> bytes1, Vector256<byte> bytes2)
     {
-        Vector256<byte> zero = default;
+        Vector256<byte> zero = Vector256<byte>.Zero;
         Vector256<uint> sad = Avx2.SumAbsoluteDifferences(bytes1, zero).AsUInt32();
         return sad + Avx2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums;
     }
@@ -236,8 +236,8 @@ public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate,
 
         while (vectors >= blockSize)
         {
-            Vector256<uint> ws3 = default;
-            Vector256<uint> wps = default;
+            Vector256<uint> ws3 = Vector256<uint>.Zero;
+            Vector256<uint> wps = Vector256<uint>.Zero;
 
             uint blocks = uint.Min(vectors, Adler32Simd.VMax) / blockSize;
             vectors -= blocks * blockSize;
@@ -287,12 +287,12 @@ public static (Vector128<uint> vs1, Vector128<uint> vs2) VectorLoop<TAccumulate,
 {
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes)
-        => Sse2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums;
+        => Sse2.SumAbsoluteDifferences(bytes, Vector128<byte>.Zero).AsUInt32() + sums;
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> bytes1, Vector128<byte> bytes2)
     {
-        Vector128<byte> zero = default;
+        Vector128<byte> zero = Vector128<byte>.Zero;
         Vector128<uint> sad = Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32();
         return sad + Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums;
     }
@@ -324,7 +324,7 @@ public static Vector128<uint> Accumulate(Vector128<uint> sums, Vector128<byte> b
     {
         (Vector128<ushort> b1l, Vector128<ushort> b1h) = Vector128.Widen(bytes1);
         (Vector128<ushort> b2l, Vector128<ushort> b2h) = Vector128.Widen(bytes2);
-        (Vector128<uint> sl, Vector128<uint> sh) = Vector128.Widen(b1l + b1h + b2l + b2h);
+        (Vector128<uint> sl, Vector128<uint> sh) = Vector128.Widen((b1l + b1h) + (b2l + b2h));
         return sums + sl + sh;
     }
 }