From d70bfc1361b51ad66bb6145f98a29c02f7fdba81 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 4 Mar 2026 13:27:01 -0800 Subject: [PATCH 1/6] improve Adler32 vectorization --- .../src/System.IO.Hashing.csproj | 1 + .../src/System/IO/Hashing/Adler32.cs | 268 +---------- .../src/System/IO/Hashing/Adler32Simd.cs | 415 ++++++++++++++++++ .../System.IO.Hashing/tests/Adler32Tests.cs | 54 ++- 4 files changed, 445 insertions(+), 293 deletions(-) create mode 100644 src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs diff --git a/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj b/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj index 06555e0e92fc41..29cc99349682d0 100644 --- a/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj +++ b/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj @@ -35,6 +35,7 @@ System.IO.Hashing.XxHash32 + diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs index da54cdb3372b19..e6e9956d56c190 100644 --- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.cs @@ -5,11 +5,6 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -#if NET -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.Arm; -using System.Runtime.Intrinsics.X86; -#endif namespace System.IO.Hashing { @@ -30,7 +25,7 @@ public sealed partial class Adler32 : NonCryptographicHashAlgorithm private uint _adler = InitialState; /// Largest prime smaller than 65536. - private const uint ModBase = 65521; + internal const uint ModBase = 65521; /// NMax is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 private const int NMax = 5552; @@ -192,21 +187,9 @@ private static uint Update(uint adler, ReadOnlySpan source) } #if NET - if (BitConverter.IsLittleEndian && - Vector128.IsHardwareAccelerated && - source.Length >= Vector128.Count * 2) + if (IsVectorizable(source)) { - if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && source.Length >= Vector512.Count) - { - return UpdateVector512(adler, source); - } - - if (Vector256.IsHardwareAccelerated && Avx2.IsSupported && source.Length >= Vector256.Count) - { - return UpdateVector256(adler, source); - } - - return UpdateVector128(adler, source); + return UpdateVectorized(adler, source); } #endif @@ -236,250 +219,5 @@ private static uint UpdateScalar(uint adler, ReadOnlySpan source) return (s2 << 16) | s1; } - -#if NET - [MethodImpl(MethodImplOptions.NoInlining)] - private static uint UpdateVector128(uint adler, ReadOnlySpan source) - { - Debug.Assert(source.Length >= Vector128.Count * 2); - - const int BlockSize = 32; // two Vector128 loads - - uint s1 = adler & 0xFFFF; - uint s2 = (adler >> 16) & 0xFFFF; - - ref byte sourceRef = ref MemoryMarshal.GetReference(source); - int length = source.Length; - - Vector128 tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); - Vector128 tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - - do - { - int n = Math.Min(length, NMax); - int blocks = n / BlockSize; - n = blocks * BlockSize; - length -= n; - - Vector128 vs1 = Vector128.Zero; - Vector128 vs2 = Vector128.CreateScalar(s2); - Vector128 vps = Vector128.CreateScalar(s1 * (uint)blocks); - - do - { - Vector128 bytes1 = Vector128.LoadUnsafe(ref sourceRef); - Vector128 bytes2 = Vector128.LoadUnsafe(ref sourceRef, 16); - sourceRef = ref Unsafe.Add(ref sourceRef, BlockSize); - - vps += vs1; - - if (Ssse3.IsSupported) - { - vs1 += Sse2.SumAbsoluteDifferences(bytes1, Vector128.Zero).AsUInt32(); - vs1 += Sse2.SumAbsoluteDifferences(bytes2, Vector128.Zero).AsUInt32(); - - vs2 += Sse2.MultiplyAddAdjacent(Ssse3.MultiplyAddAdjacent(bytes1, tap1), Vector128.One).AsUInt32(); - vs2 += Sse2.MultiplyAddAdjacent(Ssse3.MultiplyAddAdjacent(bytes2, tap2), Vector128.One).AsUInt32(); - } - else if (AdvSimd.IsSupported) - { - // Widening byte sum (equivalent of SumAbsoluteDifferences against zero) - vs1 = AdvSimd.AddPairwiseWideningAndAdd( - vs1, - AdvSimd.AddPairwiseWideningAndAdd( - AdvSimd.AddPairwiseWidening(bytes1), - bytes2)); - - // Widening multiply + horizontal add (equivalent of MultiplyAddAdjacent chain). - // Because weights are all positive (1-32), unsigned byte * unsigned byte multiply is valid. - Vector128 wprod1 = AdvSimd.MultiplyWideningLower(bytes1.GetLower(), tap1.AsByte().GetLower()); - wprod1 = AdvSimd.MultiplyWideningUpperAndAdd(wprod1, bytes1, tap1.AsByte()); - vs2 = AdvSimd.AddPairwiseWideningAndAdd(vs2, wprod1); - - Vector128 wprod2 = AdvSimd.MultiplyWideningLower(bytes2.GetLower(), tap2.AsByte().GetLower()); - wprod2 = AdvSimd.MultiplyWideningUpperAndAdd(wprod2, bytes2, tap2.AsByte()); - vs2 = AdvSimd.AddPairwiseWideningAndAdd(vs2, wprod2); - } - else - { - (Vector128 lo1, Vector128 hi1) = Vector128.Widen(bytes1); - (Vector128 lo2, Vector128 hi2) = Vector128.Widen(bytes2); - (Vector128 sumLo, Vector128 sumHi) = Vector128.Widen(lo1 + hi1 + lo2 + hi2); - vs1 += sumLo + sumHi; - vs2 += WeightedSumWidening128(bytes1, tap1) + WeightedSumWidening128(bytes2, tap2); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 WeightedSumWidening128(Vector128 data, Vector128 weights) - { - (Vector128 dLo, Vector128 dHi) = Vector128.Widen(data); - (Vector128 wLo, Vector128 wHi) = Vector128.Widen(weights); - - (Vector128 pLo1, Vector128 pHi1) = Vector128.Widen(dLo.AsInt16() * wLo); - (Vector128 pLo2, Vector128 pHi2) = Vector128.Widen(dHi.AsInt16() * wHi); - - return (pLo1 + pHi1 + pLo2 + pHi2).AsUInt32(); - } - } - } - while (--blocks > 0); - - vs2 += vps << 5; - - s1 += Vector128.Sum(vs1); - s2 = Vector128.Sum(vs2); - - s1 %= ModBase; - s2 %= ModBase; - } - while (length >= BlockSize); - - if (length > 0) - { - UpdateScalarTail(ref sourceRef, length, ref s1, ref s2); - } - - return (s2 << 16) | s1; - } - - [MethodImpl(MethodImplOptions.NoInlining)] - private static uint UpdateVector256(uint adler, ReadOnlySpan source) - { - Debug.Assert(source.Length >= Vector256.Count); - - const int BlockSize = 32; - - uint s1 = adler & 0xFFFF; - uint s2 = (adler >> 16) & 0xFFFF; - - ref byte sourceRef = ref MemoryMarshal.GetReference(source); - int length = source.Length; - - Vector256 weights = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - - do - { - int n = Math.Min(length, NMax); - int blocks = n / BlockSize; - n = blocks * BlockSize; - length -= n; - - Vector256 vs1 = Vector256.CreateScalar(s1); - Vector256 vs2 = Vector256.CreateScalar(s2); - Vector256 vs3 = Vector256.Zero; - - do - { - Vector256 data = Vector256.LoadUnsafe(ref sourceRef); - sourceRef = ref Unsafe.Add(ref sourceRef, BlockSize); - - Vector256 vs1_0 = vs1; - vs1 += Avx2.SumAbsoluteDifferences(data, Vector256.Zero).AsUInt32(); - vs3 += vs1_0; - - Vector256 mad = Avx2.MultiplyAddAdjacent(data, weights); - vs2 += Avx2.MultiplyAddAdjacent(mad, Vector256.One).AsUInt32(); - } - while (--blocks > 0); - - vs3 <<= 5; - vs2 += vs3; - - s1 = (uint)Vector256.Sum(vs1.AsUInt64()); // SumAbsoluteDifferences stores the results in the even lanes - s2 = Vector256.Sum(vs2); - - s1 %= ModBase; - s2 %= ModBase; - } - while (length >= BlockSize); - - if (length > 0) - { - UpdateScalarTail(ref sourceRef, length, ref s1, ref s2); - } - - return (s2 << 16) | s1; - } - - [MethodImpl(MethodImplOptions.NoInlining)] - private static uint UpdateVector512(uint adler, ReadOnlySpan source) - { - Debug.Assert(source.Length >= Vector512.Count); - - const int BlockSize = 64; - - uint s1 = adler & 0xFFFF; - uint s2 = (adler >> 16) & 0xFFFF; - - ref byte sourceRef = ref MemoryMarshal.GetReference(source); - int length = source.Length; - - Vector512 weights = Vector512.Create( - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - - do - { - int n = Math.Min(length, NMax); - int blocks = n / BlockSize; - n = blocks * BlockSize; - length -= n; - - Vector512 vs1 = Vector512.CreateScalar(s1); - Vector512 vs2 = Vector512.CreateScalar(s2); - Vector512 vs3 = Vector512.Zero; - - do - { - Vector512 data = Vector512.LoadUnsafe(ref sourceRef); - sourceRef = ref Unsafe.Add(ref sourceRef, BlockSize); - - Vector512 vs1_0 = vs1; - vs1 += Avx512BW.SumAbsoluteDifferences(data, Vector512.Zero).AsUInt32(); - vs3 += vs1_0; - vs2 += Avx512BW.MultiplyAddAdjacent(Avx512BW.MultiplyAddAdjacent(data, weights), Vector512.One).AsUInt32(); - - Vector256 sumLo = Avx2.SumAbsoluteDifferences(data.GetLower(), Vector256.Zero).AsUInt32(); - vs2 += Vector512.Create(sumLo << 5, Vector256.Zero); - } - while (--blocks > 0); - - vs3 <<= 6; - vs2 += vs3; - - s1 = (uint)Vector512.Sum(vs1.AsUInt64()); - s2 = Vector512.Sum(vs2); - - s1 %= ModBase; - s2 %= ModBase; - } - while (length >= BlockSize); - - if (length >= Vector256.Count) - { - return UpdateVector256((s2 << 16) | s1, MemoryMarshal.CreateReadOnlySpan(ref sourceRef, length)); - } - - if (length > 0) - { - UpdateScalarTail(ref sourceRef, length, ref s1, ref s2); - } - - return (s2 << 16) | s1; - } - - private static void UpdateScalarTail(ref byte sourceRef, int length, ref uint s1, ref uint s2) - { - Debug.Assert(length is > 0 and < NMax); - - foreach (byte b in MemoryMarshal.CreateReadOnlySpan(ref sourceRef, length)) - { - s1 += b; - s2 += s1; - } - - s1 %= ModBase; - s2 %= ModBase; - } -#endif } } diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs new file mode 100644 index 00000000000000..b96ee4404497bb --- /dev/null +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs @@ -0,0 +1,415 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; + +namespace System.IO.Hashing; + +public sealed partial class Adler32 +{ + private static bool IsVectorizable(ReadOnlySpan source) + => Vector128.IsHardwareAccelerated && source.Length >= Vector128.Count; + + private static uint UpdateVectorized(uint adler, ReadOnlySpan source) + => Adler32Simd.UpdateVectorized(adler, source); +} + +file static class Adler32Simd +{ + // VMax represents the maximum number of 16-byte vectors we can process before reducing + // mod 65521. This is analogous to NMax in the scalar code, however because the accumulated + // values are distributed across vector elements, we can process more bytes before possible + // overflow in any individual element. For this implementation, the max is actually 460 + // vectors, but we choose 448, because it divides evenly by any reasonable block size. + public const uint VMax = 448; + + private static ReadOnlySpan MaskBytes => [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + ]; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static uint UpdateVectorized(uint adler, ReadOnlySpan source) + { + if (Vector256.IsHardwareAccelerated && Avx2.IsSupported) + { + return UpdateCore(adler, source); + } + + if (Ssse3.IsSupported) + { + return UpdateCore(adler, source); + } + + if (AdvSimd.Arm64.IsSupported) + { + if (Dp.IsSupported) + { + return UpdateCore(adler, source); + } + + return UpdateCore(adler, source); + } + + return UpdateCore(adler, source); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint UpdateCore(uint adler, ReadOnlySpan source) + where TSimdStrategy : struct, ISimdStrategy + where TAccumulate : struct, ISimdAccumulate + where TDotProduct : struct, ISimdDotProduct + { + Debug.Assert(source.Length >= Vector128.Count); + + uint s1 = (ushort)adler; + uint s2 = adler >>> 16; + + ref byte bufRef = ref MemoryMarshal.GetReference(source); + uint len = (uint)source.Length; + + uint vectors = len / (uint)Vector128.Count; + uint loopVectors = vectors & ~1u; + len -= vectors * (uint)Vector128.Count; + + Vector128 vs1 = Vector128.CreateScalar(s1); + Vector128 vs2 = Vector128.CreateScalar(s2); + + if (loopVectors != 0) + { + (vs1, vs2) = TSimdStrategy.VectorLoop(vs1, vs2, ref bufRef, loopVectors); + bufRef = ref Unsafe.Add(ref bufRef, loopVectors * (uint)Vector128.Count); + } + + Vector128 weights = Vector128.CreateSequence((byte)16, unchecked((byte)-1)); + Vector128 vps; + + if ((vectors & 1) != 0) + { + Vector128 bytes = Vector128.LoadUnsafe(ref bufRef); + bufRef = ref Unsafe.Add(ref bufRef, (uint)Vector128.Count); + + vps = vs1; + + vs1 = TAccumulate.Accumulate(vs1, bytes); + vs2 = TDotProduct.DotProduct(vs2, bytes, weights); + + vs2 += vps << 4; + } + + if (len != 0) + { + Debug.Assert(len < (uint)Vector128.Count); + + Vector128 bytes = Vector128.LoadUnsafe(ref Unsafe.Subtract(ref bufRef, (uint)Vector128.Count - len)); + bytes &= Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(MaskBytes), len); + + vps = vs1; + + vs1 = TAccumulate.Accumulate(vs1, bytes); + vs2 = TDotProduct.DotProduct(vs2, bytes, weights); + + vs2 += vps * Vector128.Create(len); + } + + s1 = Vector128.Sum(vs1) % Adler32.ModBase; + s2 = Vector128.Sum(vs2) % Adler32.ModBase; + + return s1 | (s2 << 16); + } +} + +file struct AdlerVector128 : ISimdStrategy +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 QuickModBase(Vector128 values) + { + // Calculating the residual mod 65521 is impractical in SIMD, however we can reduce by + // enough to prevent overflow without changing the final result of a modulo performed later. + // Essentially, the high word of the accumulator represents the number of times it has + // wrapped to 65536. + // + // 65536 % 65521 = 15, which is what would be carried forward from the high word. + // We can simply multiply the high word by 15 and add that to the low word to perform + // the reduction, resulting in a maximum possible residual of 0xFFFF0. + // + // This is further optimized to: `high * 16 - high + low` + // and implemented as: `(high << 4) - high + low`. + + Vector128 vlo = values & (Vector128.AllBitsSet >>> 16); + Vector128 vhi = values >>> 16; + return (vhi << 4) - vhi + vlo; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static (Vector128 vs1, Vector128 vs2) VectorLoop(Vector128 vs1, Vector128 vs2, ref byte sourceRef, uint vectors) + where TAccumulate : struct, ISimdAccumulate + where TDotProduct : struct, ISimdDotProduct + { + const uint blockSize = 2; + + Vector128 weights1 = Vector128.CreateSequence((byte)32, unchecked((byte)-1)); + Vector128 weights2 = Vector128.CreateSequence((byte)16, unchecked((byte)-1)); + + while (vectors >= blockSize) + { + Vector128 vs3 = default; + Vector128 vps = default; + + uint blocks = uint.Min(vectors, Adler32Simd.VMax) / blockSize; + vectors -= blocks * blockSize; + + do + { + Vector128 bytes1 = Vector128.LoadUnsafe(ref sourceRef); + Vector128 bytes2 = Vector128.LoadUnsafe(ref sourceRef, (uint)Vector128.Count); + sourceRef = ref Unsafe.Add(ref sourceRef, (uint)Vector128.Count * 2); + + vps += vs1; + + vs1 = TAccumulate.Accumulate(vs1, bytes1, bytes2); + + vs2 = TDotProduct.DotProduct(vs2, bytes1, weights1); + vs3 = TDotProduct.DotProduct(vs3, bytes2, weights2); + } + while (--blocks != 0); + + vs2 += vs3; + vs2 += vps << 5; + + vs1 = QuickModBase(vs1); + vs2 = QuickModBase(vs2); + } + + return (vs1, vs2); + } +} + +file struct AdlerVector256 : ISimdStrategy +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 QuickModBase(Vector256 values) + { + Vector256 vlo = values & (Vector256.AllBitsSet >>> 16); + Vector256 vhi = values >>> 16; + return (vhi << 4) - vhi + vlo; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static (Vector128 vs1, Vector128 vs2) VectorLoop(Vector128 vs1, Vector128 vs2, ref byte sourceRef, uint vectors) + where TAccumulate : struct, ISimdAccumulate + where TDotProduct : struct, ISimdDotProduct + { + const uint blockSize = 4; + + Vector256 weights1 = Vector256.CreateSequence((byte)64, unchecked((byte)-1)); + Vector256 weights2 = Vector256.CreateSequence((byte)32, unchecked((byte)-1)); + + Vector256 ws1 = vs1.ToVector256Unsafe(); + Vector256 ws2 = vs2.ToVector256Unsafe(); + + while (vectors >= blockSize) + { + Vector256 ws3 = default; + Vector256 wps = default; + + uint blocks = uint.Min(vectors, Adler32Simd.VMax) / blockSize; + vectors -= blocks * blockSize; + + do + { + Vector256 bytes1 = Vector256.LoadUnsafe(ref sourceRef); + Vector256 bytes2 = Vector256.LoadUnsafe(ref sourceRef, (uint)Vector256.Count); + sourceRef = ref Unsafe.Add(ref sourceRef, (uint)Vector256.Count * 2); + + wps += ws1; + + ws1 = TAccumulate.Accumulate(ws1, bytes1, bytes2); + + ws2 = TDotProduct.DotProduct(ws2, bytes1, weights1); + ws3 = TDotProduct.DotProduct(ws3, bytes2, weights2); + } + while (--blocks != 0); + + ws2 += ws3; + ws2 += wps << 6; + + ws1 = QuickModBase(ws1); + ws2 = QuickModBase(ws2); + } + + if ((vectors & 2) != 0) + { + Vector256 bytes = Vector256.LoadUnsafe(ref sourceRef); + sourceRef = ref Unsafe.Add(ref sourceRef, (uint)Vector256.Count); + + Vector256 w_ps = ws1; + + ws1 = TAccumulate.Accumulate(ws1, bytes); + ws2 = TDotProduct.DotProduct(ws2, bytes, weights2); + + ws2 += w_ps << 5; + } + + vs1 = ws1.GetLower() + ws1.GetUpper(); + vs2 = ws2.GetLower() + ws2.GetUpper(); + + return (vs1, vs2); + } +} + +file struct AccumulateX86 : ISimdAccumulate +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Accumulate(Vector128 sums, Vector128 bytes) + => Sse2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 Accumulate(Vector256 sums, Vector256 bytes) + => Avx2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Accumulate(Vector128 sums, Vector128 bytes1, Vector128 bytes2) + { + Vector128 zero = default; + Vector128 sad = Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32(); + return sad + Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2) + { + Vector256 zero = default; + Vector256 sad = Avx2.SumAbsoluteDifferences(bytes1, zero).AsUInt32(); + return sad + Avx2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums; + } +} + +file struct AccumulateArm64 : ISimdAccumulate +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Accumulate(Vector128 sums, Vector128 bytes) + => AdvSimd.Arm64.AddAcrossWidening(bytes).AsUInt32().ToVector128Unsafe() + sums; + + public static Vector256 Accumulate(Vector256 sums, Vector256 bytes) + => throw new NotImplementedException(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Accumulate(Vector128 sums, Vector128 bytes1, Vector128 bytes2) + => AdvSimd.AddPairwiseWideningAndAdd(sums, AdvSimd.AddPairwiseWideningAndAdd(AdvSimd.AddPairwiseWidening(bytes1), bytes2)); + + public static Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2) + => throw new NotImplementedException(); +} + +file struct AccumulateXplat : ISimdAccumulate +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Accumulate(Vector128 sums, Vector128 bytes) + { + (Vector128 bl, Vector128 bh) = Vector128.Widen(bytes); + (Vector128 sl, Vector128 sh) = Vector128.Widen(bl + bh); + return sums + sl + sh; + } + + public static Vector256 Accumulate(Vector256 sums, Vector256 bytes) + => throw new NotImplementedException(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Accumulate(Vector128 sums, Vector128 bytes1, Vector128 bytes2) + { + (Vector128 b1l, Vector128 b1h) = Vector128.Widen(bytes1); + (Vector128 b2l, Vector128 b2h) = Vector128.Widen(bytes2); + (Vector128 sl, Vector128 sh) = Vector128.Widen(b1l + b1h + b2l + b2h); + return sums + sl + sh; + } + + public static Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2) + => throw new NotImplementedException(); +} + +file struct DotProductX86 : ISimdDotProduct +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 DotProduct(Vector128 addend, Vector128 left, Vector128 right) + { + Vector128 mad = Ssse3.MultiplyAddAdjacent(left, right.AsSByte()); + return Sse2.MultiplyAddAdjacent(mad, Vector128.One).AsUInt32() + addend; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) + { + Vector256 mad = Avx2.MultiplyAddAdjacent(left, right.AsSByte()); + return Avx2.MultiplyAddAdjacent(mad, Vector256.One).AsUInt32() + addend; + } +} + +file struct DotProductArm64 : ISimdDotProduct +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 DotProduct(Vector128 addend, Vector128 left, Vector128 right) + { + Vector128 mad = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower()); + mad = AdvSimd.MultiplyWideningUpperAndAdd(mad, left, right); + return AdvSimd.AddPairwiseWideningAndAdd(addend, mad); + } + + public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) + => throw new NotImplementedException(); +} + +file struct DotProductArm64Dp : ISimdDotProduct +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 DotProduct(Vector128 addend, Vector128 left, Vector128 right) + => Dp.DotProduct(addend, left, right); + + public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) + => throw new NotImplementedException(); +} + +file struct DotProductXplat : ISimdDotProduct +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 DotProduct(Vector128 addend, Vector128 left, Vector128 right) + { + (Vector128 ll, Vector128 lh) = Vector128.Widen(left); + (Vector128 rl, Vector128 rh) = Vector128.Widen(right); + (Vector128 ml, Vector128 mh) = Vector128.Widen(ll * rl + lh * rh); + return addend + ml + mh; + } + + public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) + => throw new NotImplementedException(); +} + +file interface ISimdAccumulate +{ + static abstract Vector128 Accumulate(Vector128 sums, Vector128 bytes); + + static abstract Vector256 Accumulate(Vector256 sums, Vector256 bytes); + + static abstract Vector128 Accumulate(Vector128 sums, Vector128 bytes1, Vector128 bytes2); + + static abstract Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2); +} + +file interface ISimdDotProduct +{ + static abstract Vector128 DotProduct(Vector128 addend, Vector128 left, Vector128 right); + + static abstract Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right); +} + +file interface ISimdStrategy +{ + static abstract (Vector128 vs1, Vector128 vs2) VectorLoop(Vector128 vs1, Vector128 vs2, ref byte sourceRef, uint vectors) + where TAccumulate : struct, ISimdAccumulate + where TDotProduct : struct, ISimdDotProduct; +} diff --git a/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs b/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs index 12ec92693d3d51..3e905139b09db8 100644 --- a/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs +++ b/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs @@ -152,19 +152,31 @@ public void VerifyHashToUInt32(TestCase testCase) } [Theory] - [InlineData(5553, 0xAA40476Bu)] - [InlineData(11104, 0xA2778E87u)] + [InlineData(5553, 0x62C69C89u)] + [InlineData(11104, 0xA8AE3724u)] public void LargeInput_ExceedsNMax(int length, uint expected) { - byte[] data = new byte[length]; - for (int i = 0; i < data.Length; i++) - { - data[i] = (byte)('a' + (i % 26)); - } - - Assert.Equal(expected, Adler32.HashToUInt32(data)); + // This test ensures that Adler32 optimizations involving delayed modulo + // do not overflow a 32-bit intermediate at any point. var alg = new Adler32(); + + // The maximum possible value of an Adler32 checksum is 0xFFF0FFF0, + // which has both components just below the modulo value (0xFFF0 == 65520). + // A sequence of 65519 ones will generate this value. + + byte[] primer = new byte[65519]; + primer.AsSpan().Fill(1); + + alg.Append(primer); + Assert.Equal(0xFFF0FFF0, alg.GetCurrentHashAsUInt32()); + + // Starting from an already-maxed checksum, a stream of 5553 max value + // bytes will overflow if not reduced by mod 65521 before the last byte. + + byte[] data = new byte[length]; + data.AsSpan().Fill(byte.MaxValue); + alg.Append(data); Assert.Equal(expected, alg.GetCurrentHashAsUInt32()); } @@ -177,9 +189,14 @@ public void LargeInput_ExceedsNMax(int length, uint expected) [InlineData(1)] [InlineData(2)] [InlineData(7)] + [InlineData(8)] + [InlineData(9)] [InlineData(15)] [InlineData(16)] [InlineData(17)] + [InlineData(23)] + [InlineData(24)] + [InlineData(25)] [InlineData(31)] [InlineData(32)] [InlineData(33)] @@ -223,25 +240,6 @@ public void VariousLengths_MatchesReference(int length) Assert.Equal(expected, alg.GetCurrentHashAsUInt32()); } - /// - /// Tests with all-0xFF bytes, which maximizes accumulator values and stresses - /// overflow-safe behavior in the vectorized paths. - /// - [Theory] - [InlineData(32)] - [InlineData(64)] - [InlineData(128)] - [InlineData(256)] - [InlineData(5552)] - [InlineData(5553)] - public void AllMaxBytes_MatchesReference(int length) - { - byte[] data = new byte[length]; - data.AsSpan().Fill(0xFF); - - Assert.Equal(ReferenceAdler32(data), Adler32.HashToUInt32(data)); - } - /// /// Tests incremental appending with various chunk sizes to verify that the /// vectorized paths produce the same result regardless of how data is fed in. From 9735c7e1e9847d2770d01359e8eee0feafe7d581 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 4 Mar 2026 13:48:01 -0800 Subject: [PATCH 2/6] feedback --- .../src/System/IO/Hashing/Adler32Simd.cs | 9 +++------ .../System.IO.Hashing/tests/Adler32Tests.cs | 16 ++++++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs index b96ee4404497bb..4fd61b5c8d6fed 100644 --- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs @@ -80,11 +80,8 @@ private static uint UpdateCore(uint adl Vector128 vs1 = Vector128.CreateScalar(s1); Vector128 vs2 = Vector128.CreateScalar(s2); - if (loopVectors != 0) - { - (vs1, vs2) = TSimdStrategy.VectorLoop(vs1, vs2, ref bufRef, loopVectors); - bufRef = ref Unsafe.Add(ref bufRef, loopVectors * (uint)Vector128.Count); - } + (vs1, vs2) = TSimdStrategy.VectorLoop(vs1, vs2, ref bufRef, loopVectors); + bufRef = ref Unsafe.Add(ref bufRef, loopVectors * (uint)Vector128.Count); Vector128 weights = Vector128.CreateSequence((byte)16, unchecked((byte)-1)); Vector128 vps; @@ -131,9 +128,9 @@ private static Vector128 QuickModBase(Vector128 values) { // Calculating the residual mod 65521 is impractical in SIMD, however we can reduce by // enough to prevent overflow without changing the final result of a modulo performed later. + // // Essentially, the high word of the accumulator represents the number of times it has // wrapped to 65536. - // // 65536 % 65521 = 15, which is what would be carried forward from the high word. // We can simply multiply the high word by 15 and add that to the low word to perform // the reduction, resulting in a maximum possible residual of 0xFFFF0. diff --git a/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs b/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs index 3e905139b09db8..bc365c9b2ffeee 100644 --- a/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs +++ b/src/libraries/System.IO.Hashing/tests/Adler32Tests.cs @@ -152,9 +152,10 @@ public void VerifyHashToUInt32(TestCase testCase) } [Theory] - [InlineData(5553, 0x62C69C89u)] - [InlineData(11104, 0xA8AE3724u)] - public void LargeInput_ExceedsNMax(int length, uint expected) + [InlineData(5553)] + [InlineData(11104)] + [InlineData(65536)] + public void LargeInput_ExceedsNMax(int length) { // This test ensures that Adler32 optimizations involving delayed modulo // do not overflow a 32-bit intermediate at any point. @@ -167,22 +168,25 @@ public void LargeInput_ExceedsNMax(int length, uint expected) byte[] primer = new byte[65519]; primer.AsSpan().Fill(1); - alg.Append(primer); + Assert.Equal(0xFFF0FFF0, alg.GetCurrentHashAsUInt32()); // Starting from an already-maxed checksum, a stream of 5553 max value // bytes will overflow if not reduced by mod 65521 before the last byte. + // Of course, once overflowed, the result will be incorrect for any larger + // input as well. byte[] data = new byte[length]; data.AsSpan().Fill(byte.MaxValue); - alg.Append(data); + + uint expected = ReferenceAdler32(data, 0xFFF0FFF0); Assert.Equal(expected, alg.GetCurrentHashAsUInt32()); } /// - /// Tests a wide variety of lengths to exercise scalar, Vector128, Vector256, and Vector512 + /// Tests a wide variety of lengths to exercise scalar, Vector128, and Vector256 /// code paths as well as their transitions and tail handling. /// [Theory] From e379f9f19977ee3667ae01eb8800292c6a3f7568 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 4 Mar 2026 19:33:57 -0800 Subject: [PATCH 3/6] tidying --- .../src/System/IO/Hashing/Adler32Simd.cs | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs index 4fd61b5c8d6fed..69b142c3d6d786 100644 --- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs @@ -84,14 +84,13 @@ private static uint UpdateCore(uint adl bufRef = ref Unsafe.Add(ref bufRef, loopVectors * (uint)Vector128.Count); Vector128 weights = Vector128.CreateSequence((byte)16, unchecked((byte)-1)); - Vector128 vps; if ((vectors & 1) != 0) { Vector128 bytes = Vector128.LoadUnsafe(ref bufRef); bufRef = ref Unsafe.Add(ref bufRef, (uint)Vector128.Count); - vps = vs1; + Vector128 vps = vs1; vs1 = TAccumulate.Accumulate(vs1, bytes); vs2 = TDotProduct.DotProduct(vs2, bytes, weights); @@ -106,7 +105,7 @@ private static uint UpdateCore(uint adl Vector128 bytes = Vector128.LoadUnsafe(ref Unsafe.Subtract(ref bufRef, (uint)Vector128.Count - len)); bytes &= Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(MaskBytes), len); - vps = vs1; + Vector128 vps = vs1; vs1 = TAccumulate.Accumulate(vs1, bytes); vs2 = TDotProduct.DotProduct(vs2, bytes, weights); @@ -170,14 +169,13 @@ public static (Vector128 vs1, Vector128 vs2) VectorLoop vs1, Vector128 vs2) VectorLoop Accumulate(Vector128 sums, Vector128 b => AdvSimd.Arm64.AddAcrossWidening(bytes).AsUInt32().ToVector128Unsafe() + sums; public static Vector256 Accumulate(Vector256 sums, Vector256 bytes) - => throw new NotImplementedException(); + => throw new UnreachableException(); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 Accumulate(Vector128 sums, Vector128 bytes1, Vector128 bytes2) => AdvSimd.AddPairwiseWideningAndAdd(sums, AdvSimd.AddPairwiseWideningAndAdd(AdvSimd.AddPairwiseWidening(bytes1), bytes2)); public static Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2) - => throw new NotImplementedException(); + => throw new UnreachableException(); } file struct AccumulateXplat : ISimdAccumulate @@ -315,7 +312,7 @@ public static Vector128 Accumulate(Vector128 sums, Vector128 b } public static Vector256 Accumulate(Vector256 sums, Vector256 bytes) - => throw new NotImplementedException(); + => throw new UnreachableException(); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 Accumulate(Vector128 sums, Vector128 bytes1, Vector128 bytes2) @@ -327,7 +324,7 @@ public static Vector128 Accumulate(Vector128 sums, Vector128 b } public static Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2) - => throw new NotImplementedException(); + => throw new UnreachableException(); } file struct DotProductX86 : ISimdDotProduct @@ -358,7 +355,7 @@ public static Vector128 DotProduct(Vector128 addend, Vector128 } public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) - => throw new NotImplementedException(); + => throw new UnreachableException(); } file struct DotProductArm64Dp : ISimdDotProduct @@ -368,7 +365,7 @@ public static Vector128 DotProduct(Vector128 addend, Vector128 => Dp.DotProduct(addend, left, right); public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) - => throw new NotImplementedException(); + => throw new UnreachableException(); } file struct DotProductXplat : ISimdDotProduct @@ -383,7 +380,7 @@ public static Vector128 DotProduct(Vector128 addend, Vector128 } public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) - => throw new NotImplementedException(); + => throw new UnreachableException(); } file interface ISimdAccumulate From f35db7827522a30f2121b6769b3720cae6a16d54 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 5 Mar 2026 09:28:35 -0800 Subject: [PATCH 4/6] tidying --- .../System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs index 69b142c3d6d786..a654503cd839d7 100644 --- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs @@ -242,12 +242,12 @@ public static (Vector128 vs1, Vector128 vs2) VectorLoop bytes = Vector256.LoadUnsafe(ref sourceRef); sourceRef = ref Unsafe.Add(ref sourceRef, (uint)Vector256.Count); - Vector256 w_ps = ws1; + Vector256 wps = ws1; ws1 = TAccumulate.Accumulate(ws1, bytes); ws2 = TDotProduct.DotProduct(ws2, bytes, weights2); - ws2 += w_ps << 5; + ws2 += wps << 5; } vs1 = ws1.GetLower() + ws1.GetUpper(); From d4110329973d16749fa1b4dd75c1150463862215 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Fri, 6 Mar 2026 10:47:23 -0800 Subject: [PATCH 5/6] more tidying, add more length asserts --- .../src/System.IO.Hashing.csproj | 4 +- .../{Adler32Simd.cs => Adler32.Vectorized.cs} | 114 ++++++++---------- 2 files changed, 49 insertions(+), 69 deletions(-) rename src/libraries/System.IO.Hashing/src/System/IO/Hashing/{Adler32Simd.cs => Adler32.Vectorized.cs} (86%) diff --git a/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj b/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj index 29cc99349682d0..81d97845da26b6 100644 --- a/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj +++ b/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj @@ -1,4 +1,4 @@ - + $(NetCoreAppCurrent);$(NetCoreAppPrevious);$(NetCoreAppMinimum);netstandard2.0;$(NetFrameworkMinimum) @@ -35,7 +35,7 @@ System.IO.Hashing.XxHash32 - + diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs similarity index 86% rename from src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs rename to src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs index a654503cd839d7..119d27fcd9b1cc 100644 --- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32Simd.cs +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs @@ -67,15 +67,16 @@ private static uint UpdateCore(uint adl { Debug.Assert(source.Length >= Vector128.Count); - uint s1 = (ushort)adler; - uint s2 = adler >>> 16; - ref byte bufRef = ref MemoryMarshal.GetReference(source); - uint len = (uint)source.Length; + uint totalLength = (uint)source.Length; + uint totalVectors = totalLength / (uint)Vector128.Count; - uint vectors = len / (uint)Vector128.Count; - uint loopVectors = vectors & ~1u; - len -= vectors * (uint)Vector128.Count; + uint loopVectors = totalVectors & ~1u; + uint tailVectors = totalVectors - loopVectors; + uint tailLength = totalLength - totalVectors * (uint)Vector128.Count; + + uint s1 = (ushort)adler; + uint s2 = adler >>> 16; Vector128 vs1 = Vector128.CreateScalar(s1); Vector128 vs2 = Vector128.CreateScalar(s2); @@ -85,8 +86,10 @@ private static uint UpdateCore(uint adl Vector128 weights = Vector128.CreateSequence((byte)16, unchecked((byte)-1)); - if ((vectors & 1) != 0) + if (tailVectors != 0) { + Debug.Assert(tailVectors == 1); + Vector128 bytes = Vector128.LoadUnsafe(ref bufRef); bufRef = ref Unsafe.Add(ref bufRef, (uint)Vector128.Count); @@ -98,19 +101,19 @@ private static uint UpdateCore(uint adl vs2 += vps << 4; } - if (len != 0) + if (tailLength != 0) { - Debug.Assert(len < (uint)Vector128.Count); + Debug.Assert(tailLength < (uint)Vector128.Count); - Vector128 bytes = Vector128.LoadUnsafe(ref Unsafe.Subtract(ref bufRef, (uint)Vector128.Count - len)); - bytes &= Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(MaskBytes), len); + Vector128 bytes = Vector128.LoadUnsafe(ref Unsafe.Subtract(ref bufRef, (uint)Vector128.Count - tailLength)); + bytes &= Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(MaskBytes), tailLength); Vector128 vps = vs1; vs1 = TAccumulate.Accumulate(vs1, bytes); vs2 = TDotProduct.DotProduct(vs2, bytes, weights); - vs2 += vps * Vector128.Create(len); + vs2 += vps * Vector128.Create(tailLength); } s1 = Vector128.Sum(vs1) % Adler32.ModBase; @@ -147,6 +150,8 @@ public static (Vector128 vs1, Vector128 vs2) VectorLoop weights1 = Vector128.CreateSequence((byte)32, unchecked((byte)-1)); @@ -195,11 +200,32 @@ public static Vector256 QuickModBase(Vector256 values) return (vhi << 4) - vhi + vlo; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 Accumulate(Vector256 sums, Vector256 bytes) + => Avx2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2) + { + Vector256 zero = default; + Vector256 sad = Avx2.SumAbsoluteDifferences(bytes1, zero).AsUInt32(); + return sad + Avx2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) + { + Vector256 mad = Avx2.MultiplyAddAdjacent(left, right.AsSByte()); + return Avx2.MultiplyAddAdjacent(mad, Vector256.One).AsUInt32() + addend; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static (Vector128 vs1, Vector128 vs2) VectorLoop(Vector128 vs1, Vector128 vs2, ref byte sourceRef, uint vectors) where TAccumulate : struct, ISimdAccumulate where TDotProduct : struct, ISimdDotProduct { + Debug.Assert(uint.IsEvenInteger(vectors)); + const uint blockSize = 4; Vector256 weights1 = Vector256.CreateSequence((byte)64, unchecked((byte)-1)); @@ -224,9 +250,9 @@ public static (Vector128 vs1, Vector128 vs2) VectorLoop vs1, Vector128 vs2) VectorLoop bytes = Vector256.LoadUnsafe(ref sourceRef); - sourceRef = ref Unsafe.Add(ref sourceRef, (uint)Vector256.Count); + Debug.Assert(vectors == 2); + Vector256 bytes = Vector256.LoadUnsafe(ref sourceRef); Vector256 wps = ws1; - ws1 = TAccumulate.Accumulate(ws1, bytes); - ws2 = TDotProduct.DotProduct(ws2, bytes, weights2); + ws1 = Accumulate(ws1, bytes); + ws2 = DotProduct(ws2, bytes, weights2); ws2 += wps << 5; } @@ -263,10 +289,6 @@ public static (Vector128 vs1, Vector128 vs2) VectorLoop Accumulate(Vector128 sums, Vector128 bytes) => Sse2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums; - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 Accumulate(Vector256 sums, Vector256 bytes) - => Avx2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums; - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 Accumulate(Vector128 sums, Vector128 bytes1, Vector128 bytes2) { @@ -274,14 +296,6 @@ public static Vector128 Accumulate(Vector128 sums, Vector128 b Vector128 sad = Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32(); return sad + Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums; } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2) - { - Vector256 zero = default; - Vector256 sad = Avx2.SumAbsoluteDifferences(bytes1, zero).AsUInt32(); - return sad + Avx2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums; - } } file struct AccumulateArm64 : ISimdAccumulate @@ -290,15 +304,9 @@ public static Vector256 Accumulate(Vector256 sums, Vector256 b public static Vector128 Accumulate(Vector128 sums, Vector128 bytes) => AdvSimd.Arm64.AddAcrossWidening(bytes).AsUInt32().ToVector128Unsafe() + sums; - public static Vector256 Accumulate(Vector256 sums, Vector256 bytes) - => throw new UnreachableException(); - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 Accumulate(Vector128 sums, Vector128 bytes1, Vector128 bytes2) => AdvSimd.AddPairwiseWideningAndAdd(sums, AdvSimd.AddPairwiseWideningAndAdd(AdvSimd.AddPairwiseWidening(bytes1), bytes2)); - - public static Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2) - => throw new UnreachableException(); } file struct AccumulateXplat : ISimdAccumulate @@ -311,9 +319,6 @@ public static Vector128 Accumulate(Vector128 sums, Vector128 b return sums + sl + sh; } - public static Vector256 Accumulate(Vector256 sums, Vector256 bytes) - => throw new UnreachableException(); - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 Accumulate(Vector128 sums, Vector128 bytes1, Vector128 bytes2) { @@ -322,9 +327,6 @@ public static Vector128 Accumulate(Vector128 sums, Vector128 b (Vector128 sl, Vector128 sh) = Vector128.Widen(b1l + b1h + b2l + b2h); return sums + sl + sh; } - - public static Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2) - => throw new UnreachableException(); } file struct DotProductX86 : ISimdDotProduct @@ -335,13 +337,6 @@ public static Vector128 DotProduct(Vector128 addend, Vector128 Vector128 mad = Ssse3.MultiplyAddAdjacent(left, right.AsSByte()); return Sse2.MultiplyAddAdjacent(mad, Vector128.One).AsUInt32() + addend; } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) - { - Vector256 mad = Avx2.MultiplyAddAdjacent(left, right.AsSByte()); - return Avx2.MultiplyAddAdjacent(mad, Vector256.One).AsUInt32() + addend; - } } file struct DotProductArm64 : ISimdDotProduct @@ -353,9 +348,6 @@ public static Vector128 DotProduct(Vector128 addend, Vector128 mad = AdvSimd.MultiplyWideningUpperAndAdd(mad, left, right); return AdvSimd.AddPairwiseWideningAndAdd(addend, mad); } - - public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) - => throw new UnreachableException(); } file struct DotProductArm64Dp : ISimdDotProduct @@ -363,9 +355,6 @@ public static Vector256 DotProduct(Vector256 addend, Vector256 [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 DotProduct(Vector128 addend, Vector128 left, Vector128 right) => Dp.DotProduct(addend, left, right); - - public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) - => throw new UnreachableException(); } file struct DotProductXplat : ISimdDotProduct @@ -378,27 +367,18 @@ public static Vector128 DotProduct(Vector128 addend, Vector128 (Vector128 ml, Vector128 mh) = Vector128.Widen(ll * rl + lh * rh); return addend + ml + mh; } - - public static Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right) - => throw new UnreachableException(); } file interface ISimdAccumulate { static abstract Vector128 Accumulate(Vector128 sums, Vector128 bytes); - static abstract Vector256 Accumulate(Vector256 sums, Vector256 bytes); - static abstract Vector128 Accumulate(Vector128 sums, Vector128 bytes1, Vector128 bytes2); - - static abstract Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2); } file interface ISimdDotProduct { static abstract Vector128 DotProduct(Vector128 addend, Vector128 left, Vector128 right); - - static abstract Vector256 DotProduct(Vector256 addend, Vector256 left, Vector256 right); } file interface ISimdStrategy From 2a9af33d3d0b077f1dc8299ea869d39f07a6f92c Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Mon, 9 Mar 2026 17:13:56 -0700 Subject: [PATCH 6/6] address feedback --- .../System/IO/Hashing/Adler32.Vectorized.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs index 119d27fcd9b1cc..88d064912d9d41 100644 --- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Adler32.Vectorized.cs @@ -138,11 +138,11 @@ private static Vector128 QuickModBase(Vector128 values) // the reduction, resulting in a maximum possible residual of 0xFFFF0. // // This is further optimized to: `high * 16 - high + low` - // and implemented as: `(high << 4) - high + low`. + // and implemented as: `(high << 4) + (low - high)`. Vector128 vlo = values & (Vector128.AllBitsSet >>> 16); Vector128 vhi = values >>> 16; - return (vhi << 4) - vhi + vlo; + return (vhi << 4) + (vlo - vhi); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -159,8 +159,8 @@ public static (Vector128 vs1, Vector128 vs2) VectorLoop= blockSize) { - Vector128 vs3 = default; - Vector128 vps = default; + Vector128 vs3 = Vector128.Zero; + Vector128 vps = Vector128.Zero; uint blocks = uint.Min(vectors, Adler32Simd.VMax) / blockSize; vectors -= blocks * blockSize; @@ -197,17 +197,17 @@ public static Vector256 QuickModBase(Vector256 values) { Vector256 vlo = values & (Vector256.AllBitsSet >>> 16); Vector256 vhi = values >>> 16; - return (vhi << 4) - vhi + vlo; + return (vhi << 4) + (vlo - vhi); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 Accumulate(Vector256 sums, Vector256 bytes) - => Avx2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums; + => Avx2.SumAbsoluteDifferences(bytes, Vector256.Zero).AsUInt32() + sums; [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 Accumulate(Vector256 sums, Vector256 bytes1, Vector256 bytes2) { - Vector256 zero = default; + Vector256 zero = Vector256.Zero; Vector256 sad = Avx2.SumAbsoluteDifferences(bytes1, zero).AsUInt32(); return sad + Avx2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums; } @@ -236,8 +236,8 @@ public static (Vector128 vs1, Vector128 vs2) VectorLoop= blockSize) { - Vector256 ws3 = default; - Vector256 wps = default; + Vector256 ws3 = Vector256.Zero; + Vector256 wps = Vector256.Zero; uint blocks = uint.Min(vectors, Adler32Simd.VMax) / blockSize; vectors -= blocks * blockSize; @@ -287,12 +287,12 @@ public static (Vector128 vs1, Vector128 vs2) VectorLoop Accumulate(Vector128 sums, Vector128 bytes) - => Sse2.SumAbsoluteDifferences(bytes, default).AsUInt32() + sums; + => Sse2.SumAbsoluteDifferences(bytes, Vector128.Zero).AsUInt32() + sums; [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 Accumulate(Vector128 sums, Vector128 bytes1, Vector128 bytes2) { - Vector128 zero = default; + Vector128 zero = Vector128.Zero; Vector128 sad = Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32(); return sad + Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32() + sums; } @@ -324,7 +324,7 @@ public static Vector128 Accumulate(Vector128 sums, Vector128 b { (Vector128 b1l, Vector128 b1h) = Vector128.Widen(bytes1); (Vector128 b2l, Vector128 b2h) = Vector128.Widen(bytes2); - (Vector128 sl, Vector128 sh) = Vector128.Widen(b1l + b1h + b2l + b2h); + (Vector128 sl, Vector128 sh) = Vector128.Widen((b1l + b1h) + (b2l + b2h)); return sums + sl + sh; } }