diff --git a/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj b/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj index 06555e0e92fc41..14970fa6744bf5 100644 --- a/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj +++ b/src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj @@ -37,7 +37,10 @@ System.IO.Hashing.XxHash32 + + + diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32ParameterSet.Table.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32ParameterSet.Table.cs index ad5970a1fddf23..db156dde0054e0 100644 --- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32ParameterSet.Table.cs +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32ParameterSet.Table.cs @@ -2,6 +2,9 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Diagnostics; +#if NET +using System.Runtime.Intrinsics; +#endif namespace System.IO.Hashing { @@ -59,18 +62,33 @@ private static uint[] GenerateLookupTable(uint polynomial, bool reflectInput) return table; } - private sealed class ReflectedTableBasedCrc32 : Crc32ParameterSet + private sealed partial class ReflectedTableBasedCrc32 : Crc32ParameterSet { private readonly uint[] _lookupTable; + partial void InitializeVectorized(); + internal ReflectedTableBasedCrc32(uint polynomial, uint initialValue, uint finalXorValue) : base(polynomial, initialValue, finalXorValue, reflectValues: true) { _lookupTable = GenerateLookupTable(polynomial, reflectInput: true); + InitializeVectorized(); } internal override uint Update(uint value, ReadOnlySpan source) { +#if NET + if (_canVectorize && source.Length >= Vector128.Count) + { + return UpdateVectorized(value, source); + } +#endif + + return UpdateScalar(value, source); + } + + private uint UpdateScalar(uint value, ReadOnlySpan source) + { uint[] lookupTable = _lookupTable; uint crc = value; @@ -86,18 +104,33 @@ internal override uint Update(uint value, ReadOnlySpan source) } } - 
private sealed class ForwardTableBasedCrc32 : Crc32ParameterSet + private sealed partial class ForwardTableBasedCrc32 : Crc32ParameterSet { private readonly uint[] _lookupTable; + partial void InitializeVectorized(); + internal ForwardTableBasedCrc32(uint polynomial, uint initialValue, uint finalXorValue) : base(polynomial, initialValue, finalXorValue, reflectValues: false) { _lookupTable = GenerateLookupTable(polynomial, reflectInput: false); + InitializeVectorized(); } internal override uint Update(uint value, ReadOnlySpan source) { +#if NET + if (_canVectorize && source.Length >= Vector128.Count) + { + return UpdateVectorized(value, source); + } +#endif + + return UpdateScalar(value, source); + } + + private uint UpdateScalar(uint value, ReadOnlySpan source) + { uint[] lookupTable = _lookupTable; uint crc = value; diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32ParameterSet.Vectorized.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32ParameterSet.Vectorized.cs new file mode 100644 index 00000000000000..b38bc0a93905f1 --- /dev/null +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32ParameterSet.Vectorized.cs @@ -0,0 +1,277 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#if NET + +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using static System.IO.Hashing.VectorHelper; + +namespace System.IO.Hashing +{ + public partial class Crc32ParameterSet + { + private partial class ReflectedTableBasedCrc32 + { + // Precomputed constants for PCLMULQDQ-based folding. 
+ private bool _canVectorize; + private ulong _k1, _k2; // 4-way fold constants + private ulong _k3, _k4; // 1-way fold constants + private ulong _k5; // 128-to-64 fold constant + private ulong _pStar, _mu; // Barrett reduction constants + + partial void InitializeVectorized() + { + if (!BitConverter.IsLittleEndian || !VectorHelper.IsSupported) + return; + + ulong polynomial = Polynomial; + CrcPolynomialHelper.UInt640 fullPoly = new((1UL << 32) | polynomial); + int polyDeg = 32; + + // Reflected folding constants: reverse_bits(x^power mod fullPoly, polyDeg+1) + _k1 = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 4 * 128 + polyDeg), polyDeg + 1); + _k2 = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 4 * 128 - polyDeg), polyDeg + 1); + _k3 = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 128 + polyDeg), polyDeg + 1); + _k4 = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 128 - polyDeg), polyDeg + 1); + _k5 = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 2 * polyDeg), polyDeg + 1); + + // Barrett reduction constants + _pStar = CrcPolynomialHelper.ReverseBits((1UL << polyDeg) | polynomial, polyDeg + 1); + _mu = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeBarrettConstant(fullPoly, 2 * polyDeg), polyDeg + 1); + + _canVectorize = true; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private uint UpdateVectorized(uint crc, ReadOnlySpan source) + { + Debug.Assert(_canVectorize); + Debug.Assert(source.Length >= Vector128.Count); + + ref byte srcRef = ref MemoryMarshal.GetReference(source); + int length = source.Length; + + Vector128 kConstants; + Vector128 x1; + Vector128 x2; + + if (length >= Vector128.Count * 8) + { + x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64(); + x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64(); + Vector128 x3 = 
Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64(); + Vector128 x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64(); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count * 4); + length -= Vector128.Count * 4; + + x1 ^= Vector128.CreateScalar(crc).AsUInt64(); + + kConstants = Vector128.Create(_k1, _k2); + + do + { + Vector128 y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64(); + Vector128 y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64(); + Vector128 y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64(); + Vector128 y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64(); + + x1 = FoldPolynomialPair(y5, x1, kConstants); + x2 = FoldPolynomialPair(y6, x2, kConstants); + x3 = FoldPolynomialPair(y7, x3, kConstants); + x4 = FoldPolynomialPair(y8, x4, kConstants); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count * 4); + length -= Vector128.Count * 4; + } while (length >= Vector128.Count * 4); + + kConstants = Vector128.Create(_k3, _k4); + x1 = FoldPolynomialPair(x2, x1, kConstants); + x1 = FoldPolynomialPair(x3, x1, kConstants); + x1 = FoldPolynomialPair(x4, x1, kConstants); + } + else + { + Debug.Assert(length >= 16); + + x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64(); + x1 ^= Vector128.CreateScalar(crc).AsUInt64(); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count); + length -= Vector128.Count; + } + + kConstants = Vector128.Create(_k3, _k4); + + while (length >= Vector128.Count) + { + x1 = FoldPolynomialPair(Vector128.LoadUnsafe(ref srcRef).AsUInt64(), x1, kConstants); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count); + length -= Vector128.Count; + } + + // Fold 128 bits to 64 bits. + Vector128 bitmask = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); + x1 = ShiftRightBytesInVector(x1, 8) ^ + CarrylessMultiplyLower(x1, Vector128.CreateScalar(_k4)); + x1 = CarrylessMultiplyLower(x1 & bitmask, Vector128.CreateScalar(_k5)) ^ + ShiftRightBytesInVector(x1, 4); + + // Reduce to 32 bits via Barrett reduction. 
+ kConstants = Vector128.Create(_pStar, _mu); + x2 = CarrylessMultiplyLeftLowerRightUpper(x1 & bitmask, kConstants) & bitmask; + x2 = CarrylessMultiplyLower(x2, kConstants); + x1 ^= x2; + + uint result = x1.AsUInt32().GetElement(1); + return length > 0 + ? UpdateScalar(result, MemoryMarshal.CreateReadOnlySpan(ref srcRef, length)) + : result; + } + } + + private partial class ForwardTableBasedCrc32 + { + // Precomputed constants for PCLMULQDQ-based folding. + private bool _canVectorize; + private ulong _k1, _k2; // 4-way fold constants + private ulong _k3, _k4; // 1-way fold constants + private ulong _k5; // 128-to-64 fold constant + private ulong _poly, _mu; // Barrett reduction constants + + partial void InitializeVectorized() + { + if (!VectorHelper.IsSupported) + return; + + ulong polynomial = Polynomial; + CrcPolynomialHelper.UInt640 fullPoly = new((1UL << 32) | polynomial); + + // Forward folding constants: x^power mod fullPoly + _k1 = CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 4 * 128); + _k2 = CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 4 * 128 + 64); + _k3 = CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 128); + _k4 = CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 128 + 64); + _k5 = CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 128); + + // Barrett reduction constants + _poly = polynomial; + _mu = CrcPolynomialHelper.ComputeBarrettConstant(fullPoly, 2 * 32) & 0xFFFFFFFF; + + _canVectorize = true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 LoadFromSourceByteSwapped(ref byte source, nuint elementOffset) + { + Vector128 vector = Vector128.LoadUnsafe(ref source, elementOffset); + + if (BitConverter.IsLittleEndian) + { + vector = Vector128.Shuffle(vector, + Vector128.Create((byte)0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, + 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00)); + } + + return vector.AsUInt64(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private uint 
UpdateVectorized(uint crc, ReadOnlySpan source) + { + Debug.Assert(_canVectorize); + Debug.Assert(source.Length >= Vector128.Count); + + ref byte srcRef = ref MemoryMarshal.GetReference(source); + int length = source.Length; + + Vector128 x7; + Vector128 kConstants; + + if (length >= Vector128.Count * 8) + { + Vector128 x0 = LoadFromSourceByteSwapped(ref srcRef, 0); + Vector128 x1 = LoadFromSourceByteSwapped(ref srcRef, 16); + Vector128 x2 = LoadFromSourceByteSwapped(ref srcRef, 32); + x7 = LoadFromSourceByteSwapped(ref srcRef, 48); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count * 4); + length -= Vector128.Count * 4; + + x0 ^= ShiftLowerToUpper(Vector128.CreateScalar((ulong)crc)); + + kConstants = Vector128.Create(_k1, _k2); + + do + { + Vector128 y1 = LoadFromSourceByteSwapped(ref srcRef, 0); + Vector128 y2 = LoadFromSourceByteSwapped(ref srcRef, 16); + Vector128 y3 = LoadFromSourceByteSwapped(ref srcRef, 32); + Vector128 y4 = LoadFromSourceByteSwapped(ref srcRef, 48); + + x0 = FoldPolynomialPair(y1, x0, kConstants); + x1 = FoldPolynomialPair(y2, x1, kConstants); + x2 = FoldPolynomialPair(y3, x2, kConstants); + x7 = FoldPolynomialPair(y4, x7, kConstants); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count * 4); + length -= Vector128.Count * 4; + } while (length >= Vector128.Count * 4); + + kConstants = Vector128.Create(_k3, _k4); + x7 = FoldPolynomialPair(x7, x0, kConstants); + x7 = FoldPolynomialPair(x7, x1, kConstants); + x7 = FoldPolynomialPair(x7, x2, kConstants); + } + else + { + Debug.Assert(length >= 16); + + x7 = LoadFromSourceByteSwapped(ref srcRef, 0); + x7 ^= ShiftLowerToUpper(Vector128.CreateScalar((ulong)crc)); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count); + length -= Vector128.Count; + } + + kConstants = Vector128.Create(_k3, _k4); + + while (length >= Vector128.Count) + { + x7 = FoldPolynomialPair(LoadFromSourceByteSwapped(ref srcRef, 0), x7, kConstants); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count); + length -= 
Vector128.Count; + } + + // Compute CRC of a 128-bit value and fold to the upper 64-bits. + x7 = CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(_k5)) ^ + ShiftLowerToUpper(x7); + + // Barrett reduction. + kConstants = Vector128.Create(_mu, _poly); + Vector128 temp = x7; + x7 = CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL)); + x7 = CarrylessMultiplyUpper(x7, kConstants); + x7 ^= temp; + + uint result = (uint)x7.GetElement(0); + return length > 0 + ? UpdateScalar(result, MemoryMarshal.CreateReadOnlySpan(ref srcRef, length)) + : result; + } + } + } +} + +#endif diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64ParameterSet.Table.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64ParameterSet.Table.cs index 820fc8093fbdc9..d3025ae85808f6 100644 --- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64ParameterSet.Table.cs +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64ParameterSet.Table.cs @@ -2,6 +2,9 @@ // The .NET Foundation licenses this file to you under the MIT license. 
using System.Diagnostics; +#if NET +using System.Runtime.Intrinsics; +#endif namespace System.IO.Hashing { @@ -59,18 +62,33 @@ private static ulong[] GenerateLookupTable(ulong polynomial, bool reflectInput) return table; } - private sealed class ReflectedTableBasedCrc64 : Crc64ParameterSet + private sealed partial class ReflectedTableBasedCrc64 : Crc64ParameterSet { private readonly ulong[] _lookupTable; + partial void InitializeVectorized(); + internal ReflectedTableBasedCrc64(ulong polynomial, ulong initialValue, ulong finalXorValue) : base(polynomial, initialValue, finalXorValue, reflectValues: true) { _lookupTable = GenerateLookupTable(polynomial, reflectInput: true); + InitializeVectorized(); } internal override ulong Update(ulong value, ReadOnlySpan data) { +#if NET + if (_canVectorize && data.Length >= Vector128.Count) + { + return UpdateVectorized(value, data); + } +#endif + + return UpdateScalar(value, data); + } + + private ulong UpdateScalar(ulong value, ReadOnlySpan data) + { ulong[] lookupTable = _lookupTable; ulong crc = value; @@ -86,18 +104,33 @@ internal override ulong Update(ulong value, ReadOnlySpan data) } } - private sealed class ForwardTableBasedCrc64 : Crc64ParameterSet + private sealed partial class ForwardTableBasedCrc64 : Crc64ParameterSet { private readonly ulong[] _lookupTable; + partial void InitializeVectorized(); + internal ForwardTableBasedCrc64(ulong polynomial, ulong initialValue, ulong finalXorValue) : base(polynomial, initialValue, finalXorValue, reflectValues: false) { _lookupTable = GenerateLookupTable(polynomial, reflectInput: false); + InitializeVectorized(); } internal override ulong Update(ulong value, ReadOnlySpan data) { +#if NET + if (_canVectorize && data.Length >= Vector128.Count) + { + return UpdateVectorized(value, data); + } +#endif + + return UpdateScalar(value, data); + } + + private ulong UpdateScalar(ulong value, ReadOnlySpan data) + { ulong[] lookupTable = _lookupTable; ulong crc = value; diff --git 
a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64ParameterSet.Vectorized.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64ParameterSet.Vectorized.cs new file mode 100644 index 00000000000000..5b34cd59cb06e7 --- /dev/null +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64ParameterSet.Vectorized.cs @@ -0,0 +1,293 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#if NET + +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using static System.IO.Hashing.VectorHelper; + +namespace System.IO.Hashing +{ + public partial class Crc64ParameterSet + { + private partial class ReflectedTableBasedCrc64 + { + // Precomputed constants for PCLMULQDQ-based folding. + private bool _canVectorize; + private ulong _k1, _k2; // 4-way fold constants + private ulong _k3, _k4; // 1-way fold constants + private ulong _k5; // 128-to-64 fold constant + private ulong _pStar, _mu; // Barrett reduction constants + + partial void InitializeVectorized() + { + if (!BitConverter.IsLittleEndian || !VectorHelper.IsSupported) + return; + + ulong polynomial = Polynomial; + int polyDeg = 64; + + // Build 65-bit full polynomial: x^64 + polynomial + CrcPolynomialHelper.UInt640 fullPoly = new(polynomial); + { + CrcPolynomialHelper.UInt640 highBit = new(1); + highBit.ShiftLeftEquals(polyDeg); + fullPoly.XorEquals(ref highBit); + } + + // Reflected folding constants: reverse_bits(x^power mod fullPoly, polyDeg+1) + _k1 = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 4 * 128 + polyDeg), polyDeg + 1); + _k2 = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 4 * 128 - polyDeg), polyDeg + 1); + _k3 = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 128 + polyDeg), polyDeg + 
1); + _k4 = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 128 - polyDeg), polyDeg + 1); + _k5 = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 2 * polyDeg), polyDeg + 1); + + // Barrett reduction constants + // P* = reverse_bits(fullPoly, polyDeg+1) + // For 64-bit CRC: fullPoly = x^64 + polynomial (65 bits). + // reverse_bits(polynomial, 65) where bit 64 maps to bit 0: + // lower 64 bits of the reversed value, plus the leading 1 maps to bit 0. + _pStar = CrcPolynomialHelper.ReverseBits(polynomial, polyDeg) | 1; + _mu = CrcPolynomialHelper.ReverseBits( + CrcPolynomialHelper.ComputeBarrettConstant(fullPoly, 2 * polyDeg), polyDeg + 1); + + _canVectorize = true; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private ulong UpdateVectorized(ulong crc, ReadOnlySpan data) + { + Debug.Assert(_canVectorize); + Debug.Assert(data.Length >= Vector128.Count); + + ref byte srcRef = ref MemoryMarshal.GetReference(data); + int length = data.Length; + + Vector128 kConstants; + Vector128 x1; + Vector128 x2; + + if (length >= Vector128.Count * 8) + { + x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64(); + x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64(); + Vector128 x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64(); + Vector128 x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64(); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count * 4); + length -= Vector128.Count * 4; + + x1 ^= Vector128.CreateScalar(crc); + + kConstants = Vector128.Create(_k1, _k2); + + do + { + Vector128 y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64(); + Vector128 y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64(); + Vector128 y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64(); + Vector128 y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64(); + + x1 = FoldPolynomialPair(y5, x1, kConstants); + x2 = FoldPolynomialPair(y6, x2, kConstants); + x3 = FoldPolynomialPair(y7, x3, kConstants); + x4 = FoldPolynomialPair(y8, 
x4, kConstants); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count * 4); + length -= Vector128.Count * 4; + } while (length >= Vector128.Count * 4); + + kConstants = Vector128.Create(_k3, _k4); + x1 = FoldPolynomialPair(x2, x1, kConstants); + x1 = FoldPolynomialPair(x3, x1, kConstants); + x1 = FoldPolynomialPair(x4, x1, kConstants); + } + else + { + Debug.Assert(length >= 16); + + x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64(); + x1 ^= Vector128.CreateScalar(crc); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count); + length -= Vector128.Count; + } + + kConstants = Vector128.Create(_k3, _k4); + + while (length >= Vector128.Count) + { + x1 = FoldPolynomialPair(Vector128.LoadUnsafe(ref srcRef).AsUInt64(), x1, kConstants); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count); + length -= Vector128.Count; + } + + // Fold 128 bits to 64 bits. + x1 = ShiftRightBytesInVector(x1, 8) ^ + CarrylessMultiplyLower(x1, Vector128.CreateScalar(_k4)); + + // Barrett reduction. + kConstants = Vector128.Create(_pStar, _mu); + x2 = CarrylessMultiplyLower(x1, kConstants); + x2 = CarrylessMultiplyLeftLowerRightUpper(x2, kConstants); + x1 ^= x2; + + ulong result = x1.GetElement(0); + return length > 0 + ? UpdateScalar(result, MemoryMarshal.CreateReadOnlySpan(ref srcRef, length)) + : result; + } + } + + private partial class ForwardTableBasedCrc64 + { + // Precomputed constants for PCLMULQDQ-based folding. 
+            private bool _canVectorize;
+            private ulong _k1, _k2; // 4-way fold constants
+            private ulong _k3, _k4; // 1-way fold constants
+            private ulong _k5; // 128-to-64 fold constant
+            private ulong _poly, _mu; // Barrett reduction constants
+
+            partial void InitializeVectorized() // precomputes the constants below; _canVectorize stays false when VectorHelper.IsSupported is false
+            {
+                if (!VectorHelper.IsSupported)
+                    return;
+
+                ulong polynomial = Polynomial;
+                int polyDeg = 64;
+
+                // Build 65-bit full polynomial: x^64 + polynomial
+                CrcPolynomialHelper.UInt640 fullPoly = new(polynomial);
+                {
+                    CrcPolynomialHelper.UInt640 highBit = new(1);
+                    highBit.ShiftLeftEquals(polyDeg);
+                    fullPoly.XorEquals(ref highBit);
+                }
+
+                // Forward folding constants: x^power mod fullPoly
+                _k1 = CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 4 * 128);
+                _k2 = CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 4 * 128 + 64);
+                _k3 = CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 128);
+                _k4 = CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 128 + 64);
+                _k5 = CrcPolynomialHelper.ComputeFoldingConstant(fullPoly, 128); // same power as _k3
+
+                // Barrett reduction constants
+                _poly = polynomial;
+                _mu = CrcPolynomialHelper.ComputeBarrettConstant(fullPoly, 2 * polyDeg); // no mask: the ulong result is already the low 64 bits, and (1UL << 64) - 1 evaluates to 0 in C# (shift counts are taken mod 64), which would clear _mu
+
+                _canVectorize = true;
+            }
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static Vector128 LoadFromSourceByteSwapped(ref byte source, nuint elementOffset) // loads 16 bytes at source + elementOffset; byte order is reversed on little-endian hosts
+            {
+                Vector128 vector = Vector128.LoadUnsafe(ref source, elementOffset);
+
+                if (BitConverter.IsLittleEndian)
+                {
+                    vector = Vector128.Shuffle(vector,
+                        Vector128.Create((byte)0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
+                            0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00));
+                }
+
+                return vector.AsUInt64();
+            }
+
+            [MethodImpl(MethodImplOptions.NoInlining)]
+            private ulong UpdateVectorized(ulong crc, ReadOnlySpan data)
+            {
+                Debug.Assert(_canVectorize);
+                Debug.Assert(data.Length >= Vector128.Count);
+
+                ref byte srcRef = ref MemoryMarshal.GetReference(data);
+                int length = data.Length;
+
+                Vector128 x7;
+                Vector128 kConstants;
+
+ if (length >= Vector128.Count * 8) + { + Vector128 x0 = LoadFromSourceByteSwapped(ref srcRef, 0); + Vector128 x1 = LoadFromSourceByteSwapped(ref srcRef, 16); + Vector128 x2 = LoadFromSourceByteSwapped(ref srcRef, 32); + x7 = LoadFromSourceByteSwapped(ref srcRef, 48); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count * 4); + length -= Vector128.Count * 4; + + x0 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc)); + + kConstants = Vector128.Create(_k1, _k2); + + do + { + Vector128 y1 = LoadFromSourceByteSwapped(ref srcRef, 0); + Vector128 y2 = LoadFromSourceByteSwapped(ref srcRef, 16); + Vector128 y3 = LoadFromSourceByteSwapped(ref srcRef, 32); + Vector128 y4 = LoadFromSourceByteSwapped(ref srcRef, 48); + + x0 = FoldPolynomialPair(y1, x0, kConstants); + x1 = FoldPolynomialPair(y2, x1, kConstants); + x2 = FoldPolynomialPair(y3, x2, kConstants); + x7 = FoldPolynomialPair(y4, x7, kConstants); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count * 4); + length -= Vector128.Count * 4; + } while (length >= Vector128.Count * 4); + + kConstants = Vector128.Create(_k3, _k4); + x7 = FoldPolynomialPair(x7, x0, kConstants); + x7 = FoldPolynomialPair(x7, x1, kConstants); + x7 = FoldPolynomialPair(x7, x2, kConstants); + } + else + { + Debug.Assert(length >= 16); + + x7 = LoadFromSourceByteSwapped(ref srcRef, 0); + x7 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc)); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count); + length -= Vector128.Count; + } + + kConstants = Vector128.Create(_k3, _k4); + + while (length >= Vector128.Count) + { + x7 = FoldPolynomialPair(LoadFromSourceByteSwapped(ref srcRef, 0), x7, kConstants); + + srcRef = ref Unsafe.Add(ref srcRef, Vector128.Count); + length -= Vector128.Count; + } + + // Compute CRC of a 128-bit value and fold to the upper 64-bits. + x7 = CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(_k5)) ^ + ShiftLowerToUpper(x7); + + // Barrett reduction. 
+ kConstants = Vector128.Create(_mu, _poly); + Vector128 temp = x7; + x7 = CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL)); + x7 = CarrylessMultiplyUpper(x7, kConstants); + x7 ^= temp; + + ulong result = x7.GetElement(0); + return length > 0 + ? UpdateScalar(result, MemoryMarshal.CreateReadOnlySpan(ref srcRef, length)) + : result; + } + } + } +} + +#endif diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/CrcPolynomialHelper.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/CrcPolynomialHelper.cs new file mode 100644 index 00000000000000..a88b901d75ce72 --- /dev/null +++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/CrcPolynomialHelper.cs @@ -0,0 +1,173 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#if NET + +using System.Buffers.Binary; +using System.Runtime.CompilerServices; + +namespace System.IO.Hashing +{ + /// + /// Provides GF(2) polynomial arithmetic for computing CRC folding constants. + /// + internal static class CrcPolynomialHelper + { + /// + /// Computes x^ mod in GF(2). + /// + /// The polynomial (with leading bit) to reduce by. + /// The power of x. + /// The remainder, which has fewer bits than . + internal static ulong ComputeFoldingConstant(UInt640 poly, int power) + { + int polyDeg = poly.Degree; + + UInt640 value = new(1); + value.ShiftLeftEquals(power); + + while (value.Degree >= polyDeg) + { + int shift = value.Degree - polyDeg; + UInt640 polyShifted = poly; + polyShifted.ShiftLeftEquals(shift); + value.XorEquals(ref polyShifted); + } + + return value.ToUInt64(); + } + + /// + /// Computes floor(x^ / ) in GF(2). + /// + /// The polynomial (with leading bit) to divide by. + /// The power of x. + /// The quotient. 
+        internal static ulong ComputeBarrettConstant(UInt640 poly, int power)
+        {
+            int polyDeg = poly.Degree;
+
+            UInt640 value = new(1);
+            value.ShiftLeftEquals(power); // value = x^power
+
+            UInt640 quotient = default;
+
+            while (value.Degree >= polyDeg) // GF(2) long division: cancel the leading term, record the quotient bit
+            {
+                int shift = value.Degree - polyDeg;
+                UInt640 polyShifted = poly;
+                polyShifted.ShiftLeftEquals(shift);
+                value.XorEquals(ref polyShifted);
+
+                UInt640 bit = new(1);
+                bit.ShiftLeftEquals(shift);
+                quotient.XorEquals(ref bit);
+            }
+
+            return quotient.ToUInt64(); // only word 0 is returned; quotient bits at positions >= 64 are discarded
+        }
+
+        /// <summary>
+        /// Reverses the lowest <paramref name="width"/> bits of <paramref name="value"/>.
+        /// </summary>
+        internal static ulong ReverseBits(ulong value, int width)
+        {
+            ulong result = 0;
+
+            for (int i = 0; i < width; i++)
+            {
+                if ((value & (1UL << i)) != 0) // NOTE(review): for width 65, i reaches 64 and the shift count wraps to 0 (C# uses the low six bits)
+                {
+                    result |= 1UL << (width - 1 - i); // NOTE(review): for width 65 and i == 0 this is also a shift by 64, i.e. by 0 - confirm the wrap is intended
+                }
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// A 640-bit unsigned integer for GF(2) polynomial arithmetic.
+        /// </summary>
+        [InlineArray(Length)]
+        internal struct UInt640
+        {
+            private const int Length = 10; // ten 64-bit words; index 0 is the least significant
+            private ulong _element;
+
+            internal UInt640(ulong value)
+            {
+                this = default;
+                this[0] = value;
+            }
+
+            internal readonly int Degree // index of the highest set bit, or -1 when every word is zero
+            {
+                get
+                {
+                    for (int i = Length - 1; i >= 0; i--)
+                    {
+                        if (this[i] != 0)
+                        {
+                            return (i * 64) + (63 - BitOperations.LeadingZeroCount(this[i]));
+                        }
+                    }
+
+                    return -1;
+                }
+            }
+
+            internal void ShiftLeftEquals(int count) // NOTE(review): count >= 704 makes wordShift exceed Length, so the zero-fill loop would index out of range
+            {
+                int wordShift = count >> 6; // count / 64
+                int bitShift = count & 63; // count % 64
+
+                if (wordShift > 0)
+                {
+                    for (int i = Length - 1; i >= wordShift; i--) // descending, so a source word is read before it is overwritten
+                    {
+                        this[i] = this[i - wordShift];
+                    }
+
+                    for (int i = wordShift - 1; i >= 0; i--)
+                    {
+                        this[i] = 0;
+                    }
+                }
+
+                if (bitShift > 0)
+                {
+                    for (int i = Length - 1; i > 0; i--)
+                    {
+                        this[i] = (this[i] << bitShift) | (this[i - 1] >> (64 - bitShift)); // carry in the top bits of the next-lower word
+                    }
+
+                    this[0] <<= bitShift;
+                }
+            }
+
+            internal void XorEquals(ref UInt640 other)
+            {
+                for (int i = 0; i < Length; i++)
+                {
+                    this[i] ^= other[i];
+                }
+            }
+
+            internal readonly ulong ToUInt64() => this[0]; // returns the least-significant word only
+        }
+
+        ///
+        /// Polyfill for .
+ /// + private static class BitOperations + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int LeadingZeroCount(ulong value) + { + return System.Numerics.BitOperations.LeadingZeroCount(value); + } + } + } +} + +#endif