From 85037d2fc893145a55e652a7f4a7cd9d42d169f4 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Tue, 21 Feb 2017 05:36:06 +0000 Subject: [PATCH] Faster IndexOfVectorized from https://github.com/dotnet/corefx/pull/16222 --- .../System/Buffers/BufferExtensions.cs | 236 +++++++++++++----- 1 file changed, 168 insertions(+), 68 deletions(-) diff --git a/src/System.Buffers.Experimental/System/Buffers/BufferExtensions.cs b/src/System.Buffers.Experimental/System/Buffers/BufferExtensions.cs index b8f10a949e4..6d5906858f6 100644 --- a/src/System.Buffers.Experimental/System/Buffers/BufferExtensions.cs +++ b/src/System.Buffers.Experimental/System/Buffers/BufferExtensions.cs @@ -122,103 +122,203 @@ internal static int IndexOfStraddling(this ReadOnlySpan first, IReadOnlyMe static readonly int s_longSize = Vector.Count; static readonly int s_byteSize = Vector.Count; - public static int IndexOfVectorized(this Span buffer, byte value) + public unsafe static int IndexOfVectorized(this Span buffer, byte value) { - Debug.Assert(s_longSize == 4 || s_longSize == 2); + var index = -1; + var length = buffer.Length; + if (length == 0) + { + goto exit; + } - var byteSize = s_byteSize; + fixed (byte* pHaystack = &buffer.DangerousGetPinnableReference()) + { + var haystack = pHaystack; + index = 0; - if (buffer.Length < byteSize * 2 || !Vector.IsHardwareAccelerated) return buffer.IndexOf(value); + if (Vector.IsHardwareAccelerated) + { + if (length - Vector.Count >= index) + { + Vector needles = GetVector(value); + do + { + var flaggedMatches = Vector.Equals(Unsafe.Read>(haystack + index), needles); + if (flaggedMatches.Equals(Vector.Zero)) + { + index += Vector.Count; + continue; + } + + index += LocateFirstFoundByte(flaggedMatches); + goto exitFixed; + + } while (length - Vector.Count >= index); + } + } - Vector match = new Vector(value); - var vectors = buffer.NonPortableCast>(); - var zero = Vector.Zero; + while (length - sizeof(ulong) >= index) + { + var flaggedMatches = SetLowBitsForByteMatch(*(ulong*)(haystack + index), value); + if (flaggedMatches == 0) + { + index += sizeof(ulong); + continue; + } - for (int vectorIndex = 0; vectorIndex < vectors.Length; vectorIndex++) - { - var vector = vectors.GetItem(vectorIndex); - var result = Vector.Equals(vector, match); - if (result != zero) + index += LocateFirstFoundByte(flaggedMatches); + goto exitFixed; + } + + for (; index < length; index++) { - var longer = Vector.AsVectorUInt64(result); - Debug.Assert(s_longSize == 4 || s_longSize == 2); - - var candidate = longer[0]; - if (candidate != 0) return vectorIndex * byteSize + IndexOf(candidate); - candidate = longer[1]; - if (candidate != 0) return 8 + vectorIndex * byteSize + IndexOf(candidate); - if (s_longSize == 4) + if (*(haystack + index) == value) { - candidate = longer[2]; - if (candidate != 0) return 16 + vectorIndex * byteSize + IndexOf(candidate); - candidate = longer[3]; - if (candidate != 0) return 24 + vectorIndex * byteSize + IndexOf(candidate); + goto exitFixed; } } + // No Matches + index = -1; + // Don't goto out of fixed block + exitFixed:; } - - var processed = vectors.Length * byteSize; - var index = buffer.Slice(processed).IndexOf(value); - if (index == -1) return -1; - return index + processed; + exit: + return index; } [MethodImpl(MethodImplOptions.NoInlining)] - public static int IndexOfVectorized(this ReadOnlySpan buffer, byte value) + public unsafe static int IndexOfVectorized(this ReadOnlySpan buffer, byte value) { Debug.Assert(s_longSize == 4 || s_longSize == 2); - var byteSize = s_byteSize; + var index = -1; + var length = buffer.Length; + if (length == 0) + { + goto exit; + } - if (buffer.Length < byteSize * 2 || !Vector.IsHardwareAccelerated) return buffer.IndexOf(value); + fixed (byte* pHaystack = &buffer.DangerousGetPinnableReference()) + { + var haystack = pHaystack; + index = 0; - Vector match = new Vector(value); - var vectors = buffer.NonPortableCast>(); - var zero = Vector.Zero; + if (Vector.IsHardwareAccelerated) + { + if (length - Vector.Count >= index) + { + Vector needles = GetVector(value); + do + { + var flaggedMatches = Vector.Equals(Unsafe.Read>(haystack + index), needles); + if (flaggedMatches.Equals(Vector.Zero)) + { + index += Vector.Count; + continue; + } + + index += LocateFirstFoundByte(flaggedMatches); + goto exitFixed; + + } while (length - Vector.Count >= index); + } + } - for (int vectorIndex = 0; vectorIndex < vectors.Length; vectorIndex++) - { - var vector = vectors[vectorIndex]; - var result = Vector.Equals(vector, match); - if (result != zero) + while (length - sizeof(ulong) >= index) { - var longer = Vector.AsVectorUInt64(result); - var candidate = longer[0]; - if (candidate != 0) return vectorIndex * byteSize + IndexOf(candidate); - candidate = longer[1]; - if (candidate != 0) return 8 + vectorIndex * byteSize + IndexOf(candidate); - if (s_longSize == 4) + var flaggedMatches = SetLowBitsForByteMatch(*(ulong*)(haystack + index), value); + if (flaggedMatches == 0) { - candidate = longer[2]; - if (candidate != 0) return 16 + vectorIndex * byteSize + IndexOf(candidate); - candidate = longer[3]; - if (candidate != 0) return 24 + vectorIndex * byteSize + IndexOf(candidate); + index += sizeof(ulong); + continue; } + + index += LocateFirstFoundByte(flaggedMatches); + goto exitFixed; } + + for (; index < length; index++) + { + if (*(haystack + index) == value) + { + goto exitFixed; + } + } + // No Matches + index = -1; + // Don't goto out of fixed block + exitFixed:; + } + exit: + return index; + } + + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int LocateFirstFoundByte(Vector match) + { + var vector64 = Vector.AsVectorUInt64(match); + ulong candidate = 0; + var i = 0; + // Pattern unrolled by jit https://github.com/dotnet/coreclr/pull/8001 + for (; i < Vector.Count; i++) + { + candidate = vector64[i]; + if (candidate == 0) continue; + break; + } + + // Single LEA instruction with jitted const (using function result) + return i * 8 + LocateFirstFoundByte(candidate); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int LocateFirstFoundByte(ulong match) + { + unchecked + { + // Flag least significant power of two bit + var powerOfTwoFlag = match ^ (match - 1); + // Shift all powers of two into the high byte and extract + return (int)((powerOfTwoFlag * xorPowerOfTwoToHighByte) >> 57); } + } - var processed = vectors.Length * byteSize; - var index = buffer.Slice(processed).IndexOf(value); - if (index == -1) return -1; - return index + processed; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ulong SetLowBitsForByteMatch(ulong potentialMatch, byte search) + { + unchecked + { + var flaggedValue = potentialMatch ^ (byteBroadcastToUlong * search); + return ( + (flaggedValue - byteBroadcastToUlong) & + ~(flaggedValue) & + filterByteHighBitsInUlong + ) >> 7; + } } - // used by IndexOfVectorized - static int IndexOf(ulong next) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector GetVector(byte vectorByte) { - // Flag least significant power of two bit - var powerOfTwoFlag = (next ^ (next - 1)); - // Shift all powers of two into the high byte and extract - var foundByteIndex = (int)((powerOfTwoFlag * _xorPowerOfTwoToHighByte) >> 57); - return foundByteIndex; +#if !NETCOREAPP1_2 + // Vector .ctor doesn't become an intrinsic due to detection issue + // However this does cause it to become an intrinsic (with additional multiply and reg->reg copy) + // https://github.com/dotnet/coreclr/issues/7459#issuecomment-253965670 + return Vector.AsVectorByte(new Vector(vectorByte * 0x01010101u)); +#else + return new Vector(vectorByte); +#endif } - const ulong _xorPowerOfTwoToHighByte = (0x07ul | - 0x06ul << 8 | - 0x05ul << 16 | - 0x04ul << 24 | - 0x03ul << 32 | - 0x02ul << 40 | - 0x01ul << 48) + 1; + private const ulong xorPowerOfTwoToHighByte = (0x07ul | + 0x06ul << 8 | + 0x05ul << 16 | + 0x04ul << 24 | + 0x03ul << 32 | + 0x02ul << 40 | + 0x01ul << 48) + 1; + private const ulong byteBroadcastToUlong = ~0UL / byte.MaxValue; + private const ulong filterByteHighBitsInUlong = (byteBroadcastToUlong >> 1) | (byteBroadcastToUlong << (sizeof(ulong) * 8 - 1)); } } \ No newline at end of file