Skip to content
This repository was archived by the owner on Aug 2, 2023. It is now read-only.
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
236 changes: 168 additions & 68 deletions src/System.Buffers.Experimental/System/Buffers/BufferExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -122,103 +122,203 @@ internal static int IndexOfStraddling(this ReadOnlySpan<byte> first, IReadOnlyMe
static readonly int s_longSize = Vector<ulong>.Count;
static readonly int s_byteSize = Vector<byte>.Count;

public static int IndexOfVectorized(this Span<byte> buffer, byte value)
public unsafe static int IndexOfVectorized(this Span<byte> buffer, byte value)
{
Debug.Assert(s_longSize == 4 || s_longSize == 2);
var index = -1;
var length = buffer.Length;
if (length == 0)
{
goto exit;
}

var byteSize = s_byteSize;
fixed (byte* pHaystack = &buffer.DangerousGetPinnableReference())
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

haystack is an unorganized pile. Span is an ordered list. I think the parameter name needs to change :-)

{
var haystack = pHaystack;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you share code between Span and ReadOnlySpan implementations after getting the pointer?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, they are identical.

index = 0;

if (buffer.Length < byteSize * 2 || !Vector.IsHardwareAccelerated) return buffer.IndexOf(value);
if (Vector.IsHardwareAccelerated)
{
if (length - Vector<byte>.Count >= index)
{
Vector<byte> needles = GetVector(value);
do
{
var flaggedMatches = Vector.Equals(Unsafe.Read<Vector<byte>>(haystack + index), needles);
if (flaggedMatches.Equals(Vector<byte>.Zero))
{
index += Vector<byte>.Count;
continue;
}

index += LocateFirstFoundByte(flaggedMatches);
goto exitFixed;

} while (length - Vector<byte>.Count >= index);
}
}

Vector<byte> match = new Vector<byte>(value);
var vectors = buffer.NonPortableCast<byte, Vector<byte>>();
var zero = Vector<byte>.Zero;
while (length - sizeof(ulong) >= index)
{
var flaggedMatches = SetLowBitsForByteMatch(*(ulong*)(haystack + index), value);
if (flaggedMatches == 0)
{
index += sizeof(ulong);
continue;
}

for (int vectorIndex = 0; vectorIndex < vectors.Length; vectorIndex++)
{
var vector = vectors.GetItem(vectorIndex);
var result = Vector.Equals(vector, match);
if (result != zero)
index += LocateFirstFoundByte(flaggedMatches);
goto exitFixed;
}

for (; index < length; index++)
{
var longer = Vector.AsVectorUInt64(result);
Debug.Assert(s_longSize == 4 || s_longSize == 2);

var candidate = longer[0];
if (candidate != 0) return vectorIndex * byteSize + IndexOf(candidate);
candidate = longer[1];
if (candidate != 0) return 8 + vectorIndex * byteSize + IndexOf(candidate);
if (s_longSize == 4)
if (*(haystack + index) == value)
{
candidate = longer[2];
if (candidate != 0) return 16 + vectorIndex * byteSize + IndexOf(candidate);
candidate = longer[3];
if (candidate != 0) return 24 + vectorIndex * byteSize + IndexOf(candidate);
goto exitFixed;
}
}
// No Matches
index = -1;
// Don't goto out of fixed block
exitFixed:;
}

var processed = vectors.Length * byteSize;
var index = buffer.Slice(processed).IndexOf(value);
if (index == -1) return -1;
return index + processed;
exit:
return index;
}

[MethodImpl(MethodImplOptions.NoInlining)]
public static int IndexOfVectorized(this ReadOnlySpan<byte> buffer, byte value)
public unsafe static int IndexOfVectorized(this ReadOnlySpan<byte> buffer, byte value)
{
Debug.Assert(s_longSize == 4 || s_longSize == 2);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This assert was removed in the Span overload above.


var byteSize = s_byteSize;
var index = -1;
var length = buffer.Length;
if (length == 0)
{
goto exit;
}

if (buffer.Length < byteSize * 2 || !Vector.IsHardwareAccelerated) return buffer.IndexOf(value);
fixed (byte* pHaystack = &buffer.DangerousGetPinnableReference())
{
var haystack = pHaystack;
index = 0;

Vector<byte> match = new Vector<byte>(value);
var vectors = buffer.NonPortableCast<byte, Vector<byte>>();
var zero = Vector<byte>.Zero;
if (Vector.IsHardwareAccelerated)
{
if (length - Vector<byte>.Count >= index)
{
Vector<byte> needles = GetVector(value);
do
{
var flaggedMatches = Vector.Equals(Unsafe.Read<Vector<byte>>(haystack + index), needles);
if (flaggedMatches.Equals(Vector<byte>.Zero))
{
index += Vector<byte>.Count;
continue;
}

index += LocateFirstFoundByte(flaggedMatches);
goto exitFixed;

} while (length - Vector<byte>.Count >= index);
}
}

for (int vectorIndex = 0; vectorIndex < vectors.Length; vectorIndex++)
{
var vector = vectors[vectorIndex];
var result = Vector.Equals(vector, match);
if (result != zero)
while (length - sizeof(ulong) >= index)
{
var longer = Vector.AsVectorUInt64(result);
var candidate = longer[0];
if (candidate != 0) return vectorIndex * byteSize + IndexOf(candidate);
candidate = longer[1];
if (candidate != 0) return 8 + vectorIndex * byteSize + IndexOf(candidate);
if (s_longSize == 4)
var flaggedMatches = SetLowBitsForByteMatch(*(ulong*)(haystack + index), value);
if (flaggedMatches == 0)
{
candidate = longer[2];
if (candidate != 0) return 16 + vectorIndex * byteSize + IndexOf(candidate);
candidate = longer[3];
if (candidate != 0) return 24 + vectorIndex * byteSize + IndexOf(candidate);
index += sizeof(ulong);
continue;
}

index += LocateFirstFoundByte(flaggedMatches);
goto exitFixed;
}

for (; index < length; index++)
{
if (*(haystack + index) == value)
{
goto exitFixed;
}
}
// No Matches
index = -1;
// Don't goto out of fixed block
exitFixed:;
}
exit:
return index;
}


/// <summary>
/// Returns the byte offset, within <paramref name="match"/>, of the first
/// 64-bit lane that is non-zero plus the byte offset located inside that lane.
/// </summary>
/// <param name="match">
/// Comparison result vector; the callers in this file pass the output of
/// <c>Vector.Equals</c> after checking that it is not all zero.
/// </param>
/// <returns><c>i * 8</c> for the first non-zero lane <c>i</c>, plus the in-lane byte index.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateFirstFoundByte(Vector<byte> match)
{
// Reinterpret the byte vector as 64-bit lanes so eight bytes are tested at a time.
var vector64 = Vector.AsVectorUInt64(match);
ulong candidate = 0;
var i = 0;
// Pattern unrolled by jit https://github.com/dotnet/coreclr/pull/8001
// NOTE(review): keep this exact loop shape; the comment above indicates the
// JIT recognises it, so restructuring may lose the unrolling.
for (; i < Vector<ulong>.Count; i++)
{
candidate = vector64[i];
if (candidate == 0) continue;
break;
}

// If every lane is zero the loop exits with i == Vector<ulong>.Count and
// candidate == 0; callers are expected to rule that case out beforehand.
// Single LEA instruction with jitted const (using function result)
return i * 8 + LocateFirstFoundByte(candidate);
}

/// <summary>
/// Converts a 64-bit match mask into the index (0-7) of the lowest byte that
/// carries a set bit, using a multiply-and-shift instead of a loop.
/// </summary>
/// <param name="match">
/// Match mask whose lowest set bit marks the first matching byte. Callers in
/// this file only pass non-zero masks; a zero mask yields no meaningful index.
/// </param>
/// <returns>The top seven bits of the multiplied mask, interpreted as a byte index.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateFirstFoundByte(ulong match)
{
    // Keep the lowest set bit and turn every bit below it into a one.
    ulong lowestBitAndBelow = unchecked(match ^ (match - 1));

    // Multiplying by the lookup constant accumulates the byte index in the
    // high byte of the product (wrap-around is intentional).
    ulong indexInHighBits = unchecked(lowestBitAndBelow * xorPowerOfTwoToHighByte);

    // The index occupies the top 7 bits; shift it down to the bottom.
    return unchecked((int)(indexInHighBits >> 57));
}

var processed = vectors.Length * byteSize;
var index = buffer.Slice(processed).IndexOf(value);
if (index == -1) return -1;
return index + processed;
/// <summary>
/// Compares each of the eight bytes packed in <paramref name="potentialMatch"/>
/// against <paramref name="search"/> and flags matches in the low bit of the
/// corresponding byte of the result.
/// </summary>
/// <param name="potentialMatch">Eight bytes of input read as a single ulong.</param>
/// <param name="search">The byte value being looked for.</param>
/// <returns>
/// Zero when no byte equals <paramref name="search"/>; otherwise a mask whose
/// lowest set bit sits in the first matching byte. Because of borrow
/// propagation in the subtraction, bytes above the first match may be flagged
/// spuriously, so only the lowest flag should be trusted.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong SetLowBitsForByteMatch(ulong potentialMatch, byte search)
{
    unchecked
    {
        // Replicate the search byte into all eight byte lanes.
        ulong searchInEveryLane = byteBroadcastToUlong * search;

        // Lanes equal to the search byte become 0x00 after the XOR.
        ulong difference = potentialMatch ^ searchInEveryLane;

        // Zero-byte detection: subtracting 1 from each lane sets the lane's
        // high bit when the lane was zero; masking with ~difference discards
        // lanes whose high bit was already set.
        ulong highBitFlags = (difference - byteBroadcastToUlong)
                             & ~difference
                             & filterByteHighBitsInUlong;

        // Move each flag from bit 7 of its lane down to bit 0.
        return highBitFlags >> 7;
    }
}

// used by IndexOfVectorized
static int IndexOf(ulong next)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<byte> GetVector(byte vectorByte)
{
// Flag least significant power of two bit
var powerOfTwoFlag = (next ^ (next - 1));
// Shift all powers of two into the high byte and extract
var foundByteIndex = (int)((powerOfTwoFlag * _xorPowerOfTwoToHighByte) >> 57);
return foundByteIndex;
#if !NETCOREAPP1_2
// Vector<byte> .ctor doesn't become an intrinsic due to detection issue
// However this does cause it to become an intrinsic (with additional multiply and reg->reg copy)
// https://github.com/dotnet/coreclr/issues/7459#issuecomment-253965670
return Vector.AsVectorByte(new Vector<uint>(vectorByte * 0x01010101u));
#else
return new Vector<byte>(vectorByte);
#endif
}

const ulong _xorPowerOfTwoToHighByte = (0x07ul |
0x06ul << 8 |
0x05ul << 16 |
0x04ul << 24 |
0x03ul << 32 |
0x02ul << 40 |
0x01ul << 48) + 1;
// Multiplier used by LocateFirstFoundByte(ulong) to move a byte index into the
// high bits of the product. Evaluates to 0x0001020304050608.
private const ulong xorPowerOfTwoToHighByte = (0x07ul |
0x06ul << 8 |
0x05ul << 16 |
0x04ul << 24 |
0x03ul << 32 |
0x02ul << 40 |
0x01ul << 48) + 1;
// 0x01 in every byte lane (ulong.MaxValue / 255 == 0x0101010101010101);
// multiplying by a byte replicates that byte into all eight lanes.
private const ulong byteBroadcastToUlong = ~0UL / byte.MaxValue;
// 0x80 in every byte lane (0x8080808080808080), built by rotating the
// broadcast constant right by one bit; selects each lane's high bit.
private const ulong filterByteHighBitsInUlong = (byteBroadcastToUlong >> 1) | (byteBroadcastToUlong << (sizeof(ulong) * 8 - 1));
}
}