-
Notifications
You must be signed in to change notification settings - Fork 337
Added SIMD-based IndexOf #1222
Added SIMD-based IndexOf #1222
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
|
|
||
| using System.Collections.Sequences; | ||
| using System.Diagnostics; | ||
| using System.Numerics; | ||
| using System.Runtime.CompilerServices; | ||
|
|
||
| namespace System.Buffers | ||
|
|
@@ -117,5 +118,107 @@ internal static int IndexOfStraddling(this ReadOnlySpan<byte> first, IReadOnlyMe | |
|
|
||
| return -1; | ||
| } | ||
|
|
||
| static readonly int s_longSize = Vector<ulong>.Count; | ||
| static readonly int s_byteSize = Vector<byte>.Count; | ||
|
|
||
| public static int IndexOfVectorized(this Span<byte> buffer, byte value) | ||
| { | ||
| Debug.Assert(s_longSize == 4 || s_longSize == 2); | ||
|
|
||
| var byteSize = s_byteSize; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Jit team told me off for not using
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah yeah. I think @sivarv told me about it. I will change. |
||
|
|
||
| if (buffer.Length < byteSize * 2 || !Vector.IsHardwareAccelerated) return buffer.IndexOf(value); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test public static int IndexOfVectorized(this Span<byte> buffer, byte value)
{
if (Vector.IsHardwareAccelerated && buffer.Length >= Vector<byte>.Count)
{
return buffer.IndexOfVectorizedImpl(value);
}
return buffer.IndexOf(value);
}
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. Will do |
||
|
|
||
| Vector<byte> match = new Vector<byte>(value); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Needs jit fix dotnet/coreclr#7683 to be performant; else use Vector<byte> match = Vector.AsVectorByte(new Vector<uint>(value * 0x01010101u));
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I know about this issue. The question is: is it worth doing the workaround above? It's going to be slower when the fix is in.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could |
||
| var vectors = buffer.NonPortableCast<byte, Vector<byte>>(); | ||
| var zero = Vector<byte>.Zero; | ||
|
|
||
| for (int vectorIndex = 0; vectorIndex < vectors.Length; vectorIndex++) | ||
| { | ||
| var vector = vectors.GetItem(vectorIndex); | ||
| var result = Vector.Equals(vector, match); | ||
| if (result != zero) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Zero test directly rather than via variable; easier for Jit to pick up dotnet/coreclr#7367 !result.Equals(Vector<byte>.Zero)
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought I tested it and local was faster, but you are right that it should be slower. I will retest and possibly change. |
||
| { | ||
| var longer = Vector.AsVectorUInt64(result); | ||
| Debug.Assert(s_longSize == 4 || s_longSize == 2); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might not be true on AVX -512
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thus the assert :-)
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @benaadams Will the JIT emit AVX-512 instructions on processors that support it today? @KrzysztofCwalina Does the corefx(lab) CI run tests in Debug mode? If the JIT changes to support AVX-512, do any of the CI servers run a Xeon Phi processor or whatever it would take for this Debug.Assert to fail? If we were to ship a System.Buffers package that didn't support a
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Xeon Phi won't help you, it's a co-processor card you have to specifically target (usually with an intel c++ complier). I think some of the Sandy Bridge EP Xeon's have it, but I suspect it will be not a straight exposure of the registers because the AVX512 spec is a bit all over the shop. |
||
|
|
||
| var candidate = longer[0]; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Change to for loop using
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I had it as such loop. Was 10% slower. This is also due to a missing feature in JIT. Once we fully run on 2.0, the loop (as you say) will be auto unrolled.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. /cc @JosephTremoulet |
||
| if (candidate != 0) return vectorIndex * byteSize + IndexOf(candidate); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. break and continue as inline returns make codegen nasty; and loop for unrolling as above dotnet/coreclr#7912 ulong candidate = 0;
int longIndex = 0;
for (; longIndex < Vector<ulong>.Count; longIndex++)
{
var candidate = longer[longIndex];
if (candidate == 0) continue;
break;
}
return 8 * longIndex + vectorIndex * Vector<byte>.Count + IndexOf(candidate); |
||
| candidate = longer[1]; | ||
| if (candidate != 0) return 8 + vectorIndex * byteSize + IndexOf(candidate); | ||
| if (s_longSize == 4) | ||
| { | ||
| candidate = longer[2]; | ||
| if (candidate != 0) return 16 + vectorIndex * byteSize + IndexOf(candidate); | ||
| candidate = longer[3]; | ||
| if (candidate != 0) return 24 + vectorIndex * byteSize + IndexOf(candidate); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| var processed = vectors.Length * byteSize; | ||
| var index = buffer.Slice(processed).IndexOf(value); | ||
| if (index == -1) return -1; | ||
| return index + processed; | ||
| } | ||
|
|
||
| [MethodImpl(MethodImplOptions.NoInlining)] | ||
| public static int IndexOfVectorized(this ReadOnlySpan<byte> buffer, byte value) | ||
| { | ||
| Debug.Assert(s_longSize == 4 || s_longSize == 2); | ||
|
|
||
| var byteSize = s_byteSize; | ||
|
|
||
| if (buffer.Length < byteSize * 2 || !Vector.IsHardwareAccelerated) return buffer.IndexOf(value); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As above |
||
|
|
||
| Vector<byte> match = new Vector<byte>(value); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As above |
||
| var vectors = buffer.NonPortableCast<byte, Vector<byte>>(); | ||
| var zero = Vector<byte>.Zero; | ||
|
|
||
| for (int vectorIndex = 0; vectorIndex < vectors.Length; vectorIndex++) | ||
| { | ||
| var vector = vectors[vectorIndex]; | ||
| var result = Vector.Equals(vector, match); | ||
| if (result != zero) | ||
| { | ||
| var longer = Vector.AsVectorUInt64(result); | ||
| var candidate = longer[0]; | ||
| if (candidate != 0) return vectorIndex * byteSize + IndexOf(candidate); | ||
| candidate = longer[1]; | ||
| if (candidate != 0) return 8 + vectorIndex * byteSize + IndexOf(candidate); | ||
| if (s_longSize == 4) | ||
| { | ||
| candidate = longer[2]; | ||
| if (candidate != 0) return 16 + vectorIndex * byteSize + IndexOf(candidate); | ||
| candidate = longer[3]; | ||
| if (candidate != 0) return 24 + vectorIndex * byteSize + IndexOf(candidate); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| var processed = vectors.Length * byteSize; | ||
| var index = buffer.Slice(processed).IndexOf(value); | ||
| if (index == -1) return -1; | ||
| return index + processed; | ||
| } | ||
|
|
||
| // used by IndexOfVectorized | ||
| static int IndexOf(ulong next) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Force inline? (as only called once per vector, if loop changed as suggested)
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How did the attribute disappear from the PR? Seriously :-) |
||
| { | ||
| // Flag least significant power of two bit | ||
| var powerOfTwoFlag = (next ^ (next - 1)); | ||
| // Shift all powers of two into the high byte and extract | ||
| var foundByteIndex = (int)((powerOfTwoFlag * _xorPowerOfTwoToHighByte) >> 57); | ||
| return foundByteIndex; | ||
| } | ||
|
|
||
| const ulong _xorPowerOfTwoToHighByte = (0x07ul | | ||
| 0x06ul << 8 | | ||
| 0x05ul << 16 | | ||
| 0x04ul << 24 | | ||
| 0x03ul << 32 | | ||
| 0x02ul << 40 | | ||
| 0x01ul << 48) + 1; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
| // See the LICENSE file in the project root for more information. | ||
|
|
||
| using Microsoft.Xunit.Performance; | ||
| using System; | ||
| using System.Buffers; | ||
| using System.Numerics; | ||
| using System.Text; | ||
|
|
||
| public class IndexOfBench | ||
| { | ||
| static int s_bufferLength = 1000; | ||
| static byte[] s_buffer = new byte[s_bufferLength]; | ||
| static int s_loops = 1000; | ||
|
|
||
| static IndexOfBench() | ||
| { | ||
| s_buffer[s_bufferLength - 100] = 255; | ||
| } | ||
|
|
||
| [Benchmark] | ||
| static int SpanIndexOf() | ||
| { | ||
| Span<byte> buffer = s_buffer; | ||
| int index = 0; | ||
| foreach (var iteration in Benchmark.Iterations) | ||
| { | ||
| using (iteration.StartMeasurement()) | ||
| { | ||
| for(int i=0; i<s_loops; i++) { | ||
| index += buffer.IndexOf(255); | ||
| } | ||
| } | ||
| } | ||
| return index; | ||
| } | ||
|
|
||
| [Benchmark] | ||
| static int VectorizedIndexOf() | ||
| { | ||
| if(!Vector.IsHardwareAccelerated) return 0; | ||
|
|
||
| Span<byte> buffer = s_buffer; | ||
| int index = 0; | ||
| foreach (var iteration in Benchmark.Iterations) | ||
| { | ||
| using (iteration.StartMeasurement()) | ||
| { | ||
| for(int i=0; i<s_loops; i++) { | ||
| index += buffer.IndexOfVectorized(255); | ||
| } | ||
| } | ||
| } | ||
| return index; | ||
| } | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| // Copyright (c) Microsoft. All rights reserved. | ||
| // Licensed under the MIT license. See LICENSE file in the project root for full license information. | ||
| using System.Buffers; | ||
| using Xunit; | ||
|
|
||
| namespace System.Buffers.Tests | ||
| { | ||
| public class VectorizedOperationsTests | ||
| { | ||
| [Fact] | ||
| public void SpanIndexOf() | ||
| { | ||
| int len = 10000; | ||
| byte[] buffer = new byte[len]; | ||
| buffer[0] = 1; | ||
| buffer[len / 2] = 2; | ||
| buffer[len - 1] = 3; | ||
|
|
||
| Span<byte> span = buffer; | ||
| Assert.Equal(0, span.IndexOfVectorized(1)); | ||
| Assert.Equal(len/2, span.IndexOfVectorized(2)); | ||
| Assert.Equal(len-1, span.IndexOfVectorized(3)); | ||
| Assert.Equal(-1, span.IndexOfVectorized(4)); | ||
| } | ||
|
|
||
| [Fact] | ||
| public void ReadOnlySpanIndexOf() | ||
| { | ||
| int len = 10000; | ||
| byte[] buffer = new byte[len]; | ||
| buffer[0] = 1; | ||
| buffer[len / 2] = 2; | ||
| buffer[len - 1] = 3; | ||
|
|
||
| ReadOnlySpan<byte> span = buffer; | ||
| Assert.Equal(0, span.IndexOfVectorized(1)); | ||
| Assert.Equal(len/2, span.IndexOfVectorized(2)); | ||
| Assert.Equal(len-1, span.IndexOfVectorized(3)); | ||
| Assert.Equal(-1, span.IndexOfVectorized(4)); | ||
| } | ||
|
|
||
| [Fact] | ||
| public void EmptySpanIndexOf() | ||
| { | ||
| int len = 0; | ||
| byte[] buffer = new byte[len]; | ||
| Span<byte> span = buffer; | ||
| Assert.Equal(-1, span.IndexOfVectorized(4)); | ||
| } | ||
|
|
||
| [Fact] | ||
| public void EmptyReadOnlySpanIndexOf() | ||
| { | ||
| int len = 10000; | ||
| byte[] buffer = new byte[len]; | ||
| buffer[0] = 1; | ||
| buffer[len / 2] = 2; | ||
| buffer[len - 1] = 3; | ||
|
|
||
| ReadOnlySpan<byte> span = buffer; | ||
| Assert.Equal(-1, span.IndexOfVectorized(4)); | ||
| } | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With
AVX-512this could go tos_longSize == 8; going above 64 bytes is probably unlikely in near term as cache line is 64 bytes which changing would probably break lots of assumptions in software