-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Some test fixes for the x86 HWIntrinsics #18734
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -241,6 +241,10 @@ public static class Avx | |
| /// </summary> | ||
| public static byte Extract(Vector256<byte> value, byte index) | ||
| { | ||
| if (!IsSupported) | ||
| { | ||
| throw new PlatformNotSupportedException(); | ||
| } | ||
| return Unsafe.Add<byte>(ref Unsafe.As<Vector256<byte>, byte>(ref value), index & 0x1F); | ||
| } | ||
|
|
||
|
|
@@ -251,6 +255,10 @@ public static byte Extract(Vector256<byte> value, byte index) | |
| /// </summary> | ||
| public static ushort Extract(Vector256<ushort> value, byte index) | ||
| { | ||
| if (!IsSupported) | ||
| { | ||
| throw new PlatformNotSupportedException(); | ||
| } | ||
| return Unsafe.Add<ushort>(ref Unsafe.As<Vector256<ushort>, ushort>(ref value), index & 0xF); | ||
| } | ||
|
|
||
|
|
@@ -260,6 +268,10 @@ public static ushort Extract(Vector256<ushort> value, byte index) | |
| /// </summary> | ||
| public static int Extract(Vector256<int> value, byte index) | ||
| { | ||
| if (!IsSupported) | ||
| { | ||
| throw new PlatformNotSupportedException(); | ||
| } | ||
| return Unsafe.Add<int>(ref Unsafe.As<Vector256<int>, int>(ref value), index & 0x7); | ||
| } | ||
|
|
||
|
|
@@ -269,6 +281,10 @@ public static int Extract(Vector256<int> value, byte index) | |
| /// </summary> | ||
| public static uint Extract(Vector256<uint> value, byte index) | ||
| { | ||
| if (!IsSupported) | ||
| { | ||
| throw new PlatformNotSupportedException(); | ||
| } | ||
| return Unsafe.Add<uint>(ref Unsafe.As<Vector256<uint>, uint>(ref value), index & 0x7); | ||
| } | ||
|
|
||
|
|
@@ -278,7 +294,7 @@ public static uint Extract(Vector256<uint> value, byte index) | |
| /// </summary> | ||
| public static long Extract(Vector256<long> value, byte index) | ||
| { | ||
| if (IntPtr.Size != 8) | ||
| if (!IsSupported || (IntPtr.Size != 8)) | ||
| { | ||
| throw new PlatformNotSupportedException(); | ||
| } | ||
|
|
@@ -291,7 +307,7 @@ public static long Extract(Vector256<long> value, byte index) | |
| /// </summary> | ||
| public static ulong Extract(Vector256<ulong> value, byte index) | ||
| { | ||
| if (IntPtr.Size != 8) | ||
| if (!IsSupported || (IntPtr.Size != 8)) | ||
| { | ||
| throw new PlatformNotSupportedException(); | ||
| } | ||
|
|
@@ -523,6 +539,11 @@ public static Vector256<uint> Insert(Vector256<uint> value, uint data, byte inde | |
| /// </summary> | ||
| public static Vector256<long> Insert(Vector256<long> value, long data, byte index) | ||
| { | ||
| if (IntPtr.Size != 8) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We shouldn't have some We should have a discussion on whether we:
CC. @eerhardt There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Agree. I think we should disable these two helpers for 32-bit platforms, native compilers also do it.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So with this change, all
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @eerhardt, not quite. There are two basic types of operations:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems to me that we shouldn't do anything tantamount to emulation when executing on a 32-bit machine. That said, I'm wondering if there are suggestions as to how these limitations/exclusions are reflected in the usage model?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Arm32/64 is differentiating by the Exposing a new class would probably work, but would also break the Duplicating all the members under a separate I had also thought of just updating the reference assembly to have a x86/x64 specific version; but we are exposing all architectures from a single package so the cross-architecture story is nicer (and so you don't need
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Relying on an analyzer/documentation to help notify users of these incompatibilities seems like it might the easiest way to do this...
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| { | ||
| throw new PlatformNotSupportedException(); | ||
| } | ||
|
|
||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as with vpinsrb ; When index is into the lower 128-bits
vinsertf128or vextractf128
vpinsrb ; When index is into the upper 128-bits
vinsertf128There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as the comment to |
||
| unsafe | ||
| { | ||
| index &= 0x3; | ||
|
|
@@ -539,6 +560,11 @@ public static Vector256<long> Insert(Vector256<long> value, long data, byte inde | |
| /// </summary> | ||
| public static Vector256<ulong> Insert(Vector256<ulong> value, ulong data, byte index) | ||
| { | ||
| if (IntPtr.Size != 8) | ||
| { | ||
| throw new PlatformNotSupportedException(); | ||
| } | ||
|
|
||
| unsafe | ||
| { | ||
| index &= 0x3; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't believe we want any methods to work when
IsSupportedis false, even helpers.We should also investigate if this is really better than (the below is what all three major native compilers do):
or
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, RyuJIT also generates the above code as well as native compilers. The managed implementation here is just for the non-constant fallback. So, I believe this is really better than calling into one (lower 128-bits) or two (upper 128-bits) large jump-tables.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmmm. I'm not so sure here. I always forget that the jit implementation exists (when I see the managed implementation here) and this is one of the smaller jump tables that we could generate (considering we only need to handle 32 cases max)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will say that, as a min bar, we should add a comment indicating that this is the software fallback.
However, I think that having this "different" software fallback goes against the principle of ensuring the helper indirect invocations execute the same instructions that the hardware accelerated path uses.
As it is now, we need to ensure that the software behavior is semantically equivalent to the hardware instructions that would have been executed (and it has already had to have been fixed a number of times due to minor issues).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree; I think there's a lot of value in the uniformity of the current non-constant approach.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://github.com/dotnet/coreclr/issues/18743
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, some native compilers (i.e., clang) also optimize the non-const Extract/Insert to the memory operations. It would be better to get some data when we unify the implementation.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@fiigii, I think the point is that the value of uniformity far outweighs any slight perf increase (if one exists) for the non-constant case (especially considering that the
non-constantcase already takes a perf hit from the call indirection/etc and is expected to be an edge case anyways).For the constant case, all three compilers definitely emit the SIMD instructions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, I just meant the non-const situations.