Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,10 @@ public static class Avx
/// </summary>
public static byte Extract(Vector256<byte> value, byte index)
{
if (!IsSupported)
Copy link
Copy Markdown
Member Author

@tannergooding tannergooding Jul 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't believe we want any methods to work when IsSupported is false, even helpers.

We should also investigate if this is really better than (the below is what all three major native compilers do):

vpextrb        ; When index is into the lower 128-bits

or

vextractf128   ; When index is into the upper 128-bits
vpextrb

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should also investigate if this is really better than

Actually, RyuJIT already generates the above code, just as the native compilers do. The managed implementation here is only the non-constant fallback. So I believe it really is better than calling into one (lower 128-bits) or two (upper 128-bits) large jump-tables.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm. I'm not so sure here. I always forget that the jit implementation exists (when I see the managed implementation here) and this is one of the smaller jump tables that we could generate (considering we only need to handle 32 cases max)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will say that, as a min bar, we should add a comment indicating that this is the software fallback.

However, I think that having this "different" software fallback goes against the principle of ensuring the helper indirect invocations execute the same instructions that the hardware accelerated path uses.

As it is now, we need to ensure that the software behavior is semantically equivalent to the hardware instructions that would have been executed (and it has already had to be fixed a number of times due to minor issues).

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, I think that having this "different" software fallback goes against the principle of ensuring the helper indirect invocations execute the same instructions that the hardware accelerated path uses.

I agree; I think there's a lot of value in the uniformity of the current non-constant approach.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, some native compilers (i.e., clang) also optimize the non-const Extract/Insert to the memory operations. It would be better to get some data when we unify the implementation.

Copy link
Copy Markdown
Member Author

@tannergooding tannergooding Jul 2, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@fiigii, I think the point is that the value of uniformity far outweighs any slight perf increase (if one exists) for the non-constant case (especially considering that the non-constant case already takes a perf hit from the call indirection/etc and is expected to be an edge case anyways).

For the constant case, all three compilers definitely emit the SIMD instructions.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I just meant the non-const situations.

{
throw new PlatformNotSupportedException();
}
return Unsafe.Add<byte>(ref Unsafe.As<Vector256<byte>, byte>(ref value), index & 0x1F);
}

Expand All @@ -251,6 +255,10 @@ public static byte Extract(Vector256<byte> value, byte index)
/// </summary>
public static ushort Extract(Vector256<ushort> value, byte index)
{
    // Managed software path: reads the selected 16-bit element straight out of the
    // vector's storage instead of issuing an extract instruction.
    if (IsSupported)
    {
        // Only the low four bits of the index are used, so the read always lands
        // on one of the 16 ushort elements of the 256-bit vector.
        int lane = index & 0xF;
        ref ushort firstLane = ref Unsafe.As<Vector256<ushort>, ushort>(ref value);
        return Unsafe.Add<ushort>(ref firstLane, lane);
    }

    // Helpers must not work when the ISA is unavailable.
    throw new PlatformNotSupportedException();
}

Expand All @@ -260,6 +268,10 @@ public static ushort Extract(Vector256<ushort> value, byte index)
/// </summary>
public static int Extract(Vector256<int> value, byte index)
{
    // Managed software path: reads the selected 32-bit element straight out of the
    // vector's storage instead of issuing an extract instruction.
    if (IsSupported)
    {
        // Only the low three bits of the index are used, so the read always lands
        // on one of the 8 int elements of the 256-bit vector.
        int lane = index & 0x7;
        ref int firstLane = ref Unsafe.As<Vector256<int>, int>(ref value);
        return Unsafe.Add<int>(ref firstLane, lane);
    }

    // Helpers must not work when the ISA is unavailable.
    throw new PlatformNotSupportedException();
}

Expand All @@ -269,6 +281,10 @@ public static int Extract(Vector256<int> value, byte index)
/// </summary>
public static uint Extract(Vector256<uint> value, byte index)
{
    // Managed software path: reads the selected 32-bit element straight out of the
    // vector's storage instead of issuing an extract instruction.
    if (IsSupported)
    {
        // Only the low three bits of the index are used, so the read always lands
        // on one of the 8 uint elements of the 256-bit vector.
        int lane = index & 0x7;
        ref uint firstLane = ref Unsafe.As<Vector256<uint>, uint>(ref value);
        return Unsafe.Add<uint>(ref firstLane, lane);
    }

    // Helpers must not work when the ISA is unavailable.
    throw new PlatformNotSupportedException();
}

Expand All @@ -278,7 +294,7 @@ public static uint Extract(Vector256<uint> value, byte index)
/// </summary>
public static long Extract(Vector256<long> value, byte index)
{
if (IntPtr.Size != 8)
if (!IsSupported || (IntPtr.Size != 8))
{
throw new PlatformNotSupportedException();
}
Expand All @@ -291,7 +307,7 @@ public static long Extract(Vector256<long> value, byte index)
/// </summary>
public static ulong Extract(Vector256<ulong> value, byte index)
{
if (IntPtr.Size != 8)
if (!IsSupported || (IntPtr.Size != 8))
{
throw new PlatformNotSupportedException();
}
Expand Down Expand Up @@ -523,6 +539,11 @@ public static Vector256<uint> Insert(Vector256<uint> value, uint data, byte inde
/// </summary>
public static Vector256<long> Insert(Vector256<long> value, long data, byte index)
{
if (IntPtr.Size != 8)
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We shouldn't have some long/ulong helpers work on 32-bit and others not (disabled here since these were the only 64-bit helpers that were "working").

We should have a discussion on whether we:

  • Always support 64-bit overloads (when IsSupported is true)
    • This will require emulation on 32-bit
    • There is one instruction that I know of (movq) that deals with scalars and works on 32-bit
    • The vector intrinsics don't directly deal with 64-bit registers so Vector128<long/ulong> tends to work
  • Sometimes supports 64-bit overloads (depending on the underlying instruction and if it is a helper or not)
    • If we go this route, we should probably come up with a partitioning scheme for telling users these aren't supported on 32-bit
  • Never support 64-bit overloads on 32-bit OS

CC. @eerhardt

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We shouldn't have some long/ulong helpers work on 32-bit and others not

Agree. I think we should disable these two helpers for 32-bit platforms, native compilers also do it.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So with this change, all long/ulong operations will throw PNSE when running in a 32-bit process?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@eerhardt, not quite.

There are two basic types of operations:

  • Those that take/return only Vector128<long> or Vector128<ulong>
    • These are fine in both 32-bit and 64-bit mode, as they operate on 128-bit registers (which exist in both 32-bit and 64-bit modes), on 64-bit memory (Scalar Vector128<T>), or on 128/256-bit memory (Packed Vector128<T> and Packed Vector256<T>)
  • Those that take/return long or ulong directly, which falls into two-subcategories
    • Those that require a 64-bit register and cannot work on 32-bit
      • Sse2.StoreNonTemporal and Sse2.ConvertToInt64(Vector128<double>) are examples that don't work, as they require at least one 64-bit register
    • Those that can operate directly on 64-bit memory, and don't require a register
      • These sometimes work on 32-bit machines
      • ConvertToInt64(Vector128<long>) is an example that works, as it uses a special encoding of the movq instruction
      • ConvertScalarToVector128Double(Vector128<double>, long) is an example of one that supports taking just a 64-bit memory address, but is still not encodable on a 32-bit machine

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to me that we shouldn't do anything tantamount to emulation when executing on a 32-bit machine. That said, I'm wondering if there are suggestions as to how these limitations/exclusions are reflected in the usage model?

Copy link
Copy Markdown
Member Author

@tannergooding tannergooding Jul 2, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arm32/64 is differentiating by the Arm and Arm.Arm64 namespaces (however, this is also partially due to these being entirely different architectures, rather than hierarchical, like x86/x64).

Exposing a new class would probably work, but would also break the "make Sse2 inherit from Sse" proposal, since we don't have multiple inheritance or interfaces that can declare static methods.

Duplicating all the members under a separate x64 namespace would also work, but that is a lot of duplicated metadata for relatively few methods.

I had also thought of just updating the reference assembly to have a x86/x64 specific version; but we are exposing all architectures from a single package so the cross-architecture story is nicer (and so you don't need #ifdef)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Relying on an analyzer/documentation to help notify users of these incompatibilities seems like it might be the easiest way to do this...

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

{
throw new PlatformNotSupportedException();
}

Copy link
Copy Markdown
Member Author

@tannergooding tannergooding Jul 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as with Extract, we should determine if this is "better" than what all three native compilers do, which is:

vpinsrb        ; When index is into the lower 128-bits
vinsertf128

or

vextractf128
vpinsrb        ; When index is into the upper 128-bits
vinsertf128

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as the comment on Extract: the managed implementation here is just the non-constant fallback.

unsafe
{
index &= 0x3;
Expand All @@ -539,6 +560,11 @@ public static Vector256<long> Insert(Vector256<long> value, long data, byte inde
/// </summary>
public static Vector256<ulong> Insert(Vector256<ulong> value, ulong data, byte index)
{
if (IntPtr.Size != 8)
{
throw new PlatformNotSupportedException();
}

unsafe
{
index &= 0x3;
Expand Down
11 changes: 2 additions & 9 deletions tests/src/JIT/HardwareIntrinsics/X86/Avx/Extract.Byte.1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ public static partial class Program
private static void ExtractByte1()
{
var test = new SimpleUnaryOpTest__ExtractByte1();

try
{

if (test.IsSupported)
{
// Validates basic functionality works, using Unsafe.Read
Expand Down Expand Up @@ -77,11 +75,6 @@ private static void ExtractByte1()
// Validates we throw on unsupported hardware
test.RunUnsupportedScenario();
}
}
catch (PlatformNotSupportedException)
{
test.Succeeded = true;
}

if (!test.Succeeded)
{
Expand Down Expand Up @@ -126,7 +119,7 @@ public SimpleUnaryOpTest__ExtractByte1()
_dataTable = new SimpleUnaryOpTest__DataTable<Byte, Byte>(_data, new Byte[RetElementCount], LargestVectorSize);
}

public bool IsSupported => Avx.IsSupported;
public bool IsSupported => Avx.IsSupported && (Environment.Is64BitProcess || ((typeof(Byte) != typeof(long)) && (typeof(Byte) != typeof(ulong))));

public bool Succeeded { get; set; }

Expand Down
11 changes: 2 additions & 9 deletions tests/src/JIT/HardwareIntrinsics/X86/Avx/Extract.Byte.20.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ public static partial class Program
private static void ExtractByte20()
{
var test = new SimpleUnaryOpTest__ExtractByte20();

try
{

if (test.IsSupported)
{
// Validates basic functionality works, using Unsafe.Read
Expand Down Expand Up @@ -77,11 +75,6 @@ private static void ExtractByte20()
// Validates we throw on unsupported hardware
test.RunUnsupportedScenario();
}
}
catch (PlatformNotSupportedException)
{
test.Succeeded = true;
}

if (!test.Succeeded)
{
Expand Down Expand Up @@ -126,7 +119,7 @@ public SimpleUnaryOpTest__ExtractByte20()
_dataTable = new SimpleUnaryOpTest__DataTable<Byte, Byte>(_data, new Byte[RetElementCount], LargestVectorSize);
}

public bool IsSupported => Avx.IsSupported;
public bool IsSupported => Avx.IsSupported && (Environment.Is64BitProcess || ((typeof(Byte) != typeof(long)) && (typeof(Byte) != typeof(ulong))));

public bool Succeeded { get; set; }

Expand Down
11 changes: 2 additions & 9 deletions tests/src/JIT/HardwareIntrinsics/X86/Avx/Extract.Byte.52.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ public static partial class Program
private static void ExtractByte52()
{
var test = new SimpleUnaryOpTest__ExtractByte52();

try
{

if (test.IsSupported)
{
// Validates basic functionality works, using Unsafe.Read
Expand Down Expand Up @@ -77,11 +75,6 @@ private static void ExtractByte52()
// Validates we throw on unsupported hardware
test.RunUnsupportedScenario();
}
}
catch (PlatformNotSupportedException)
{
test.Succeeded = true;
}

if (!test.Succeeded)
{
Expand Down Expand Up @@ -126,7 +119,7 @@ public SimpleUnaryOpTest__ExtractByte52()
_dataTable = new SimpleUnaryOpTest__DataTable<Byte, Byte>(_data, new Byte[RetElementCount], LargestVectorSize);
}

public bool IsSupported => Avx.IsSupported;
public bool IsSupported => Avx.IsSupported && (Environment.Is64BitProcess || ((typeof(Byte) != typeof(long)) && (typeof(Byte) != typeof(ulong))));

public bool Succeeded { get; set; }

Expand Down
11 changes: 2 additions & 9 deletions tests/src/JIT/HardwareIntrinsics/X86/Avx/Extract.Int32.1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ public static partial class Program
private static void ExtractInt321()
{
var test = new SimpleUnaryOpTest__ExtractInt321();

try
{

if (test.IsSupported)
{
// Validates basic functionality works, using Unsafe.Read
Expand Down Expand Up @@ -77,11 +75,6 @@ private static void ExtractInt321()
// Validates we throw on unsupported hardware
test.RunUnsupportedScenario();
}
}
catch (PlatformNotSupportedException)
{
test.Succeeded = true;
}

if (!test.Succeeded)
{
Expand Down Expand Up @@ -126,7 +119,7 @@ public SimpleUnaryOpTest__ExtractInt321()
_dataTable = new SimpleUnaryOpTest__DataTable<Int32, Int32>(_data, new Int32[RetElementCount], LargestVectorSize);
}

public bool IsSupported => Avx.IsSupported;
public bool IsSupported => Avx.IsSupported && (Environment.Is64BitProcess || ((typeof(Int32) != typeof(long)) && (typeof(Int32) != typeof(ulong))));

public bool Succeeded { get; set; }

Expand Down
11 changes: 2 additions & 9 deletions tests/src/JIT/HardwareIntrinsics/X86/Avx/Extract.Int32.22.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ public static partial class Program
private static void ExtractInt3222()
{
var test = new SimpleUnaryOpTest__ExtractInt3222();

try
{

if (test.IsSupported)
{
// Validates basic functionality works, using Unsafe.Read
Expand Down Expand Up @@ -77,11 +75,6 @@ private static void ExtractInt3222()
// Validates we throw on unsupported hardware
test.RunUnsupportedScenario();
}
}
catch (PlatformNotSupportedException)
{
test.Succeeded = true;
}

if (!test.Succeeded)
{
Expand Down Expand Up @@ -126,7 +119,7 @@ public SimpleUnaryOpTest__ExtractInt3222()
_dataTable = new SimpleUnaryOpTest__DataTable<Int32, Int32>(_data, new Int32[RetElementCount], LargestVectorSize);
}

public bool IsSupported => Avx.IsSupported;
public bool IsSupported => Avx.IsSupported && (Environment.Is64BitProcess || ((typeof(Int32) != typeof(long)) && (typeof(Int32) != typeof(ulong))));

public bool Succeeded { get; set; }

Expand Down
11 changes: 2 additions & 9 deletions tests/src/JIT/HardwareIntrinsics/X86/Avx/Extract.Int32.6.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ public static partial class Program
private static void ExtractInt326()
{
var test = new SimpleUnaryOpTest__ExtractInt326();

try
{

if (test.IsSupported)
{
// Validates basic functionality works, using Unsafe.Read
Expand Down Expand Up @@ -77,11 +75,6 @@ private static void ExtractInt326()
// Validates we throw on unsupported hardware
test.RunUnsupportedScenario();
}
}
catch (PlatformNotSupportedException)
{
test.Succeeded = true;
}

if (!test.Succeeded)
{
Expand Down Expand Up @@ -126,7 +119,7 @@ public SimpleUnaryOpTest__ExtractInt326()
_dataTable = new SimpleUnaryOpTest__DataTable<Int32, Int32>(_data, new Int32[RetElementCount], LargestVectorSize);
}

public bool IsSupported => Avx.IsSupported;
public bool IsSupported => Avx.IsSupported && (Environment.Is64BitProcess || ((typeof(Int32) != typeof(long)) && (typeof(Int32) != typeof(ulong))));

public bool Succeeded { get; set; }

Expand Down
11 changes: 2 additions & 9 deletions tests/src/JIT/HardwareIntrinsics/X86/Avx/Extract.Int64.1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ public static partial class Program
private static void ExtractInt641()
{
var test = new SimpleUnaryOpTest__ExtractInt641();

try
{

if (test.IsSupported)
{
// Validates basic functionality works, using Unsafe.Read
Expand Down Expand Up @@ -77,11 +75,6 @@ private static void ExtractInt641()
// Validates we throw on unsupported hardware
test.RunUnsupportedScenario();
}
}
catch (PlatformNotSupportedException)
{
test.Succeeded = true;
}

if (!test.Succeeded)
{
Expand Down Expand Up @@ -126,7 +119,7 @@ public SimpleUnaryOpTest__ExtractInt641()
_dataTable = new SimpleUnaryOpTest__DataTable<Int64, Int64>(_data, new Int64[RetElementCount], LargestVectorSize);
}

public bool IsSupported => Avx.IsSupported;
public bool IsSupported => Avx.IsSupported && (Environment.Is64BitProcess || ((typeof(Int64) != typeof(long)) && (typeof(Int64) != typeof(ulong))));

public bool Succeeded { get; set; }

Expand Down
11 changes: 2 additions & 9 deletions tests/src/JIT/HardwareIntrinsics/X86/Avx/Extract.Int64.19.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ public static partial class Program
private static void ExtractInt6419()
{
var test = new SimpleUnaryOpTest__ExtractInt6419();

try
{

if (test.IsSupported)
{
// Validates basic functionality works, using Unsafe.Read
Expand Down Expand Up @@ -77,11 +75,6 @@ private static void ExtractInt6419()
// Validates we throw on unsupported hardware
test.RunUnsupportedScenario();
}
}
catch (PlatformNotSupportedException)
{
test.Succeeded = true;
}

if (!test.Succeeded)
{
Expand Down Expand Up @@ -126,7 +119,7 @@ public SimpleUnaryOpTest__ExtractInt6419()
_dataTable = new SimpleUnaryOpTest__DataTable<Int64, Int64>(_data, new Int64[RetElementCount], LargestVectorSize);
}

public bool IsSupported => Avx.IsSupported;
public bool IsSupported => Avx.IsSupported && (Environment.Is64BitProcess || ((typeof(Int64) != typeof(long)) && (typeof(Int64) != typeof(ulong))));

public bool Succeeded { get; set; }

Expand Down
11 changes: 2 additions & 9 deletions tests/src/JIT/HardwareIntrinsics/X86/Avx/Extract.Int64.3.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ public static partial class Program
private static void ExtractInt643()
{
var test = new SimpleUnaryOpTest__ExtractInt643();

try
{

if (test.IsSupported)
{
// Validates basic functionality works, using Unsafe.Read
Expand Down Expand Up @@ -77,11 +75,6 @@ private static void ExtractInt643()
// Validates we throw on unsupported hardware
test.RunUnsupportedScenario();
}
}
catch (PlatformNotSupportedException)
{
test.Succeeded = true;
}

if (!test.Succeeded)
{
Expand Down Expand Up @@ -126,7 +119,7 @@ public SimpleUnaryOpTest__ExtractInt643()
_dataTable = new SimpleUnaryOpTest__DataTable<Int64, Int64>(_data, new Int64[RetElementCount], LargestVectorSize);
}

public bool IsSupported => Avx.IsSupported;
public bool IsSupported => Avx.IsSupported && (Environment.Is64BitProcess || ((typeof(Int64) != typeof(long)) && (typeof(Int64) != typeof(ulong))));

public bool Succeeded { get; set; }

Expand Down
Loading