From c28ba7efcea82b035ce5c3b4edd888fc38f8fdf9 Mon Sep 17 00:00:00 2001 From: Fei Peng Date: Sat, 3 Feb 2018 23:40:04 -0800 Subject: [PATCH 1/3] Implement AVX LoadAlignedVector256, LoadDquVector256, and LoadVector256 --- src/jit/hwintrinsiclistxarch.h | 3 + src/jit/instrsxarch.h | 2 + .../X86/Avx/LoadAlignedVector256.cs | 279 ++++++++++++++++++ .../X86/Avx/LoadAlignedVector256_r.csproj | 34 +++ .../X86/Avx/LoadAlignedVector256_ro.csproj | 34 +++ .../X86/Avx/LoadDquVector256.cs | 203 +++++++++++++ .../X86/Avx/LoadDquVector256_r.csproj | 34 +++ .../X86/Avx/LoadDquVector256_ro.csproj | 34 +++ .../X86/Avx/LoadVector256.cs | 237 +++++++++++++++ .../X86/Avx/LoadVector256_r.csproj | 34 +++ .../X86/Avx/LoadVector256_ro.csproj | 34 +++ 11 files changed, 928 insertions(+) create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256.cs create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256_r.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256_ro.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256.cs create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256_r.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256_ro.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256.cs create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256_r.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256_ro.csproj diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index 7e180099985d..a7824e2fc442 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -200,6 +200,9 @@ HARDWARE_INTRINSIC(AVX_Add, "Add", HARDWARE_INTRINSIC(AVX_Multiply, "Multiply", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, 
HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX_Reciprocal, "Reciprocal", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_BlendVariable, "BlendVariable", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_LoadAlignedVector256, "LoadAlignedVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_LoadDquVector256, "LoadDquVector256", AVX, -1, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_LoadVector256, "LoadVector256", AVX, -1, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_Store, "Store", AVX, -1, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_StoreAligned, "StoreAligned", AVX, -1, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", AVX, - 1, 32, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoFlag) diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index 
9b8f71bd6c56..5d934aed58a2 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -413,6 +413,8 @@ INST3( pmuldq, "pmuldq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( blendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x14)) // Variable Blend Packed Singles INST3( blendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x15)) // Variable Blend Packed Doubles INST3( pblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x10)) // Variable Blend Packed Bytes +INST3( lddqu, "lddqu" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEDBL(0xF0)) // Load Unaligned integer +INST3( movntdqa, "movntdqa" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2A)) // Load Double Quadword Non-Temporal Aligned Hint INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256.cs b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256.cs new file mode 100644 index 000000000000..27587caeac0e --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256.cs @@ -0,0 +1,279 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+// + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace IntelHardwareIntrinsicTest +{ + class Program + { + const int Pass = 100; + const int Fail = 0; + + static unsafe int Main(string[] args) + { + int testResult = Pass; + + if (Avx.IsSupported) + { + { + byte* inBuffer = stackalloc byte[64]; + float* inArray = (float*)Align(inBuffer, 32); + float* outArray = stackalloc float[8]; + var vf = Avx.LoadAlignedVector256(inArray); + Unsafe.Write(outArray, vf); + + for (var i = 0; i < 8; i++) + { + if (BitConverter.SingleToInt32Bits(inArray[i]) != BitConverter.SingleToInt32Bits(outArray[i])) + { + Console.WriteLine("AVX LoadAlignedVector256 failed on float:"); + for (var n = 0; n < 8; n++) + { + Console.Write(outArray[n] + ", "); + } + Console.WriteLine(); + + testResult = Fail; + break; + } + } + } + + { + byte* inBuffer = stackalloc byte[64]; + double* inArray = (double*)Align(inBuffer, 32); + double* outArray = stackalloc double[4]; + var vf = Avx.LoadAlignedVector256(inArray); + Unsafe.Write(outArray, vf); + + for (var i = 0; i < 4; i++) + { + if (BitConverter.DoubleToInt64Bits(inArray[i]) != BitConverter.DoubleToInt64Bits(outArray[i])) + { + Console.WriteLine("AVX LoadAlignedVector256 failed on double:"); + for (var n = 0; n < 4; n++) + { + Console.Write(outArray[n] + ", "); + } + Console.WriteLine(); + + testResult = Fail; + break; + } + } + } + + { + byte* inBuffer = stackalloc byte[64]; + int* inArray = (int*)Align(inBuffer, 32); + int* outArray = stackalloc int[8]; + var vf = Avx.LoadAlignedVector256(inArray); + Unsafe.Write(outArray, vf); + + for (var i = 0; i < 8; i++) + { + if (inArray[i] != outArray[i]) + { + Console.WriteLine("AVX LoadAlignedVector256 failed on int:"); + for (var n = 0; n < 8; n++) + { + Console.Write(outArray[n] + ", "); + } + Console.WriteLine(); + + testResult = Fail; + break; + } + } + } + + { + byte* 
inBuffer = stackalloc byte[64]; + long* inArray = (long*)Align(inBuffer, 32); + long* outArray = stackalloc long[4]; + var vf = Avx.LoadAlignedVector256(inArray); + Unsafe.Write(outArray, vf); + + for (var i = 0; i < 4; i++) + { + if (inArray[i] != outArray[i]) + { + Console.WriteLine("AVX LoadAlignedVector256 failed on long:"); + for (var n = 0; n < 4; n++) + { + Console.Write(outArray[n] + ", "); + } + Console.WriteLine(); + + testResult = Fail; + break; + } + } + } + + { + byte* inBuffer = stackalloc byte[64]; + uint* inArray = (uint*)Align(inBuffer, 32); + uint* outArray = stackalloc uint[8]; + var vf = Avx.LoadAlignedVector256(inArray); + Unsafe.Write(outArray, vf); + + for (var i = 0; i < 8; i++) + { + if (inArray[i] != outArray[i]) + { + Console.WriteLine("AVX LoadAlignedVector256 failed on uint:"); + for (var n = 0; n < 8; n++) + { + Console.Write(outArray[n] + ", "); + } + Console.WriteLine(); + + testResult = Fail; + break; + } + } + } + + { + byte* inBuffer = stackalloc byte[64]; + ulong* inArray = (ulong*)Align(inBuffer, 32); + ulong* outArray = stackalloc ulong[4]; + var vf = Avx.LoadAlignedVector256(inArray); + Unsafe.Write(outArray, vf); + + for (var i = 0; i < 4; i++) + { + if (inArray[i] != outArray[i]) + { + Console.WriteLine("AVX LoadAlignedVector256 failed on ulong:"); + for (var n = 0; n < 4; n++) + { + Console.Write(outArray[n] + ", "); + } + Console.WriteLine(); + + testResult = Fail; + break; + } + } + } + + { + byte* inBuffer = stackalloc byte[64]; + short* inArray = (short*)Align(inBuffer, 32); + short* outArray = stackalloc short[16]; + var vf = Avx.LoadAlignedVector256(inArray); + Unsafe.Write(outArray, vf); + + for (var i = 0; i < 16; i++) + { + if (inArray[i] != outArray[i]) + { + Console.WriteLine("AVX LoadAlignedVector256 failed on short:"); + for (var n = 0; n < 16; n++) + { + Console.Write(outArray[n] + ", "); + } + Console.WriteLine(); + + testResult = Fail; + break; + } + } + } + + { + byte* inBuffer = stackalloc byte[64]; + 
ushort* inArray = (ushort*)Align(inBuffer, 32); + ushort* outArray = stackalloc ushort[16]; + var vf = Avx.LoadAlignedVector256(inArray); + Unsafe.Write(outArray, vf); + + for (var i = 0; i < 16; i++) + { + if (inArray[i] != outArray[i]) + { + Console.WriteLine("AVX LoadAlignedVector256 failed on ushort:"); + for (var n = 0; n < 16; n++) + { + Console.Write(outArray[n] + ", "); + } + Console.WriteLine(); + + testResult = Fail; + break; + } + } + } + + { + byte* inBuffer = stackalloc byte[64]; + sbyte* inArray = (sbyte*)Align(inBuffer, 32); + sbyte* outArray = stackalloc sbyte[32]; + var vf = Avx.LoadAlignedVector256(inArray); + Unsafe.Write(outArray, vf); + + for (var i = 0; i < 32; i++) + { + if (inArray[i] != outArray[i]) + { + Console.WriteLine("AVX LoadAlignedVector256 failed on sbyte:"); + for (var n = 0; n < 32; n++) + { + Console.Write(outArray[n] + ", "); + } + Console.WriteLine(); + + testResult = Fail; + break; + } + } + } + + { + byte* inBuffer = stackalloc byte[64]; + byte* inArray = (byte*)Align(inBuffer, 32); + byte* outArray = stackalloc byte[32]; + var vf = Avx.LoadAlignedVector256(inArray); + Unsafe.Write(outArray, vf); + + for (var i = 0; i < 32; i++) + { + if (inArray[i] != outArray[i]) + { + Console.WriteLine("AVX LoadAlignedVector256 failed on byte:"); + for (var n = 0; n < 32; n++) + { + Console.Write(outArray[n] + ", "); + } + Console.WriteLine(); + + testResult = Fail; + break; + } + } + } + } + + return testResult; + } + + static unsafe void* Align(byte* buffer, byte expectedAlignment) + { + // Compute how bad the misalignment is, which is at most (expectedAlignment - 1). + // Then subtract that from the expectedAlignment and add it to the original address + // to compute the aligned address. 
+ + var misalignment = expectedAlignment - ((ulong)(buffer) % expectedAlignment); + return (void*)(buffer + misalignment); + } + } +} diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256_r.csproj new file mode 100644 index 000000000000..1aa37e5c2a74 --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256_r.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + + + + + + + + + + + diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256_ro.csproj new file mode 100644 index 000000000000..9a50380fb458 --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadAlignedVector256_ro.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + True + + + + + + + + + + diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256.cs b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256.cs new file mode 100644 index 000000000000..f22165f0a579 --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256.cs @@ -0,0 +1,203 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+// + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace IntelHardwareIntrinsicTest +{ + class Program + { + const int Pass = 100; + const int Fail = 0; + + static unsafe int Main(string[] args) + { + int testResult = Pass; + + if (Avx.IsSupported) + { + using (TestTable<int> intTable = new TestTable<int>(new int[8] { 1, -5, 100, 0, 1, 2, 3, 4 }, new int[8])) + { + var vf = Avx.LoadDquVector256((int*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadDquVector256 failed on int:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<uint> intTable = new TestTable<uint>(new uint[8] { 1, 5, 100, 0, 1, 2, 3, 4 }, new uint[8])) + { + var vf = Avx.LoadDquVector256((uint*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadDquVector256 failed on uint:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<long> intTable = new TestTable<long>(new long[4] { 1, -5, 100, 0 }, new long[4])) + { + var vf = Avx.LoadDquVector256((long*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadDquVector256 failed on long:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<ulong> intTable = new TestTable<ulong>(new ulong[4] { 1, 5, 100, 0 }, new ulong[4])) + { + var vf = Avx.LoadDquVector256((ulong*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + 
Console.WriteLine("AVX LoadDquVector256 failed on ulong:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<short> intTable = new TestTable<short>(new short[16] { 1, -5, 100, 0, 1, 2, 3, 4, 1, -5, 100, 0, 1, 2, 3, 4 }, new short[16])) + { + var vf = Avx.LoadDquVector256((short*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadDquVector256 failed on short:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<ushort> intTable = new TestTable<ushort>(new ushort[16] { 1, 5, 100, 0, 1, 2, 3, 4, 1, 5, 100, 0, 1, 2, 3, 4 }, new ushort[16])) + { + var vf = Avx.LoadDquVector256((ushort*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadDquVector256 failed on ushort:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<byte> intTable = new TestTable<byte>(new byte[32] { 1, 5, 100, 0, 1, 2, 3, 4, 1, 5, 100, 0, 1, 2, 3, 4, 1, 5, 100, 0, 1, 2, 3, 4, 1, 5, 100, 0, 1, 2, 3, 4 }, new byte[32])) + { + var vf = Avx.LoadDquVector256((byte*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadDquVector256 failed on byte:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<sbyte> intTable = new TestTable<sbyte>(new sbyte[32] { 1, -5, 100, 0, 1, 2, 3, 4, 1, -5, 100, 0, 1, 2, 3, 4, 1, -5, 100, 0, 1, 2, 3, 4, 1, -5, 100, 0, 1, 2, 3, 4 }, new sbyte[32])) + { + var vf = Avx.LoadDquVector256((sbyte*)(intTable.inArrayPtr)); + 
Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadDquVector256 failed on sbyte:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + } + + return testResult; + } + + public unsafe struct TestTable<T> : IDisposable where T : struct + { + public T[] inArray; + public T[] outArray; + + public void* inArrayPtr => inHandle.AddrOfPinnedObject().ToPointer(); + public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer(); + + GCHandle inHandle; + GCHandle outHandle; + public TestTable(T[] a, T[] b) + { + this.inArray = a; + this.outArray = b; + + inHandle = GCHandle.Alloc(inArray, GCHandleType.Pinned); + outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned); + } + public bool CheckResult(Func<T, T, bool> check) + { + for (int i = 0; i < inArray.Length; i++) + { + if (!check(inArray[i], outArray[i])) + { + return false; + } + } + return true; + } + + public void Dispose() + { + inHandle.Free(); + outHandle.Free(); + } + } + + } +} diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256_r.csproj new file mode 100644 index 000000000000..c95a9b32f237 --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256_r.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + + + + + + + + + + + diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256_ro.csproj new file mode 100644 index 000000000000..c6f2efe9af4b --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadDquVector256_ro.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + 
{95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + True + + + + + + + + + + diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256.cs b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256.cs new file mode 100644 index 000000000000..8393a9002256 --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256.cs @@ -0,0 +1,237 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace IntelHardwareIntrinsicTest +{ + class Program + { + const int Pass = 100; + const int Fail = 0; + + static unsafe int Main(string[] args) + { + int testResult = Pass; + + if (Avx.IsSupported) + { + using (TestTable<float> floatTable = new TestTable<float>(new float[8] { 1, -5, 100, 0, 1, 2, 3, 4 }, new float[8])) + { + var vf = Avx.LoadVector256((float*)(floatTable.inArrayPtr)); + Unsafe.Write(floatTable.outArrayPtr, vf); + + if (!floatTable.CheckResult((x, y) => BitConverter.SingleToInt32Bits(x) == BitConverter.SingleToInt32Bits(y))) + { + Console.WriteLine("AVX LoadVector256 failed on float:"); + foreach (var item in floatTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<double> doubleTable = new TestTable<double>(new double[4] { 1, -5, 100, 0}, new double[4])) + { + var vf = Avx.LoadVector256((double*)(doubleTable.inArrayPtr)); + Unsafe.Write(doubleTable.outArrayPtr, vf); + + if (!doubleTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y))) + { + Console.WriteLine("AVX LoadVector256 failed on double:"); + foreach 
(var item in doubleTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<int> intTable = new TestTable<int>(new int[8] { 1, -5, 100, 0, 1, 2, 3, 4 }, new int[8])) + { + var vf = Avx.LoadVector256((int*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadVector256 failed on int:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<uint> intTable = new TestTable<uint>(new uint[8] { 1, 5, 100, 0, 1, 2, 3, 4 }, new uint[8])) + { + var vf = Avx.LoadVector256((uint*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadVector256 failed on uint:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<long> intTable = new TestTable<long>(new long[4] { 1, -5, 100, 0 }, new long[4])) + { + var vf = Avx.LoadVector256((long*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadVector256 failed on long:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<ulong> intTable = new TestTable<ulong>(new ulong[4] { 1, 5, 100, 0 }, new ulong[4])) + { + var vf = Avx.LoadVector256((ulong*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadVector256 failed on ulong:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<short> intTable = new TestTable<short>(new short[16] { 1, -5, 100, 0, 1, 2, 3, 4, 1, 
-5, 100, 0, 1, 2, 3, 4 }, new short[16])) + { + var vf = Avx.LoadVector256((short*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadVector256 failed on short:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<ushort> intTable = new TestTable<ushort>(new ushort[16] { 1, 5, 100, 0, 1, 2, 3, 4, 1, 5, 100, 0, 1, 2, 3, 4 }, new ushort[16])) + { + var vf = Avx.LoadVector256((ushort*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadVector256 failed on ushort:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<byte> intTable = new TestTable<byte>(new byte[32] { 1, 5, 100, 0, 1, 2, 3, 4, 1, 5, 100, 0, 1, 2, 3, 4, 1, 5, 100, 0, 1, 2, 3, 4, 1, 5, 100, 0, 1, 2, 3, 4 }, new byte[32])) + { + var vf = Avx.LoadVector256((byte*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadVector256 failed on byte:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + + using (TestTable<sbyte> intTable = new TestTable<sbyte>(new sbyte[32] { 1, -5, 100, 0, 1, 2, 3, 4, 1, -5, 100, 0, 1, 2, 3, 4, 1, -5, 100, 0, 1, 2, 3, 4, 1, -5, 100, 0, 1, 2, 3, 4 }, new sbyte[32])) + { + var vf = Avx.LoadVector256((sbyte*)(intTable.inArrayPtr)); + Unsafe.Write(intTable.outArrayPtr, vf); + + if (!intTable.CheckResult((x, y) => x == y)) + { + Console.WriteLine("AVX LoadVector256 failed on sbyte:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + } + + return testResult; + } + + public unsafe 
struct TestTable<T> : IDisposable where T : struct + { + public T[] inArray; + public T[] outArray; + + public void* inArrayPtr => inHandle.AddrOfPinnedObject().ToPointer(); + public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer(); + + GCHandle inHandle; + GCHandle outHandle; + public TestTable(T[] a, T[] b) + { + this.inArray = a; + this.outArray = b; + + inHandle = GCHandle.Alloc(inArray, GCHandleType.Pinned); + outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned); + } + public bool CheckResult(Func<T, T, bool> check) + { + for (int i = 0; i < inArray.Length; i++) + { + if (!check(inArray[i], outArray[i])) + { + return false; + } + } + return true; + } + + public void Dispose() + { + inHandle.Free(); + outHandle.Free(); + } + } + + } +} diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256_r.csproj new file mode 100644 index 000000000000..881d59843e2c --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256_r.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + + + + + + + + + + + diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256_ro.csproj new file mode 100644 index 000000000000..7a6927ceaf2d --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/LoadVector256_ro.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + True + + + + + + + + + + From 8f2ff46bb3a3b5105f5cea0b4a0ab2451fa46f1a Mon Sep 17 00:00:00 2001 From: Fei Peng Date: Sun, 4 Feb 2018 11:33:51 -0800 Subject: [PATCH 2/3] Implement SSE3 
LoadAndDuplicateToVector128 and LoadDquVector128 --- src/jit/hwintrinsiclistxarch.h | 2 + .../X86/Sse3.PlatformNotSupported.cs | 1 + .../src/System/Runtime/Intrinsics/X86/Sse3.cs | 1 + .../X86/Sse3/LoadAndDuplicateToVector128.cs | 85 ++++++++ .../Sse3/LoadAndDuplicateToVector128_r.csproj | 34 +++ .../LoadAndDuplicateToVector128_ro.csproj | 34 +++ .../X86/Sse3/LoadDquVector128.cs | 203 ++++++++++++++++++ .../X86/Sse3/LoadDquVector128_r.csproj | 34 +++ .../X86/Sse3/LoadDquVector128_ro.csproj | 34 +++ 9 files changed, 428 insertions(+) create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128.cs create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128_r.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128_ro.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128.cs create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128_r.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128_ro.csproj diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index a7824e2fc442..2b44ad78fcd9 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -179,6 +179,8 @@ HARDWARE_INTRINSIC(SSE2_Xor, "Xor", // SSE3 Intrinsics HARDWARE_INTRINSIC(SSE3_IsSupported, "get_IsSupported", SSE3, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE3_LoadAndDuplicateToVector128, "LoadAndDuplicateToVector128", SSE3, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE3_LoadDquVector128, "LoadDquVector128", SSE3, -1, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, 
INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) // SSSE3 Intrinsics HARDWARE_INTRINSIC(SSSE3_IsSupported, "get_IsSupported", SSSE3, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) diff --git a/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse3.PlatformNotSupported.cs b/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse3.PlatformNotSupported.cs index b7eb406107bc..f2d21bb51e68 100644 --- a/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse3.PlatformNotSupported.cs +++ b/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse3.PlatformNotSupported.cs @@ -50,6 +50,7 @@ public static class Sse3 /// <summary> /// __m128d _mm_loaddup_pd (double const* mem_addr) + /// MOVDDUP xmm, m64 /// </summary> public static unsafe Vector128<double> LoadAndDuplicateToVector128(double* address) { throw new PlatformNotSupportedException(); } diff --git a/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse3.cs b/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse3.cs index 5f183dd30f16..b7dd1de014e5 100644 --- a/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse3.cs +++ b/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse3.cs @@ -50,6 +50,7 @@ public static class Sse3 /// <summary> /// __m128d _mm_loaddup_pd (double const* mem_addr) + /// MOVDDUP xmm, m64 /// </summary> public static unsafe Vector128<double> LoadAndDuplicateToVector128(double* address) => LoadAndDuplicateToVector128(address); diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128.cs b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128.cs new file mode 100644 index 000000000000..a9a05cf1e5ca --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128.cs @@ -0,0 +1,85 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. +// + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace IntelHardwareIntrinsicTest +{ + class Program + { + const int Pass = 100; + const int Fail = 0; + + static unsafe int Main(string[] args) + { + int testResult = Pass; + + if (Sse3.IsSupported) + { + using (TestTable<double> doubleTable = new TestTable<double>(new double[2] { 1, -5 }, new double[2])) + { + var vf = Sse3.LoadAndDuplicateToVector128((double*)(doubleTable.inArrayPtr)); + Unsafe.Write(doubleTable.outArrayPtr, vf); + + if (BitConverter.DoubleToInt64Bits(doubleTable.inArray[0]) != BitConverter.DoubleToInt64Bits(doubleTable.outArray[0]) || + BitConverter.DoubleToInt64Bits(doubleTable.inArray[0]) != BitConverter.DoubleToInt64Bits(doubleTable.outArray[1])) + { + Console.WriteLine("Sse3 LoadAndDuplicateToVector128 failed on double:"); + foreach (var item in doubleTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + } + + return testResult; + } + + public unsafe struct TestTable<T> : IDisposable where T : struct + { + public T[] inArray; + public T[] outArray; + + public void* inArrayPtr => inHandle.AddrOfPinnedObject().ToPointer(); + public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer(); + + GCHandle inHandle; + GCHandle outHandle; + public TestTable(T[] a, T[] b) + { + this.inArray = a; + this.outArray = b; + + inHandle = GCHandle.Alloc(inArray, GCHandleType.Pinned); + outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned); + } + public bool CheckResult(Func<T, T, bool> check) + { + for (int i = 0; i < inArray.Length; i++) + { + if (!check(inArray[i], outArray[i])) + { + return false; + } + } + return true; + } + + public void Dispose() + { + inHandle.Free(); + outHandle.Free(); + } + } + + } +} diff --git 
a/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128_r.csproj
new file mode 100644
index 000000000000..b29afad37d50
--- /dev/null
+++ b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128_r.csproj
@@ -0,0 +1,34 @@
+ + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + + + + + + + + + + +
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128_ro.csproj
new file mode 100644
index 000000000000..224e4aea7b1c
--- /dev/null
+++ b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadAndDuplicateToVector128_ro.csproj
@@ -0,0 +1,34 @@
+ + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + True + + + + + + + + + +
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128.cs b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128.cs
new file mode 100644
index 000000000000..ae7c06871b37
--- /dev/null
+++ b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128.cs
@@ -0,0 +1,209 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+
+namespace IntelHardwareIntrinsicTest
+{
+    class Program
+    {
+        const int Pass = 100;
+        const int Fail = 0;
+
+        // For every integer element type, loads 16 bytes with Sse3.LoadDquVector128, writes the
+        // vector to the output array and checks that it equals the input element by element.
+        // Returns Pass or Fail; when Sse3 is not supported nothing is checked and Pass is returned.
+        static unsafe int Main(string[] args)
+        {
+            int testResult = Pass;
+
+            if (Sse3.IsSupported)
+            {
+                using (TestTable<int> intTable = new TestTable<int>(new int[4] { 1, -5, 100, 0 }, new int[4]))
+                {
+                    var vf = Sse3.LoadDquVector128((int*)(intTable.inArrayPtr));
+                    Unsafe.Write(intTable.outArrayPtr, vf);
+
+                    if (!intTable.CheckResult((x, y) => x == y))
+                    {
+                        Console.WriteLine("Sse3 LoadDquVector128 failed on int:");
+                        foreach (var item in intTable.outArray)
+                        {
+                            Console.Write(item + ", ");
+                        }
+                        Console.WriteLine();
+                        testResult = Fail;
+                    }
+                }
+
+                using (TestTable<uint> uintTable = new TestTable<uint>(new uint[4] { 1, 5, 100, 0 }, new uint[4]))
+                {
+                    var vf = Sse3.LoadDquVector128((uint*)(uintTable.inArrayPtr));
+                    Unsafe.Write(uintTable.outArrayPtr, vf);
+
+                    if (!uintTable.CheckResult((x, y) => x == y))
+                    {
+                        Console.WriteLine("Sse3 LoadDquVector128 failed on uint:");
+                        foreach (var item in uintTable.outArray)
+                        {
+                            Console.Write(item + ", ");
+                        }
+                        Console.WriteLine();
+                        testResult = Fail;
+                    }
+                }
+
+                using (TestTable<long> longTable = new TestTable<long>(new long[2] { 1, -5 }, new long[2]))
+                {
+                    var vf = Sse3.LoadDquVector128((long*)(longTable.inArrayPtr));
+                    Unsafe.Write(longTable.outArrayPtr, vf);
+
+                    if (!longTable.CheckResult((x, y) => x == y))
+                    {
+                        Console.WriteLine("Sse3 LoadDquVector128 failed on long:");
+                        foreach (var item in longTable.outArray)
+                        {
+                            Console.Write(item + ", ");
+                        }
+                        Console.WriteLine();
+                        testResult = Fail;
+                    }
+                }
+
+                using (TestTable<ulong> ulongTable = new TestTable<ulong>(new ulong[2] { 1, 5 }, new ulong[2]))
+                {
+                    var vf = Sse3.LoadDquVector128((ulong*)(ulongTable.inArrayPtr));
+                    Unsafe.Write(ulongTable.outArrayPtr, vf);
+
+                    if (!ulongTable.CheckResult((x, y) => x == y))
+                    {
+                        Console.WriteLine("Sse3 LoadDquVector128 failed on ulong:");
+                        foreach (var item in ulongTable.outArray)
+                        {
+                            Console.Write(item + ", ");
+                        }
+                        Console.WriteLine();
+                        testResult = Fail;
+                    }
+                }
+
+                using (TestTable<short> shortTable = new TestTable<short>(new short[8] { 1, -5, 100, 0, 1, 2, 3, 4 }, new short[8]))
+                {
+                    var vf = Sse3.LoadDquVector128((short*)(shortTable.inArrayPtr));
+                    Unsafe.Write(shortTable.outArrayPtr, vf);
+
+                    if (!shortTable.CheckResult((x, y) => x == y))
+                    {
+                        Console.WriteLine("Sse3 LoadDquVector128 failed on short:");
+                        foreach (var item in shortTable.outArray)
+                        {
+                            Console.Write(item + ", ");
+                        }
+                        Console.WriteLine();
+                        testResult = Fail;
+                    }
+                }
+
+                using (TestTable<ushort> ushortTable = new TestTable<ushort>(new ushort[8] { 1, 5, 100, 0, 1, 2, 3, 4 }, new ushort[8]))
+                {
+                    var vf = Sse3.LoadDquVector128((ushort*)(ushortTable.inArrayPtr));
+                    Unsafe.Write(ushortTable.outArrayPtr, vf);
+
+                    if (!ushortTable.CheckResult((x, y) => x == y))
+                    {
+                        Console.WriteLine("Sse3 LoadDquVector128 failed on ushort:");
+                        foreach (var item in ushortTable.outArray)
+                        {
+                            Console.Write(item + ", ");
+                        }
+                        Console.WriteLine();
+                        testResult = Fail;
+                    }
+                }
+
+                using (TestTable<byte> byteTable = new TestTable<byte>(new byte[16] { 1, 5, 100, 0, 1, 2, 3, 4, 1, 5, 100, 0, 1, 2, 3, 4 }, new byte[16]))
+                {
+                    var vf = Sse3.LoadDquVector128((byte*)(byteTable.inArrayPtr));
+                    Unsafe.Write(byteTable.outArrayPtr, vf);
+
+                    if (!byteTable.CheckResult((x, y) => x == y))
+                    {
+                        Console.WriteLine("Sse3 LoadDquVector128 failed on byte:");
+                        foreach (var item in byteTable.outArray)
+                        {
+                            Console.Write(item + ", ");
+                        }
+                        Console.WriteLine();
+                        testResult = Fail;
+                    }
+                }
+
+                using (TestTable<sbyte> sbyteTable = new TestTable<sbyte>(new sbyte[16] { 1, -5, 100, 0, 1, 2, 3, 4, 1, -5, 100, 0, 1, 2, 3, 4 }, new sbyte[16]))
+                {
+                    var vf = Sse3.LoadDquVector128((sbyte*)(sbyteTable.inArrayPtr));
+                    Unsafe.Write(sbyteTable.outArrayPtr, vf);
+
+                    if (!sbyteTable.CheckResult((x, y) => x == y))
+                    {
+                        Console.WriteLine("Sse3 LoadDquVector128 failed on sbyte:");
+                        foreach (var item in sbyteTable.outArray)
+                        {
+                            Console.Write(item + ", ");
+                        }
+                        Console.WriteLine();
+                        testResult = Fail;
+                    }
+                }
+            }
+
+            return testResult;
+        }
+
+        // Holds an input and an output array, pinned for the lifetime of the table so that
+        // raw pointers to their elements can be handed to the intrinsics. Dispose unpins both.
+        public unsafe struct TestTable<T> : IDisposable where T : struct
+        {
+            public T[] inArray;
+            public T[] outArray;
+
+            public void* inArrayPtr => inHandle.AddrOfPinnedObject().ToPointer();
+            public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer();
+
+            GCHandle inHandle;
+            GCHandle outHandle;
+            public TestTable(T[] a, T[] b)
+            {
+                this.inArray = a;
+                this.outArray = b;
+
+                inHandle = GCHandle.Alloc(inArray, GCHandleType.Pinned);
+                outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned);
+            }
+            // Returns true when check(inArray[i], outArray[i]) holds for every index of inArray.
+            public bool CheckResult(Func<T, T, bool> check)
+            {
+                for (int i = 0; i < inArray.Length; i++)
+                {
+                    if (!check(inArray[i], outArray[i]))
+                    {
+                        return false;
+                    }
+                }
+                return true;
+            }
+
+            public void Dispose()
+            {
+                inHandle.Free();
+                outHandle.Free();
+            }
+        }
+
+    }
+}
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128_r.csproj
new file mode 100644
index 000000000000..ef3080cc0fe4
--- /dev/null
+++ b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128_r.csproj
@@ -0,0 +1,34 @@
+ + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + + + + + + + + + + +
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128_ro.csproj
new file mode 100644
index 000000000000..00df60bc3ece
--- /dev/null
+++ b/tests/src/JIT/HardwareIntrinsics/X86/Sse3/LoadDquVector128_ro.csproj
@@ -0,0 +1,34 @@
+ + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + True + + + + + + + + + +
From
747b121284f53cd7aec2915cbc15138508826fe8 Mon Sep 17 00:00:00 2001 From: Fei Peng Date: Mon, 5 Feb 2018 12:14:52 -0800 Subject: [PATCH 3/3] Implement AVX2 LoadAlignedVector256/128NonTemporal --- src/jit/hwintrinsiclistxarch.h | 1 + src/jit/instrsxarch.h | 1 + src/jit/lowerxarch.cpp | 2 + .../Avx2/LoadAlignedVector256NonTemporal.cs | 231 ++++++++++++++++++ .../LoadAlignedVector256NonTemporal_r.csproj | 34 +++ .../LoadAlignedVector256NonTemporal_ro.csproj | 34 +++ 6 files changed, 303 insertions(+) create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal.cs create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal_r.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal_ro.csproj diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index 2b44ad78fcd9..05f00d283dc1 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -214,6 +214,7 @@ HARDWARE_INTRINSIC(AVX2_IsSupported, "get_IsSupp HARDWARE_INTRINSIC(AVX2_Add, "Add", AVX2, -1, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2_Multiply, "Multiply", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2_BlendVariable, "BlendVariable", AVX2, -1, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal, "LoadAlignedVector256NonTemporal", AVX2, -1, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, 
INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) // AES Intrinsics HARDWARE_INTRINSIC(AES_IsSupported, "get_IsSupported", AES, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index 5d934aed58a2..eedfd6352b23 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -415,6 +415,7 @@ INST3( blendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( pblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x10)) // Variable Blend Packed Bytes INST3( lddqu, "lddqu" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEDBL(0xF0)) // Load Unaligned integer INST3( movntdqa, "movntdqa" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2A)) // Load Double Quadword Non-Temporal Aligned Hint +INST3( movddup, "movddup" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEDBL(0x12)) // Replicate Double FP Values INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index 28ce7a8a45b0..0c2f2a250b87 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -2344,6 +2344,8 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge // VEX encoding supports unaligned memory ops, so we can fold them case NI_SSE_LoadVector128: case NI_SSE2_LoadVector128: + case NI_AVX_LoadVector256: + case NI_AVX_LoadAlignedVector256: isContainable = (containingCategory == HW_Category_SimpleSIMD) && comp->canUseVexEncoding(); break; diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal.cs b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal.cs new file mode 100644 index 000000000000..cf0c29a4c641 --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal.cs @@ -0,0 
+1,255 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+
+namespace IntelHardwareIntrinsicTest
+{
+    class Program
+    {
+        const int Pass = 100;
+        const int Fail = 0;
+
+        // For every integer element type, loads 32 bytes from a 32-byte-aligned stack buffer with
+        // Avx2.LoadAlignedVector256NonTemporal, writes the vector back out and compares it element
+        // by element with the input. Returns Pass or Fail; Pass when Avx2 is not supported.
+        static unsafe int Main(string[] args)
+        {
+            int testResult = Pass;
+
+            if (Avx2.IsSupported)
+            {
+                {
+                    byte* inBuffer = stackalloc byte[64];
+                    FillWithPattern(inBuffer, 64);
+                    int* inArray = (int*)Align(inBuffer, 32);
+                    int* outArray = stackalloc int[8];
+                    var vf = Avx2.LoadAlignedVector256NonTemporal(inArray);
+                    Unsafe.Write(outArray, vf);
+
+                    for (var i = 0; i < 8; i++)
+                    {
+                        if (inArray[i] != outArray[i])
+                        {
+                            Console.WriteLine("Avx2 LoadAlignedVector256NonTemporal failed on int:");
+                            for (var n = 0; n < 8; n++)
+                            {
+                                Console.Write(outArray[n] + ", ");
+                            }
+                            Console.WriteLine();
+
+                            testResult = Fail;
+                            break;
+                        }
+                    }
+                }
+
+                {
+                    byte* inBuffer = stackalloc byte[64];
+                    FillWithPattern(inBuffer, 64);
+                    long* inArray = (long*)Align(inBuffer, 32);
+                    long* outArray = stackalloc long[4];
+                    var vf = Avx2.LoadAlignedVector256NonTemporal(inArray);
+                    Unsafe.Write(outArray, vf);
+
+                    for (var i = 0; i < 4; i++)
+                    {
+                        if (inArray[i] != outArray[i])
+                        {
+                            Console.WriteLine("Avx2 LoadAlignedVector256NonTemporal failed on long:");
+                            for (var n = 0; n < 4; n++)
+                            {
+                                Console.Write(outArray[n] + ", ");
+                            }
+                            Console.WriteLine();
+
+                            testResult = Fail;
+                            break;
+                        }
+                    }
+                }
+
+                {
+                    byte* inBuffer = stackalloc byte[64];
+                    FillWithPattern(inBuffer, 64);
+                    uint* inArray = (uint*)Align(inBuffer, 32);
+                    uint* outArray = stackalloc uint[8];
+                    var vf = Avx2.LoadAlignedVector256NonTemporal(inArray);
+                    Unsafe.Write(outArray, vf);
+
+                    for (var i = 0; i < 8; i++)
+                    {
+                        if (inArray[i] != outArray[i])
+                        {
+                            Console.WriteLine("Avx2 LoadAlignedVector256NonTemporal failed on uint:");
+                            for (var n = 0; n < 8; n++)
+                            {
+                                Console.Write(outArray[n] + ", ");
+                            }
+                            Console.WriteLine();
+
+                            testResult = Fail;
+                            break;
+                        }
+                    }
+                }
+
+                {
+                    byte* inBuffer = stackalloc byte[64];
+                    FillWithPattern(inBuffer, 64);
+                    ulong* inArray = (ulong*)Align(inBuffer, 32);
+                    ulong* outArray = stackalloc ulong[4];
+                    var vf = Avx2.LoadAlignedVector256NonTemporal(inArray);
+                    Unsafe.Write(outArray, vf);
+
+                    for (var i = 0; i < 4; i++)
+                    {
+                        if (inArray[i] != outArray[i])
+                        {
+                            Console.WriteLine("Avx2 LoadAlignedVector256NonTemporal failed on ulong:");
+                            for (var n = 0; n < 4; n++)
+                            {
+                                Console.Write(outArray[n] + ", ");
+                            }
+                            Console.WriteLine();
+
+                            testResult = Fail;
+                            break;
+                        }
+                    }
+                }
+
+                {
+                    byte* inBuffer = stackalloc byte[64];
+                    FillWithPattern(inBuffer, 64);
+                    short* inArray = (short*)Align(inBuffer, 32);
+                    short* outArray = stackalloc short[16];
+                    var vf = Avx2.LoadAlignedVector256NonTemporal(inArray);
+                    Unsafe.Write(outArray, vf);
+
+                    for (var i = 0; i < 16; i++)
+                    {
+                        if (inArray[i] != outArray[i])
+                        {
+                            Console.WriteLine("Avx2 LoadAlignedVector256NonTemporal failed on short:");
+                            for (var n = 0; n < 16; n++)
+                            {
+                                Console.Write(outArray[n] + ", ");
+                            }
+                            Console.WriteLine();
+
+                            testResult = Fail;
+                            break;
+                        }
+                    }
+                }
+
+                {
+                    byte* inBuffer = stackalloc byte[64];
+                    FillWithPattern(inBuffer, 64);
+                    ushort* inArray = (ushort*)Align(inBuffer, 32);
+                    ushort* outArray = stackalloc ushort[16];
+                    var vf = Avx2.LoadAlignedVector256NonTemporal(inArray);
+                    Unsafe.Write(outArray, vf);
+
+                    for (var i = 0; i < 16; i++)
+                    {
+                        if (inArray[i] != outArray[i])
+                        {
+                            Console.WriteLine("Avx2 LoadAlignedVector256NonTemporal failed on ushort:");
+                            for (var n = 0; n < 16; n++)
+                            {
+                                Console.Write(outArray[n] + ", ");
+                            }
+                            Console.WriteLine();
+
+                            testResult = Fail;
+                            break;
+                        }
+                    }
+                }
+
+                {
+                    byte* inBuffer = stackalloc byte[64];
+                    FillWithPattern(inBuffer, 64);
+                    sbyte* inArray = (sbyte*)Align(inBuffer, 32);
+                    sbyte* outArray = stackalloc sbyte[32];
+                    var vf = Avx2.LoadAlignedVector256NonTemporal(inArray);
+                    Unsafe.Write(outArray, vf);
+
+                    for (var i = 0; i < 32; i++)
+                    {
+                        if (inArray[i] != outArray[i])
+                        {
+                            Console.WriteLine("Avx2 LoadAlignedVector256NonTemporal failed on sbyte:");
+                            for (var n = 0; n < 32; n++)
+                            {
+                                Console.Write(outArray[n] + ", ");
+                            }
+                            Console.WriteLine();
+
+                            testResult = Fail;
+                            break;
+                        }
+                    }
+                }
+
+                {
+                    byte* inBuffer = stackalloc byte[64];
+                    FillWithPattern(inBuffer, 64);
+                    byte* inArray = (byte*)Align(inBuffer, 32);
+                    byte* outArray = stackalloc byte[32];
+                    var vf = Avx2.LoadAlignedVector256NonTemporal(inArray);
+                    Unsafe.Write(outArray, vf);
+
+                    for (var i = 0; i < 32; i++)
+                    {
+                        if (inArray[i] != outArray[i])
+                        {
+                            Console.WriteLine("Avx2 LoadAlignedVector256NonTemporal failed on byte:");
+                            for (var n = 0; n < 32; n++)
+                            {
+                                Console.Write(outArray[n] + ", ");
+                            }
+                            Console.WriteLine();
+
+                            testResult = Fail;
+                            break;
+                        }
+                    }
+                }
+            }
+
+            return testResult;
+        }
+
+        // Writes the non-zero byte sequence 1, 2, 3, ... into buffer so that a load which
+        // returns zeros (or a store that never happens) cannot match the input by accident.
+        static unsafe void FillWithPattern(byte* buffer, int length)
+        {
+            for (var i = 0; i < length; i++)
+            {
+                buffer[i] = (byte)(i + 1);
+            }
+        }
+
+        // Returns the first address at or after buffer that is a multiple of expectedAlignment.
+        // The adjustment is between 0 and (expectedAlignment - 1) bytes, so the caller must
+        // over-allocate by at least that much.
+        static unsafe void* Align(byte* buffer, byte expectedAlignment)
+        {
+            // Bytes needed to reach the next boundary; the outer modulo maps an already
+            // aligned buffer to an adjustment of 0 instead of a full expectedAlignment.
+            var remainder = (ulong)(buffer) % expectedAlignment;
+            var padding = (expectedAlignment - remainder) % expectedAlignment;
+
+            return (void*)(buffer + padding);
+        }
+    }
+}
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal_r.csproj
new file mode 100644
index 000000000000..bd210dc17aa8
--- /dev/null
+++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal_r.csproj
@@ -0,0 +1,34 @@
+ + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + + + + + + + + + + +
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal_ro.csproj
new file mode 100644
index 000000000000..164b3f8bf61e
--- /dev/null
+++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/LoadAlignedVector256NonTemporal_ro.csproj
@@ -0,0 +1,34 @@
+ + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + True + + + + + + + + + +