From 89538c11ef45f1e27bd1eeeb2f26cea502378287 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Wed, 7 Jul 2021 13:10:15 -0700 Subject: [PATCH 01/26] Saving current state, current state does not currently pass tests --- src/coreclr/jit/codegenxarch.cpp | 38 +++++++++++++++++-- .../src/System/AppContext.cs | 5 +++ 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 2bd0142381f628..cc72e39feb12d0 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3016,26 +3016,56 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT); instruction simdMov = simdUnalignedMovIns(); + + const bool canUse16BytesSimdMov = (node->gtType == TYP_STRUCT); + + if (canUse16BytesSimdMov && size >= YMM_REGSIZE_BYTES) + { + for (unsigned regSize = YMM_REGSIZE_BYTES; size >= regSize; + size -= regSize, srcOffset += regSize, dstOffset += regSize) + { + if (srcLclNum != BAD_VAR_NUM) + { + emit->emitIns_R_S(simdMov, EA_32BYTE, tempReg, srcLclNum, srcOffset); + } + else + { + emit->emitIns_R_ARX(simdMov, EA_32BYTE, tempReg, srcAddrBaseReg, srcAddrIndexReg, + srcAddrIndexScale, srcOffset); + } + + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(simdMov, EA_32BYTE, tempReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(simdMov, EA_32BYTE, tempReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); + } + } + } + for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) { if (srcLclNum != BAD_VAR_NUM) { - emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); + emit->emitIns_R_S(simdMov, EA_16BYTE, tempReg, srcLclNum, srcOffset); } else { - emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, + emit->emitIns_R_ARX(simdMov, EA_16BYTE, tempReg, srcAddrBaseReg, srcAddrIndexReg, srcAddrIndexScale, srcOffset); } if (dstLclNum != BAD_VAR_NUM) { - emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); + emit->emitIns_S_R(simdMov, EA_16BYTE, tempReg, dstLclNum, dstOffset); } else { - emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, + emit->emitIns_ARX_R(simdMov, EA_16BYTE, tempReg, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale, dstOffset); } } diff --git a/src/libraries/System.Private.CoreLib/src/System/AppContext.cs b/src/libraries/System.Private.CoreLib/src/System/AppContext.cs index 35e45f472bb8b1..270e48bb46b047 100644 --- a/src/libraries/System.Private.CoreLib/src/System/AppContext.cs +++ b/src/libraries/System.Private.CoreLib/src/System/AppContext.cs @@ -142,6 +142,11 @@ internal static unsafe void Setup(char** pNames, char** pValues, int count) s_dataStore = new Dictionary(count); for (int i = 0; i < count; i++) { + Debug.WriteLine("DEBUG SETUP"); + Debug.WriteLine(i); + Debug.WriteLine(new string(pNames[i])); + Debug.WriteLine(new string(pValues[i])); + Debug.WriteLine(""); s_dataStore.Add(new string(pNames[i]), new string(pValues[i])); } } From 068f5bfdf41bd150e13fa6c46a4cee65916b317b Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Thu, 8 Jul 2021 11:21:37 -0700 Subject: [PATCH 02/26] Getting available vector register size from compiler instance instead, resolves failing tests. --- src/coreclr/jit/codegenxarch.cpp | 41 +++++++++----------------------- src/coreclr/jit/lsraxarch.cpp | 2 +- 2 files changed, 12 insertions(+), 31 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index cc72e39feb12d0..6a84eb9d75361e 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3017,56 +3017,37 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) instruction simdMov = simdUnalignedMovIns(); - const bool canUse16BytesSimdMov = (node->gtType == TYP_STRUCT); + unsigned regSize = compiler->getSIMDVectorRegisterByteLength(); - if (canUse16BytesSimdMov && size >= YMM_REGSIZE_BYTES) + while (size >= XMM_REGSIZE_BYTES) { - for (unsigned regSize = YMM_REGSIZE_BYTES; size >= regSize; + for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) { if (srcLclNum != BAD_VAR_NUM) { - emit->emitIns_R_S(simdMov, EA_32BYTE, tempReg, srcLclNum, srcOffset); + emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); } else { - emit->emitIns_R_ARX(simdMov, EA_32BYTE, tempReg, srcAddrBaseReg, srcAddrIndexReg, - srcAddrIndexScale, srcOffset); + emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, srcAddrIndexScale, + srcOffset); } if (dstLclNum != BAD_VAR_NUM) { - emit->emitIns_S_R(simdMov, EA_32BYTE, tempReg, dstLclNum, dstOffset); + emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); } else { - emit->emitIns_ARX_R(simdMov, EA_32BYTE, tempReg, dstAddrBaseReg, dstAddrIndexReg, - dstAddrIndexScale, dstOffset); + emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale, + dstOffset); } } - } - for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; - size -= regSize, srcOffset += regSize, dstOffset += regSize) - { - if (srcLclNum != BAD_VAR_NUM) - { - emit->emitIns_R_S(simdMov, EA_16BYTE, tempReg, srcLclNum, srcOffset); - } - else + if (regSize == YMM_REGSIZE_BYTES) { - emit->emitIns_R_ARX(simdMov, EA_16BYTE, tempReg, srcAddrBaseReg, srcAddrIndexReg, - srcAddrIndexScale, srcOffset); - } - - if (dstLclNum != BAD_VAR_NUM) - { - emit->emitIns_S_R(simdMov, EA_16BYTE, tempReg, dstLclNum, dstOffset); - } - else - { - emit->emitIns_ARX_R(simdMov, EA_16BYTE, tempReg, dstAddrBaseReg, dstAddrIndexReg, - dstAddrIndexScale, dstOffset); + regSize = XMM_REGSIZE_BYTES; } } diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 854e4521ec9001..7cc7005fa8b072 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1381,7 +1381,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) if (size >= XMM_REGSIZE_BYTES) { buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); - SetContainsAVXFlags(); + SetContainsAVXFlags(size); } break; From 1e11d3ee870a2eedf5c4a3f549f1796fa027daa0 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Thu, 8 Jul 2021 15:01:23 -0700 Subject: [PATCH 03/26] Emit vzeroupper if there is potentially an AVX-SSE transition --- src/coreclr/jit/codegenxarch.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 6a84eb9d75361e..06486bbb8d86c4 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3045,12 +3045,16 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) } } - if (regSize == YMM_REGSIZE_BYTES) + if (regSize == YMM_REGSIZE_BYTES && size >= 0) { regSize = XMM_REGSIZE_BYTES; + + // Avoid AVX-SSE transition penalty + instGen(INS_vzeroupper); } } + // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores. // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to // allocate a GPR just for the remainder. From 0b7f1381613b78bfe0c1cfe2291ac1f16ba8076f Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Thu, 8 Jul 2021 16:20:36 -0700 Subject: [PATCH 04/26] Copy remainder using same tempReg allocated during SIMD moves instead of allocating a GPR --- src/coreclr/jit/codegenxarch.cpp | 80 ++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 29 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 06486bbb8d86c4..47116c438ddf39 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3017,47 +3017,69 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) instruction simdMov = simdUnalignedMovIns(); - unsigned regSize = compiler->getSIMDVectorRegisterByteLength(); + unsigned regSize = XMM_REGSIZE_BYTES; // compiler->getSIMDVectorRegisterByteLength(); - while (size >= XMM_REGSIZE_BYTES) + if (regSize == YMM_REGSIZE_BYTES && size < regSize) { - for (; size >= regSize; - size -= regSize, srcOffset += regSize, dstOffset += regSize) - { - if (srcLclNum != BAD_VAR_NUM) - { - emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); - } - else - { - emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, srcAddrIndexScale, - srcOffset); - } + regSize = XMM_REGSIZE_BYTES; + } - if (dstLclNum != BAD_VAR_NUM) - { - emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); - } - else - { - emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale, - dstOffset); - } + for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) + { + if (srcLclNum != BAD_VAR_NUM) + { + emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); } - - if (regSize == YMM_REGSIZE_BYTES && size >= 0) + else { - regSize = XMM_REGSIZE_BYTES; + emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, + srcAddrIndexScale, srcOffset); + } - // Avoid AVX-SSE transition penalty - instGen(INS_vzeroupper); + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); } } - // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores. // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to // allocate a GPR just for the remainder. + + if (size > 0) + { + // Copy the remainder by moving the last regSize bytes of the buffer + unsigned remainder = regSize - size; + size += remainder; + srcOffset -= remainder; + dstOffset -= remainder; + + if (srcLclNum != BAD_VAR_NUM) + { + emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); + } + else + { + emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, + srcAddrIndexScale, srcOffset); + } + + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); + } + } + return; } if (size > 0) From fc0cf3df43fd14659c46063fb1954ee0324d2cf5 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Fri, 9 Jul 2021 15:48:28 -0700 Subject: [PATCH 05/26] Better guard for maximum SIMD register size --- src/coreclr/jit/codegenxarch.cpp | 57 ++++++++++++++------------------ src/coreclr/jit/lsraxarch.cpp | 1 + 2 files changed, 26 insertions(+), 32 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 47116c438ddf39..dfc070a881614d 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3017,12 +3017,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) instruction simdMov = simdUnalignedMovIns(); - unsigned regSize = XMM_REGSIZE_BYTES; // compiler->getSIMDVectorRegisterByteLength(); - - if (regSize == YMM_REGSIZE_BYTES && size < regSize) - { - regSize = XMM_REGSIZE_BYTES; - } + // Get the largest SIMD register available if the size is larger than an xmm register + unsigned regSize = size >= YMM_REGSIZE_BYTES ? compiler->getSIMDVectorRegisterByteLength() : XMM_REGSIZE_BYTES; for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) { @@ -3082,36 +3078,33 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) return; } - if (size > 0) - { - regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT); + regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT); - for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize) + for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize) + { + while (regSize > size) { - while (regSize > size) - { - regSize /= 2; - } + regSize /= 2; + } - if (srcLclNum != BAD_VAR_NUM) - { - emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); - } - else - { - emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, - srcAddrIndexScale, srcOffset); - } + if (srcLclNum != BAD_VAR_NUM) + { + emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); + } + else + { + emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, srcAddrIndexScale, + srcOffset); + } - if (dstLclNum != BAD_VAR_NUM) - { - emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); - } - else - { - emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, - dstAddrIndexScale, dstOffset); - } + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale, + dstOffset); } } } diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 7cc7005fa8b072..71bfb859e98a5d 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1381,6 +1381,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) if (size >= XMM_REGSIZE_BYTES) { buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + // SetContainsAVXFlags(); SetContainsAVXFlags(size); } break; From 0c982dc5e78b91fb21b0b53cb7dd5abc6028e0d1 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Fri, 9 Jul 2021 15:52:19 -0700 Subject: [PATCH 06/26] Fix typo --- src/coreclr/jit/codegenxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index dfc070a881614d..cd2e2fc8fefcbe 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3017,7 +3017,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) instruction simdMov = simdUnalignedMovIns(); - // Get the largest SIMD register available if the size is larger than an xmm register + // Get the largest SIMD register available if the size is large enough unsigned regSize = size >= YMM_REGSIZE_BYTES ? compiler->getSIMDVectorRegisterByteLength() : XMM_REGSIZE_BYTES; for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) From 9952bbdb327412c59b04f4602f8c59ebbc71d404 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Fri, 9 Jul 2021 15:54:57 -0700 Subject: [PATCH 07/26] Remove commented out lines --- src/coreclr/jit/lsraxarch.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 71bfb859e98a5d..7cc7005fa8b072 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1381,7 +1381,6 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) if (size >= XMM_REGSIZE_BYTES) { buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); - // SetContainsAVXFlags(); SetContainsAVXFlags(size); } break; From f3852953b370390e9105161daac7f8cd9708f267 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Mon, 12 Jul 2021 11:12:34 -0700 Subject: [PATCH 08/26] Insert vzeroupper if using AVX2 --- src/coreclr/jit/codegenxarch.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index cd2e2fc8fefcbe..3cfa463a592ec6 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3075,6 +3075,12 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) dstAddrIndexScale, dstOffset); } } + + if (regSize == YMM_REGSIZE_BYTES) + { + instGen(INS_vzeroupper); + } + return; } From 58be507c6c2e4577c7df964c3eff82a0a4b0bbb7 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Mon, 12 Jul 2021 12:08:44 -0700 Subject: [PATCH 09/26] Add another vzeroupper --- src/coreclr/jit/codegenxarch.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 3cfa463a592ec6..09babb6d15cc01 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3020,6 +3020,11 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) // Get the largest SIMD register available if the size is large enough unsigned regSize = size >= YMM_REGSIZE_BYTES ? compiler->getSIMDVectorRegisterByteLength() : XMM_REGSIZE_BYTES; + if (regSize == YMM_REGSIZE_BYTES) + { + instGen(INS_vzeroupper); + } + for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) { if (srcLclNum != BAD_VAR_NUM) @@ -3043,15 +3048,10 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) } } - // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores. - // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to - // allocate a GPR just for the remainder. - if (size > 0) { // Copy the remainder by moving the last regSize bytes of the buffer unsigned remainder = regSize - size; - size += remainder; srcOffset -= remainder; dstOffset -= remainder; From c784c45ac9d84e50485bd23af5b05488122f8cc8 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Mon, 12 Jul 2021 12:16:23 -0700 Subject: [PATCH 10/26] Remove vzeroupper, inserting the instruction hurts performance --- src/coreclr/jit/codegenxarch.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 09babb6d15cc01..6bccca9e8abf13 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3020,11 +3020,6 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) // Get the largest SIMD register available if the size is large enough unsigned regSize = size >= YMM_REGSIZE_BYTES ? compiler->getSIMDVectorRegisterByteLength() : XMM_REGSIZE_BYTES; - if (regSize == YMM_REGSIZE_BYTES) - { - instGen(INS_vzeroupper); - } - for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) { if (srcLclNum != BAD_VAR_NUM) @@ -3076,11 +3071,6 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) } } - if (regSize == YMM_REGSIZE_BYTES) - { - instGen(INS_vzeroupper); - } - return; } From b0199b158d2c0e8a1a810d8d3c5831a499f927f2 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Tue, 13 Jul 2021 09:00:55 -0700 Subject: [PATCH 11/26] Adding vzeroupper again for testing --- src/coreclr/jit/codegenxarch.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 6bccca9e8abf13..09babb6d15cc01 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3020,6 +3020,11 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) // Get the largest SIMD register available if the size is large enough unsigned regSize = size >= YMM_REGSIZE_BYTES ? compiler->getSIMDVectorRegisterByteLength() : XMM_REGSIZE_BYTES; + if (regSize == YMM_REGSIZE_BYTES) + { + instGen(INS_vzeroupper); + } + for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) { if (srcLclNum != BAD_VAR_NUM) @@ -3071,6 +3076,11 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) } } + if (regSize == YMM_REGSIZE_BYTES) + { + instGen(INS_vzeroupper); + } + return; } From 47fcb1dbd07dc1336a9262edbeb9cc9c7a5a5ad2 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Tue, 13 Jul 2021 13:34:49 -0700 Subject: [PATCH 12/26] Remove debug lines --- .../System.Private.CoreLib/src/System/AppContext.cs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/AppContext.cs b/src/libraries/System.Private.CoreLib/src/System/AppContext.cs index 270e48bb46b047..35e45f472bb8b1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/AppContext.cs +++ b/src/libraries/System.Private.CoreLib/src/System/AppContext.cs @@ -142,11 +142,6 @@ internal static unsafe void Setup(char** pNames, char** pValues, int count) s_dataStore = new Dictionary(count); for (int i = 0; i < count; i++) { - Debug.WriteLine("DEBUG SETUP"); - Debug.WriteLine(i); - Debug.WriteLine(new string(pNames[i])); - Debug.WriteLine(new string(pValues[i])); - Debug.WriteLine(""); s_dataStore.Add(new string(pNames[i]), new string(pValues[i])); } } From 4cce3908fc0b4d5ff4e34e73531058b1308c8732 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Tue, 13 Jul 2021 16:40:43 -0700 Subject: [PATCH 13/26] Add assert before copying remaining bytes, change regSize condition to depend on AVX, remove vzeroupper inserts. --- src/coreclr/jit/codegenxarch.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 09babb6d15cc01..7197ab39d7060a 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3018,12 +3018,9 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) instruction simdMov = simdUnalignedMovIns(); // Get the largest SIMD register available if the size is large enough - unsigned regSize = size >= YMM_REGSIZE_BYTES ? compiler->getSIMDVectorRegisterByteLength() : XMM_REGSIZE_BYTES; - - if (regSize == YMM_REGSIZE_BYTES) - { - instGen(INS_vzeroupper); - } + unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX) + ? YMM_REGSIZE_BYTES + : XMM_REGSIZE_BYTES; for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) { @@ -3052,6 +3049,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) { // Copy the remainder by moving the last regSize bytes of the buffer unsigned remainder = regSize - size; + assert(remainder <= size); + srcOffset -= remainder; dstOffset -= remainder; @@ -3076,11 +3075,6 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) } } - if (regSize == YMM_REGSIZE_BYTES) - { - instGen(INS_vzeroupper); - } - return; } From 98180b4ce4d6c591939608d0894f9e70df2cdb15 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Tue, 13 Jul 2021 16:43:12 -0700 Subject: [PATCH 14/26] Fix typo --- src/coreclr/jit/codegenxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 7197ab39d7060a..14898bb66cfcd7 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3049,7 +3049,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) { // Copy the remainder by moving the last regSize bytes of the buffer unsigned remainder = regSize - size; - assert(remainder <= size); + assert(remainder <= regSize); srcOffset -= remainder; dstOffset -= remainder; From eba6f31ad0283a6f5f10f7cf047955bd6748c533 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Thu, 15 Jul 2021 09:00:20 -0700 Subject: [PATCH 15/26] Added check to ensure size is not 0 to prevent allocating register for structs of size 0. --- src/coreclr/jit/codegenxarch.cpp | 49 +++++++++++++++++--------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 14898bb66cfcd7..4f89b3a55c2633 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3078,33 +3078,36 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) return; } - regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT); - - for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize) + if (size > 0) { - while (regSize > size) - { - regSize /= 2; - } + regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT); - if (srcLclNum != BAD_VAR_NUM) - { - emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); - } - else + for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize) { - emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, srcAddrIndexScale, - srcOffset); - } + while (regSize > size) + { + regSize /= 2; + } - if (dstLclNum != BAD_VAR_NUM) - { - emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); - } - else - { - emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale, - dstOffset); + if (srcLclNum != BAD_VAR_NUM) + { + emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); + } + else + { + emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, + srcAddrIndexScale, srcOffset); + } + + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); + } } } } From 6b31381911406fc7a7ee5c75b25dfece1153aa54 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Fri, 16 Jul 2021 09:21:27 -0700 Subject: [PATCH 16/26] Using YMM registers during init block if block is large enough --- src/coreclr/jit/codegenxarch.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 4f89b3a55c2633..e50bde9a0b3463 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2796,16 +2796,24 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) { regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT); + unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX) + ? YMM_REGSIZE_BYTES + : XMM_REGSIZE_BYTES; + if (src->gtSkipReloadOrCopy()->IsIntegralConst(0)) { // If the source is constant 0 then always use xorps, it's faster // than copying the constant from a GPR to a XMM register. - emit->emitIns_R_R(INS_xorps, EA_16BYTE, srcXmmReg, srcXmmReg); + emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg); } else { emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false); emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg); + + if (regSize == YMM_REGSIZE_BYTES) { + emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1); + } #ifdef TARGET_X86 // For x86, we need one more to convert it from 8 bytes to 16 bytes. emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg); @@ -2813,7 +2821,6 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) } instruction simdMov = simdUnalignedMovIns(); - unsigned regSize = XMM_REGSIZE_BYTES; unsigned bytesWritten = 0; while (bytesWritten < size) @@ -2825,6 +2832,11 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) regSize = 8; } #endif + if (regSize == YMM_REGSIZE_BYTES && bytesWritten + regSize > size) + { + regSize = XMM_REGSIZE_BYTES; + } + if (bytesWritten + regSize > size) { assert(srcIntReg != REG_NA); @@ -2843,6 +2855,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) dstOffset += regSize; bytesWritten += regSize; + } size -= bytesWritten; From 5517bdce898231d51f8f706ded1db7c9f31331f7 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Fri, 16 Jul 2021 09:36:23 -0700 Subject: [PATCH 17/26] Add some comments --- src/coreclr/jit/codegenxarch.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e50bde9a0b3463..dbc6a17d71d305 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2812,6 +2812,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg); if (regSize == YMM_REGSIZE_BYTES) { + // Extend the bytes in the lower lanes to the upper lanes emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1); } #ifdef TARGET_X86 @@ -2834,6 +2835,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) #endif if (regSize == YMM_REGSIZE_BYTES && bytesWritten + regSize > size) { + // Step down from YMM registers to XMM registers regSize = XMM_REGSIZE_BYTES; } From 8f434cb5495fa07814f1d6e1b68be64481dd346e Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Fri, 16 Jul 2021 10:25:40 -0700 Subject: [PATCH 18/26] Formatting --- src/coreclr/jit/codegenxarch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index dbc6a17d71d305..620c0abda12bab 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2811,7 +2811,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false); emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg); - if (regSize == YMM_REGSIZE_BYTES) { + if (regSize == YMM_REGSIZE_BYTES) + { // Extend the bytes in the lower lanes to the upper lanes emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1); } @@ -2857,7 +2858,6 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) dstOffset += regSize; bytesWritten += regSize; - } size -= bytesWritten; From e2e6259eb287868117b848dbc4b7546232aec7a5 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Mon, 19 Jul 2021 09:13:43 -0700 Subject: [PATCH 19/26] Rename remainder variable to shiftBack, better describes the logic for the block. --- src/coreclr/jit/codegenxarch.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 620c0abda12bab..2e66093b3b66a9 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3063,11 +3063,11 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) if (size > 0) { // Copy the remainder by moving the last regSize bytes of the buffer - unsigned remainder = regSize - size; - assert(remainder <= regSize); + unsigned shiftBack = regSize - size; + assert(shiftBack <= regSize); - srcOffset -= remainder; - dstOffset -= remainder; + srcOffset -= shiftBack; + dstOffset -= shiftBack; if (srcLclNum != BAD_VAR_NUM) { From dd7a85b3fb2ad4d42c4cdfd17fa5e31612102c08 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Mon, 19 Jul 2021 11:17:48 -0700 Subject: [PATCH 20/26] Use shift-back technique for init block as well. --- src/coreclr/jit/codegenxarch.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 2e66093b3b66a9..2c9e7695fc6839 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2834,13 +2834,14 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) regSize = 8; } #endif - if (regSize == YMM_REGSIZE_BYTES && bytesWritten + regSize > size) + if (bytesWritten + regSize > size && bytesWritten < size) { - // Step down from YMM registers to XMM registers - regSize = XMM_REGSIZE_BYTES; + // Shift dstOffset back to use full SIMD move + unsigned shiftBack = regSize - (size - bytesWritten); + bytesWritten -= shiftBack; + dstOffset -= shiftBack; } - - if (bytesWritten + regSize > size) + else if (bytesWritten + regSize > size) { assert(srcIntReg != REG_NA); break; From be3dfb21ec9311255a2ebdc99145c21b57ca2b89 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Mon, 19 Jul 2021 11:20:28 -0700 Subject: [PATCH 21/26] Add assert for init block --- src/coreclr/jit/codegenxarch.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 2c9e7695fc6839..a92f8322e550f3 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2838,6 +2838,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) { // Shift dstOffset back to use full SIMD move unsigned shiftBack = regSize - (size - bytesWritten); + assert(shiftBack <= regSize); bytesWritten -= shiftBack; dstOffset -= shiftBack; } From afa2060bfe8ebf2c9b1abe1eeaea0aa21ad12cdb Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Wed, 28 Jul 2021 10:52:36 -0700 Subject: [PATCH 22/26] Use xmm register to move remainder if it fits, shift-back for GPR for remainder instead of step-down --- src/coreclr/jit/codegenxarch.cpp | 42 +++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index a92f8322e550f3..3a9467b7f0659e 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3064,6 +3064,11 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) if (size > 0) { + if (size <= XMM_REGSIZE_BYTES) + { + regSize = XMM_REGSIZE_BYTES; + } + // Copy the remainder by moving the last regSize bytes of the buffer unsigned shiftBack = regSize - size; assert(shiftBack <= regSize); @@ -3098,13 +3103,44 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) if (size > 0) { regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT); + unsigned regSize = REGSIZE_BYTES; - for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize) + while (regSize > size) { - while (regSize > size) + regSize /= 2; + } + + for (; size > regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) + { + if (srcLclNum != BAD_VAR_NUM) { - regSize /= 2; + emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); } + else + { + emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, + srcAddrIndexScale, srcOffset); + } + + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); + } + } + + if (size > 0) + { + // Copy the remainder by moving the last regSize bytes of the buffer + unsigned shiftBack = regSize - size; + assert(shiftBack <= regSize); + + srcOffset -= shiftBack; + dstOffset -= shiftBack; if (srcLclNum != BAD_VAR_NUM) { From 3fddd553576e2cd7072dda17cef91a146bfd8db1 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Wed, 28 Jul 2021 11:19:13 -0700 Subject: [PATCH 23/26] Use xmm in init block for remainder if it is an appropriate size, shift-back for GPR in init instead of step-down register size --- src/coreclr/jit/codegenxarch.cpp | 37 ++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 3a9467b7f0659e..e9056188ec8669 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2834,19 +2834,19 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) regSize = 8; } #endif - if (bytesWritten + regSize > size && bytesWritten < size) + if (bytesWritten + regSize > size) { + if (size - bytesWritten <= XMM_REGSIZE_BYTES) + { + regSize = XMM_REGSIZE_BYTES; + } + // Shift dstOffset back to use full SIMD move unsigned shiftBack = regSize - (size - bytesWritten); assert(shiftBack <= regSize); bytesWritten -= shiftBack; dstOffset -= shiftBack; } - else if (bytesWritten + regSize > size) - { - assert(srcIntReg != REG_NA); - break; - } if (dstLclNum != BAD_VAR_NUM) { @@ -2865,13 +2865,32 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) size -= bytesWritten; } + unsigned regSize = REGSIZE_BYTES; + + while (regSize > size) + { + regSize /= 2; + } + // Fill the remainder using normal stores. - for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize) + for (; size > regSize; size -= regSize, dstOffset += regSize) { - while (regSize > size) + if (dstLclNum != BAD_VAR_NUM) { - regSize /= 2; + emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset); } + else + { + emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); + } + } + + if (size > 0) + { + unsigned shiftBack = regSize - size; + assert(shiftBack <= regSize); + dstOffset -= shiftBack; if (dstLclNum != BAD_VAR_NUM) { From 710941a69da52f6af49c5e5e0fbb10e462a5c674 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Fri, 30 Jul 2021 12:33:55 -0700 Subject: [PATCH 24/26] Avoid allocating GPR during LSRA for BlkOpKindUnroll --- src/coreclr/jit/codegenxarch.cpp | 2 +- src/coreclr/jit/lowerxarch.cpp | 2 +- src/coreclr/jit/lsraxarch.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e9056188ec8669..102a53d9ff132e 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2781,7 +2781,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) assert(src->IsIntegralConst(0)); assert(willUseSimdMov); #ifdef TARGET_AMD64 - assert(size % 16 == 0); + assert(size >= XMM_REGSIZE_BYTES); #else assert(size % 8 == 0); #endif diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 43c0df62042365..03ab9c4e29a8fc 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -227,7 +227,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) { const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences(); #ifdef TARGET_AMD64 - const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0); + const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES); #else const bool willUseOnlySimdMov = (size % 8 == 0); #endif diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 7cc7005fa8b072..630e96b018687b 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1364,7 +1364,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) switch (blkNode->gtBlkOpKind) { case GenTreeBlk::BlkOpKindUnroll: - if ((size % XMM_REGSIZE_BYTES) != 0) + if (size < XMM_REGSIZE_BYTES) { regMaskTP regMask = allRegs(TYP_INT); #ifdef TARGET_X86 From fa81b67ec14b267a20a042ac04ff4da583592d82 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Thu, 5 Aug 2021 09:34:59 -0700 Subject: [PATCH 25/26] Remove shift-back of GPR for remainder, added back the step-down to resolve test failures on x86 --- src/coreclr/jit/codegenxarch.cpp | 68 ++++++-------------------------- 1 file changed, 11 insertions(+), 57 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 42124859cad94a..291f088cf133bb 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2836,6 +2836,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) #endif if (bytesWritten + regSize > size) { +#ifdef TARGET_AMD64 if (size - bytesWritten <= XMM_REGSIZE_BYTES) { regSize = XMM_REGSIZE_BYTES; @@ -2846,6 +2847,10 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) assert(shiftBack <= regSize); bytesWritten -= shiftBack; dstOffset -= shiftBack; +#else + assert(srcIntReg != REG_NA); + break; +#endif } if (dstLclNum != BAD_VAR_NUM) @@ -2865,33 +2870,13 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) size -= bytesWritten; } - unsigned regSize = REGSIZE_BYTES; - - while (regSize > size) - { - regSize /= 2; - } - // Fill the remainder using normal stores. - for (; size > regSize; size -= regSize, dstOffset += regSize) + for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize) { - if (dstLclNum != BAD_VAR_NUM) - { - emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset); - } - else + while (regSize > size) { - emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg, - dstAddrIndexScale, dstOffset); + regSize /= 2; } - } - - if (size > 0) - { - unsigned shiftBack = regSize - size; - assert(shiftBack <= regSize); - dstOffset -= shiftBack; - if (dstLclNum != BAD_VAR_NUM) { emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset); @@ -3122,44 +3107,13 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) if (size > 0) { regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT); - unsigned regSize = REGSIZE_BYTES; - - while (regSize > size) - { - regSize /= 2; - } - for (; size > regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) + for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize) { - if (srcLclNum != BAD_VAR_NUM) - { - emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); - } - else - { - emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, - srcAddrIndexScale, srcOffset); - } - - if (dstLclNum != BAD_VAR_NUM) + while (regSize > size) { - emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); + regSize /= 2; } - else - { - emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, - dstAddrIndexScale, dstOffset); - } - } - - if (size > 0) - { - // Copy the remainder by moving the last regSize bytes of the buffer - unsigned shiftBack = regSize - size; - assert(shiftBack <= regSize); - - srcOffset -= shiftBack; - dstOffset -= shiftBack; if (srcLclNum != BAD_VAR_NUM) { From c1497d114e2c092d9cbb65f23baa1d8110c32801 Mon Sep 17 00:00:00 2001 From: Alex Covington Date: Thu, 5 Aug 2021 10:15:39 -0700 Subject: [PATCH 26/26] Shift-back GPR when handling remainder for AMD64 only --- src/coreclr/jit/codegenxarch.cpp | 103 ++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 291f088cf133bb..dba85ace814924 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2870,7 +2870,45 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) size -= bytesWritten; } - // Fill the remainder using normal stores. +// Fill the remainder using normal stores. +#ifdef TARGET_AMD64 + unsigned regSize = REGSIZE_BYTES; + + while (regSize > size) + { + regSize /= 2; + } + + for (; size > regSize; size -= regSize, dstOffset += regSize) + { + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); + } + } + + if (size > 0) + { + unsigned shiftBack = regSize - size; + assert(shiftBack <= regSize); + dstOffset -= shiftBack; + + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); + } + } +#else // TARGET_X86 for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize) { while (regSize > size) @@ -2887,6 +2925,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) dstAddrIndexScale, dstOffset); } } +#endif } #ifdef TARGET_AMD64 @@ -3104,10 +3143,71 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) return; } + // Fill the remainder with normal loads/stores if (size > 0) { regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT); +#ifdef TARGET_AMD64 + unsigned regSize = REGSIZE_BYTES; + + while (regSize > size) + { + regSize /= 2; + } + + for (; size > regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) + { + if (srcLclNum != BAD_VAR_NUM) + { + emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); + } + else + { + emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, + srcAddrIndexScale, srcOffset); + } + + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); + } + } + + if (size > 0) + { + unsigned shiftBack = regSize - size; + assert(shiftBack <= regSize); + + srcOffset -= shiftBack; + dstOffset -= shiftBack; + + if (srcLclNum != BAD_VAR_NUM) + { + emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); + } + else + { + emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, + srcAddrIndexScale, srcOffset); + } + + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); + } + } +#else // TARGET_X86 for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize) { while (regSize > size) @@ -3135,6 +3235,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) dstAddrIndexScale, dstOffset); } } +#endif } }