From b3767019dd05b1fbd1eaf3ac238c650d3a6ffc61 Mon Sep 17 00:00:00 2001
From: Sergey Andreenko
Date: Mon, 16 Mar 2020 22:56:44 -0700
Subject: [PATCH 1/2] use YMM registers on x64 for BlkUnroll.

---
 src/coreclr/src/jit/codegenxarch.cpp | 97 ++++++++++++++++++----------
 1 file changed, 63 insertions(+), 34 deletions(-)

diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp
index 52095ec8560686..8f69be24ac4972 100644
--- a/src/coreclr/src/jit/codegenxarch.cpp
+++ b/src/coreclr/src/jit/codegenxarch.cpp
@@ -2989,13 +2989,16 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
     // Fill as much as possible using SSE2 stores.
     if (size >= XMM_REGSIZE_BYTES)
     {
+        bool useYmm =
+            ((compiler->getSIMDVectorRegisterByteLength() == YMM_REGSIZE_BYTES) && (size >= YMM_REGSIZE_BYTES));
         regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);

         if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
         {
+            unsigned regSize = useYmm ? YMM_REGSIZE_BYTES : XMM_REGSIZE_BYTES;
             // If the source is constant 0 then always use xorps, it's faster
             // than copying the constant from a GPR to a XMM register.
-            emit->emitIns_R_R(INS_xorps, EA_16BYTE, srcXmmReg, srcXmmReg);
+            emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg);
         }
         else
         {
@@ -3005,20 +3008,36 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
             // For x86, we need one more to convert it from 8 bytes to 16 bytes.
             emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
 #endif
+            if (useYmm)
+            {
+                emit->emitIns_R_R(INS_punpckldq, EA_32BYTE, srcXmmReg, srcXmmReg);
+            }
         }

         instruction simdMov = simdUnalignedMovIns();
-        for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize)
-        {
-            if (dstLclNum != BAD_VAR_NUM)
-            {
-                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
-            }
-            else
+
+        auto unrollUsingXMM = [&](unsigned regSize) {
+            for (; size >= regSize; size -= regSize, dstOffset += regSize)
             {
-                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
-                                    dstAddrIndexScale, dstOffset);
+                if (dstLclNum != BAD_VAR_NUM)
+                {
+                    emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
+                }
+                else
+                {
+                    emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
+                                        dstAddrIndexScale, dstOffset);
+                }
             }
+        };
+
+        if (useYmm)
+        {
+            unrollUsingXMM(YMM_REGSIZE_BYTES);
+        }
+        if (size >= XMM_REGSIZE_BYTES)
+        {
+            unrollUsingXMM(XMM_REGSIZE_BYTES);
         }

         // TODO-CQ-XArch: On x86 we could initialize 8 byte at once by using MOVQ instead of two 4 byte MOV stores.
@@ -3206,34 +3225,44 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     {
         regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT);

-        instruction simdMov = simdUnalignedMovIns();
-        for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize;
-             size -= regSize, srcOffset += regSize, dstOffset += regSize)
-        {
-            if (srcLclNum != BAD_VAR_NUM)
-            {
-                emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
-            }
-            else
+        auto unrollUsingXMM = [&](unsigned regSize, regNumber tempReg) {
+            instruction simdMov = simdUnalignedMovIns();
+            for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
             {
-                emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
-                                    srcAddrIndexScale, srcOffset);
-            }
+                if (srcLclNum != BAD_VAR_NUM)
+                {
+                    emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+                }
+                else
+                {
+                    emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                        srcAddrIndexScale, srcOffset);
+                }

-            if (dstLclNum != BAD_VAR_NUM)
-            {
-                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
-            }
-            else
-            {
-                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
-                                    dstAddrIndexScale, dstOffset);
+                if (dstLclNum != BAD_VAR_NUM)
+                {
+                    emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+                }
+                else
+                {
+                    emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                        dstAddrIndexScale, dstOffset);
+                }
             }
-        }

-        // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores.
-        // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to
-        // allocate a GPR just for the remainder.
+            // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores.
+            // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to
+            // allocate a GPR just for the remainder.
+        };
+
+        if ((compiler->getSIMDVectorRegisterByteLength() == YMM_REGSIZE_BYTES) && (size >= YMM_REGSIZE_BYTES))
+        {
+            unrollUsingXMM(YMM_REGSIZE_BYTES, tempReg);
+        }
+        if (size >= XMM_REGSIZE_BYTES)
+        {
+            unrollUsingXMM(XMM_REGSIZE_BYTES, tempReg);
+        }
     }

     if (size > 0)

From bdec26cdad63fd6ede9f32a6b2421747627d27c6 Mon Sep 17 00:00:00 2001
From: Sergey Andreenko
Date: Tue, 17 Mar 2020 00:03:48 -0700
Subject: [PATCH 2/2] Fix x86 linux build break.

---
 src/coreclr/src/jit/codegenxarch.cpp | 9 +++++++--
 src/coreclr/src/jit/compiler.h       | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp
index 8f69be24ac4972..90c1d3ede03d09 100644
--- a/src/coreclr/src/jit/codegenxarch.cpp
+++ b/src/coreclr/src/jit/codegenxarch.cpp
@@ -2989,8 +2989,11 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
     // Fill as much as possible using SSE2 stores.
     if (size >= XMM_REGSIZE_BYTES)
     {
-        bool useYmm =
-            ((compiler->getSIMDVectorRegisterByteLength() == YMM_REGSIZE_BYTES) && (size >= YMM_REGSIZE_BYTES));
+#ifdef FEATURE_SIMD
+        bool useYmm = (compiler->getSIMDVectorRegisterByteLength() == YMM_REGSIZE_BYTES) && (size >= YMM_REGSIZE_BYTES);
+#else
+        bool useYmm = false;
+#endif
         regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);

         if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
@@ -3255,10 +3258,12 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
             // allocate a GPR just for the remainder.
         };

+#ifdef FEATURE_SIMD
         if ((compiler->getSIMDVectorRegisterByteLength() == YMM_REGSIZE_BYTES) && (size >= YMM_REGSIZE_BYTES))
         {
             unrollUsingXMM(YMM_REGSIZE_BYTES, tempReg);
         }
+#endif
         if (size >= XMM_REGSIZE_BYTES)
         {
             unrollUsingXMM(XMM_REGSIZE_BYTES, tempReg);
diff --git a/src/coreclr/src/jit/compiler.h b/src/coreclr/src/jit/compiler.h
index 2a87d9c2326866..289c14b9739b14 100644
--- a/src/coreclr/src/jit/compiler.h
+++ b/src/coreclr/src/jit/compiler.h
@@ -8173,7 +8173,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     {
         return false;
     }
-#endif // FEATURE_SIMD
+#endif // !FEATURE_SIMD

 public:
     //------------------------------------------------------------------------
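
Editor's note (not part of the patch series): the snippet below is a minimal standalone sketch of the unrolling strategy the patches implement in the JIT for the init-block case: consume the block with 32-byte YMM stores while a full YMM register fits, then finish with 16-byte XMM stores, leaving any sub-16-byte tail to scalar stores. The helper name ZeroBlockUnrolled and the use of AVX intrinsics are illustrative assumptions; the JIT emits these moves directly through its emitter (emitIns_S_R / emitIns_ARX_R) rather than via intrinsics, and the sketch assumes AVX availability has already been established (the patches gate on getSIMDVectorRegisterByteLength() == YMM_REGSIZE_BYTES).

// Hypothetical illustration only; compile with AVX enabled (e.g. -mavx2).
#include <immintrin.h>
#include <cstddef>
#include <cstdint>

void ZeroBlockUnrolled(uint8_t* dst, size_t size)
{
    // YMM pass: 32-byte unaligned stores while a full YMM register fits
    // (the patch's `useYmm` path).
    __m256i ymmZero = _mm256_setzero_si256(); // counterpart of the xorps zeroing idiom
    for (; size >= 32; size -= 32, dst += 32)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), ymmZero);
    }

    // XMM pass: a 16..31 byte remainder gets one 16-byte store.
    __m128i xmmZero = _mm_setzero_si128();
    for (; size >= 16; size -= 16, dst += 16)
    {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), xmmZero);
    }

    // A sub-16-byte tail would be handled with scalar GPR stores, as in the JIT.
}

The copy case in genCodeForCpBlkUnroll follows the same two-pass shape, with a paired load/store per chunk instead of a single store; both pases reuse one loop body parameterized by the register size, which is why the patches factor the loop into the unrollUsingXMM lambda.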