From a4ee9948883bbef580d54241d4200a0989e1f94a Mon Sep 17 00:00:00 2001 From: EgorBo Date: Fri, 28 Apr 2023 03:29:16 +0200 Subject: [PATCH 01/10] Clean up BLK unrolling on xarch --- src/coreclr/jit/codegenxarch.cpp | 65 ++++++++++++++++++-------------- src/coreclr/jit/lowerxarch.cpp | 46 ++++++++-------------- 2 files changed, 53 insertions(+), 58 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index a864529c9e0045..fe1cbfbf4b6b81 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3153,8 +3153,6 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) } else { - // If src is contained then it must be 0. - assert(src->IsIntegralConst(0)); assert(willUseSimdMov); #ifdef TARGET_AMD64 assert(size >= XMM_REGSIZE_BYTES); @@ -3187,6 +3185,35 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg); zeroing = true; } + else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) + { + ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; + CORINFO_FIELD_HANDLE hnd = nullptr; + if (regSize == XMM_REGSIZE_BYTES) + { + simd16_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd16_t)); + hnd = emit->emitSimd16Const(constValue); + } + else if (regSize == YMM_REGSIZE_BYTES) + { + simd32_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd32_t)); + hnd = emit->emitSimd32Const(constValue); + } + else if (regSize == ZMM_REGSIZE_BYTES) + { + simd64_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd64_t)); + hnd = emit->emitSimd64Const(constValue); + } + else + { + // Unexpected regSize + unreached(); + } + emit->emitIns_R_C(ins_Load(TYP_SIMD32), EA_ATTR(regSize), srcXmmReg, hnd, 0); + } else { // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk. 
@@ -3237,38 +3264,20 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) emitSimdMovs(); dstOffset += regSize; bytesWritten += regSize; - - if (!zeroing) - { - assert(regSize <= YMM_REGSIZE_BYTES); - } - - if (!zeroing && regSize == YMM_REGSIZE_BYTES && size - bytesWritten < YMM_REGSIZE_BYTES) - { - regSize = XMM_REGSIZE_BYTES; - } } size -= bytesWritten; - // Handle the remainder by overlapping with previously processed data (only for zeroing) - if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES)) + // Handle the remainder by overlapping with previously processed data + if ((size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES)) { - if (isPow2(size) && (size <= REGSIZE_BYTES)) - { - // For sizes like 1,2,4 and 8 we delegate handling to normal stores - // because that will be a single instruction that is smaller than SIMD mov - } - else - { - // Get optimal register size to cover the whole remainder (with overlapping) - regSize = compiler->roundUpSIMDSize(size); + // Get optimal register size to cover the whole remainder (with overlapping) + regSize = compiler->roundUpSIMDSize(size); - // Rewind dstOffset so we can fit a vector for the while remainder - dstOffset -= (regSize - size); - emitSimdMovs(); - size = 0; - } + // Rewind dstOffset so we can fit a vector for the while remainder + dstOffset -= (regSize - size); + emitSimdMovs(); + size = 0; } } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index ca3c64dd2ff24a..871c4b716b8a90 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -337,38 +337,24 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; - if (fill == 0) + const bool canUseSimd = !blkNode->IsOnHeapAndContainsReferences(); + if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, canUseSimd)) { - if (size >= XMM_REGSIZE_BYTES) - { - const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences(); -#ifdef TARGET_AMD64 + // It turns out we can't use SIMD so the default threshold is too big + goto TOO_BIG_TO_UNROLL; + } - bool willUseOnlySimdMov = size % XMM_REGSIZE_BYTES == 0; - if (!willUseOnlySimdMov) - { - // If we have a remainder we still might only use SIMD to process it (via overlapping) - // unless it's more efficient to do that via scalar op (for sizes 1,2,4 and 8) - const unsigned remainder = size % XMM_REGSIZE_BYTES; - if (!isPow2(remainder) || (remainder > REGSIZE_BYTES)) - { - willUseOnlySimdMov = true; - } - } -#else - const bool willUseOnlySimdMov = (size % 8 == 0); -#endif - if (willUseOnlySimdMov && canUse16BytesSimdMov) - { - src->SetContained(); - } - else if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, - /*canUseSimd*/ canUse16BytesSimdMov)) - { - // It turns out we can't use SIMD so the default threshold is too big - goto TOO_BIG_TO_UNROLL; - } - } + const bool willUseSimd = + canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); + if (willUseSimd) + { + // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value + // just to handle the remainder when we can do that with an overlapped SIMD load). + src->SetContained(); + } + else if (fill == 0) + { + // Leave as is - zero doesn't shouldn't be contained when we don't use SIMD. 
} #ifdef TARGET_AMD64 else if (size >= REGSIZE_BYTES) From 0463672f27f762d2944f5b67b7e1209e4a746ebd Mon Sep 17 00:00:00 2001 From: EgorBo Date: Fri, 28 Apr 2023 03:57:17 +0200 Subject: [PATCH 02/10] fix build --- src/coreclr/jit/codegenxarch.cpp | 2 -- src/coreclr/jit/lowerxarch.cpp | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index fe1cbfbf4b6b81..2f9414359eab9f 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3177,13 +3177,11 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) regSize = min(regSize, YMM_REGSIZE_BYTES); } - bool zeroing = false; if (src->gtSkipReloadOrCopy()->IsIntegralConst(0)) { // If the source is constant 0 then always use xorps, it's faster // than copying the constant from a GPR to a XMM register. emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg); - zeroing = true; } else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 871c4b716b8a90..c00b1302aa6356 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -345,7 +345,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) } const bool willUseSimd = - canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); + canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); if (willUseSimd) { // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value From c5cc7fefc76daa3aad9f7da82eead4484296dd6c Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Fri, 28 Apr 2023 11:52:15 +0200 Subject: [PATCH 03/10] Update codegenxarch.cpp --- src/coreclr/jit/codegenxarch.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 2f9414359eab9f..32f938671859ee 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3185,32 +3185,36 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) } else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { - ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; - CORINFO_FIELD_HANDLE hnd = nullptr; + ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; + CORINFO_FIELD_HANDLE hnd = nullptr; + var_types loadType = TYP_UNDEF; if (regSize == XMM_REGSIZE_BYTES) { simd16_t constValue; memset(&constValue, (uint8_t)fill, sizeof(simd16_t)); - hnd = emit->emitSimd16Const(constValue); + hnd = emit->emitSimd16Const(constValue); + loadType = TYP_SIMD16; } else if (regSize == YMM_REGSIZE_BYTES) { simd32_t constValue; memset(&constValue, (uint8_t)fill, sizeof(simd32_t)); - hnd = emit->emitSimd32Const(constValue); + hnd = emit->emitSimd32Const(constValue); + loadType = TYP_SIMD32; } else if (regSize == ZMM_REGSIZE_BYTES) { simd64_t constValue; memset(&constValue, (uint8_t)fill, sizeof(simd64_t)); - hnd = emit->emitSimd64Const(constValue); + hnd = emit->emitSimd64Const(constValue); + loadType = TYP_SIMD64; } else { // Unexpected regSize unreached(); } - emit->emitIns_R_C(ins_Load(TYP_SIMD32), EA_ATTR(regSize), srcXmmReg, hnd, 0); + emit->emitIns_R_C(ins_Load(loadType), EA_ATTR(regSize), srcXmmReg, hnd, 0); } else { From c6d174a79c87349969dbfdcb818bc2246bab4ced Mon Sep 17 00:00:00 2001 From: EgorBo Date: Fri, 28 Apr 2023 23:54:41 +0200 Subject: [PATCH 04/10] fix 32bit --- src/coreclr/jit/lowerxarch.cpp | 8 ++++++-- 1 file 
changed, 6 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index c00b1302aa6356..cee10204552fcc 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -344,8 +344,12 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) goto TOO_BIG_TO_UNROLL; } - const bool willUseSimd = - canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); + bool willUseSimd = canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); +#ifndef TARGET_AMD64 + // TODO-CQ: Current codegen logic relies on size being a multiple of 8 + // to unroll for 32-bit targets. + willUseSimd = willUseSimd && (size % 8) == 0; +#endif if (willUseSimd) { // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value From e95ad54e14c15ba5d62d4d6363278d2774cc738d Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 29 Apr 2023 00:07:25 +0200 Subject: [PATCH 05/10] Simplify --- src/coreclr/jit/codegenxarch.cpp | 37 ++++++-------------------------- 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 32f938671859ee..6793347f729992 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3185,36 +3185,13 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) } else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { - ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; - CORINFO_FIELD_HANDLE hnd = nullptr; - var_types loadType = TYP_UNDEF; - if (regSize == XMM_REGSIZE_BYTES) - { - simd16_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd16_t)); - hnd = emit->emitSimd16Const(constValue); - loadType = TYP_SIMD16; - } - else if (regSize == YMM_REGSIZE_BYTES) - { - simd32_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd32_t)); - hnd = emit->emitSimd32Const(constValue); - loadType = TYP_SIMD32; - } - else if (regSize == ZMM_REGSIZE_BYTES) - { - simd64_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd64_t)); - hnd = emit->emitSimd64Const(constValue); - loadType = TYP_SIMD64; - } - else - { - // Unexpected regSize - unreached(); - } - emit->emitIns_R_C(ins_Load(loadType), EA_ATTR(regSize), srcXmmReg, hnd, 0); + // Populate a constant vector from the fill value and save it + // to the data section so we can load it by address. + assert(regSize <= ZMM_REGSIZE_BYTES); + simd64_t constValue; + memset(&constValue, (uint8_t)(src->AsIntCon()->IconValue() & 0xFF), sizeof(simd64_t)); + var_types loadType = compiler->getSIMDTypeForSize(regSize); + genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue)); } else { From 2c7c62f398663570138966deb354a91eed0124bf Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 29 Apr 2023 00:42:39 +0200 Subject: [PATCH 06/10] fix build --- src/coreclr/jit/codegenxarch.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 6793347f729992..6def77b643f475 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3183,6 +3183,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) // than copying the constant from a GPR to a XMM register. 
emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg); } +#ifdef FEATURE_SIMD else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { // Populate a constant vector from the fill value and save it @@ -3193,6 +3194,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) var_types loadType = compiler->getSIMDTypeForSize(regSize); genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue)); } +#endif else { // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk. From 6add0913676d0ecb20525819417e3ecf980dc988 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 29 Apr 2023 12:03:10 +0200 Subject: [PATCH 07/10] revert some changes --- src/coreclr/jit/codegenxarch.cpp | 37 ++++++++++++++++++++++++++------ src/coreclr/jit/lowerxarch.cpp | 2 +- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 6def77b643f475..8c8af5faee92c1 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3183,18 +3183,41 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) // than copying the constant from a GPR to a XMM register. emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg); } -#ifdef FEATURE_SIMD else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { // Populate a constant vector from the fill value and save it // to the data section so we can load it by address. - assert(regSize <= ZMM_REGSIZE_BYTES); - simd64_t constValue; - memset(&constValue, (uint8_t)(src->AsIntCon()->IconValue() & 0xFF), sizeof(simd64_t)); - var_types loadType = compiler->getSIMDTypeForSize(regSize); - genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue)); + ssize_t fill = src->AsIntCon()->IconValue(); + CORINFO_FIELD_HANDLE hnd = nullptr; + var_types loadType = TYP_UNDEF; + if (regSize == XMM_REGSIZE_BYTES) + { + simd16_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd16_t)); + hnd = emit->emitSimd16Const(constValue); + loadType = TYP_SIMD16; + } + else if (regSize == YMM_REGSIZE_BYTES) + { + simd32_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd32_t)); + hnd = emit->emitSimd32Const(constValue); + loadType = TYP_SIMD32; + } + else if (regSize == ZMM_REGSIZE_BYTES) + { + simd64_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd64_t)); + hnd = emit->emitSimd64Const(constValue); + loadType = TYP_SIMD64; + } + else + { + // Unexpected regSize + unreached(); + } + emit->emitIns_R_C(ins_Load(loadType), EA_ATTR(regSize), srcXmmReg, hnd, 0); } -#endif else { // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk. diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index cee10204552fcc..61641da6cf430c 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -347,7 +347,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) bool willUseSimd = canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); #ifndef TARGET_AMD64 // TODO-CQ: Current codegen logic relies on size being a multiple of 8 - // to unroll for 32-bit targets. 
+ // to unroll for 32-bit targets (for INS_movq) willUseSimd = willUseSimd && (size % 8) == 0; #endif if (willUseSimd) From b97e09f9707aaa8604861558a529d4d1181718c6 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Tue, 2 May 2023 22:04:05 +0200 Subject: [PATCH 08/10] clean up --- src/coreclr/jit/codegenxarch.cpp | 62 ++++++-------------------------- src/coreclr/jit/lowerxarch.cpp | 20 ++++++----- 2 files changed, 21 insertions(+), 61 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 6c251529c1f147..0e82149b9f8456 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3134,17 +3134,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) // INITBLK zeroes a struct that contains GC pointers and can be observed by // other threads (i.e. when dstAddr is not an address of a local). // For example, this can happen when initializing a struct field of an object. - const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences(); - -#ifdef TARGET_AMD64 - // On Amd64 the JIT will not use SIMD stores for such structs and instead - // will always allocate a GP register for src node. - const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES); -#else - // On X86 the JIT will use movq for structs that are larger than 16 bytes - // since it is more beneficial than using two mov-s from a GP register. - const bool willUseSimdMov = (size >= 16); -#endif + const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences() && compiler->IsBaselineSimdIsaSupported(); + const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES); if (!src->isContained()) { @@ -3153,11 +3144,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) else { assert(willUseSimdMov); -#ifdef TARGET_AMD64 assert(size >= XMM_REGSIZE_BYTES); -#else - assert(size % 8 == 0); -#endif } emitter* emit = GetEmitter(); @@ -3182,41 +3169,18 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) // than copying the constant from a GPR to a XMM register. emit->emitIns_SIMD_R_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg, srcXmmReg); } +#ifdef FEATURE_SIMD else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { // Populate a constant vector from the fill value and save it // to the data section so we can load it by address. 
- ssize_t fill = src->AsIntCon()->IconValue(); - CORINFO_FIELD_HANDLE hnd = nullptr; - var_types loadType = TYP_UNDEF; - if (regSize == XMM_REGSIZE_BYTES) - { - simd16_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd16_t)); - hnd = emit->emitSimd16Const(constValue); - loadType = TYP_SIMD16; - } - else if (regSize == YMM_REGSIZE_BYTES) - { - simd32_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd32_t)); - hnd = emit->emitSimd32Const(constValue); - loadType = TYP_SIMD32; - } - else if (regSize == ZMM_REGSIZE_BYTES) - { - simd64_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd64_t)); - hnd = emit->emitSimd64Const(constValue); - loadType = TYP_SIMD64; - } - else - { - // Unexpected regSize - unreached(); - } - emit->emitIns_R_C(ins_Load(loadType), EA_ATTR(regSize), srcXmmReg, hnd, 0); + assert(regSize <= ZMM_REGSIZE_BYTES); + simd64_t constValue; + memset(&constValue, (uint8_t)(src->AsIntCon()->IconValue() & 0xFF), sizeof(simd64_t)); + var_types loadType = compiler->getSIMDTypeForSize(regSize); + genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue)); } +#endif else { // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk. @@ -3267,15 +3231,9 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) while (bytesWritten < size) { -#ifdef TARGET_X86 - if (!canUse16BytesSimdMov || (bytesWritten + regSize > size)) - { - simdMov = INS_movq; - regSize = 8; - } -#endif if (bytesWritten + regSize > size) { + // We have a remainder that is smaller than regSize. break; } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 61641da6cf430c..7576f694ec6f57 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -327,7 +327,6 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) } else { - // The fill value of an initblk is interpreted to hold a // value of (unsigned int8) however a constant of any size // may practically reside on the evaluation stack. So extract @@ -345,20 +344,23 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) } bool willUseSimd = canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); -#ifndef TARGET_AMD64 - // TODO-CQ: Current codegen logic relies on size being a multiple of 8 - // to unroll for 32-bit targets (for INS_movq) - willUseSimd = willUseSimd && (size % 8) == 0; -#endif if (willUseSimd) { - // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value - // just to handle the remainder when we can do that with an overlapped SIMD load). +// We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value +// just to handle the remainder when we can do that with an overlapped SIMD load). +#ifdef FEATURE_SIMD src->SetContained(); +#else + if (fill == 0) + { + // When we don't have FEATURE_SIMD we can only handle zeroing + src->SetContained(); + } +#endif } else if (fill == 0) { - // Leave as is - zero doesn't shouldn't be contained when we don't use SIMD. + // Leave as is - zero shouldn't be contained when we don't use SIMD. 
} #ifdef TARGET_AMD64 else if (size >= REGSIZE_BYTES) From 13cf9e099cb9ca953b53439d4186a53322a6592d Mon Sep 17 00:00:00 2001 From: EgorBo Date: Wed, 3 May 2023 16:54:40 +0200 Subject: [PATCH 09/10] address feedback --- src/coreclr/jit/codegen.h | 3 + src/coreclr/jit/codegenxarch.cpp | 269 +++++++++++++++---------------- 2 files changed, 129 insertions(+), 143 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 94d67a06ace088..f038c607c0dff6 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -780,6 +780,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #endif void genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree); +#if defined(FEATURE_SIMD) + void genSetRegToConst(regNumber targetReg, var_types targetType, simd_t val); +#endif void genCodeForTreeNode(GenTree* treeNode); void genCodeForBinary(GenTreeOp* treeNode); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 0e82149b9f8456..f336e9207aced7 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -417,6 +417,123 @@ void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, regSet.verifyRegUsed(reg); } +#if defined(FEATURE_SIMD) +//---------------------------------------------------------------------------------- +// genSetRegToConst: generate code to set target SIMD register to a given constant value +// +// Arguments: +// targetReg - target SIMD register +// targetType - target's type +// simd_t - constant data (its width depends on type) +// +void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t val) +{ + emitter* emit = GetEmitter(); + emitAttr attr = emitTypeSize(targetType); + + switch (targetType) + { + case TYP_SIMD8: + { + simd8_t val8 = *(simd8_t*)&val; + if (val8.IsAllBitsSet()) + { + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); + } + else if (val8.IsZero()) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg); + } + else + { + CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(val8); + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + } + break; + } + case TYP_SIMD12: + { + simd12_t val12 = *(simd12_t*)&val; + if (val12.IsAllBitsSet()) + { + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); + } + else if (val12.IsZero()) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg); + } + else + { + simd16_t val16 = {}; + memcpy(&val16, &val12, sizeof(val12)); + CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16); + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + } + break; + } + case TYP_SIMD16: + { + simd16_t val16 = *(simd16_t*)&val; + if (val16.IsAllBitsSet()) + { + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg); + } + else if (val16.IsZero()) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); + } + else + { + CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16); + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + } + break; + } + case TYP_SIMD32: + { + simd32_t val32 = *(simd32_t*)&val; + if (val32.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg); + } + else if (val32.IsZero()) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); + } + 
else + { + CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(val32); + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + } + break; + } + case TYP_SIMD64: + { + simd64_t val64 = *(simd64_t*)&val; + if (val64.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg, + static_cast(0xFF)); + } + else if (val64.IsZero()) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); + } + else + { + CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(val64); + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + } + break; + } + default: + { + unreached(); + } + } +} +#endif // FEATURE_SIMD + /*********************************************************************************** * * Generate code to set a register 'targetReg' of type 'targetType' to the constant @@ -484,147 +601,12 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre case GT_CNS_VEC: { - GenTreeVecCon* vecCon = tree->AsVecCon(); - - emitter* emit = GetEmitter(); - emitAttr attr = emitTypeSize(targetType); - - if (vecCon->IsAllBitsSet()) - { - switch (attr) - { - case EA_8BYTE: - case EA_16BYTE: - { - emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); - return; - } - -#if defined(FEATURE_SIMD) - case EA_32BYTE: - { - if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2)) - { - emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg); - return; - } - break; - } - - case EA_64BYTE: - { - assert(compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)); - emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg, - static_cast(0xFF)); - return; - } -#endif // FEATURE_SIMD - - default: - { - unreached(); - } - } - } - - if (vecCon->IsZero()) - { - switch (attr) - { - case EA_8BYTE: - case EA_16BYTE: - { - emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg); - return; - } - - case EA_32BYTE: - { - if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)) - { - emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); - return; - } - break; - } - - case EA_64BYTE: - { - if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) - { - emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); - return; - } - break; - } - - default: - { - unreached(); - } - } - } - - switch (tree->TypeGet()) - { #if defined(FEATURE_SIMD) - case TYP_SIMD8: - { - simd8_t constValue; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd8_t)); - - CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(constValue); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); - break; - } - - case TYP_SIMD12: - { - simd16_t constValue = {}; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd12_t)); - - CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(constValue); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); - break; - } - - case TYP_SIMD16: - { - simd16_t constValue; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd16_t)); - - CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(constValue); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); - break; - } - - case TYP_SIMD32: - { - simd32_t constValue; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd32_t)); - - CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(constValue); - 
emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); - break; - } - - case TYP_SIMD64: - { - simd64_t constValue; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd64_t)); - - CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(constValue); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); - break; - } -#endif // FEATURE_SIMD - - default: - { - unreached(); - } - } - + GenTreeVecCon* vecCon = tree->AsVecCon(); + genSetRegToConst(vecCon->GetRegNum(), targetType, vecCon->gtSimdVal); +#else + unreached(); +#endif break; } @@ -3175,10 +3157,11 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) // Populate a constant vector from the fill value and save it // to the data section so we can load it by address. assert(regSize <= ZMM_REGSIZE_BYTES); - simd64_t constValue; - memset(&constValue, (uint8_t)(src->AsIntCon()->IconValue() & 0xFF), sizeof(simd64_t)); + var_types loadType = compiler->getSIMDTypeForSize(regSize); - genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue)); + simd_t vecCon; + memset(&vecCon, (uint8_t)src->AsIntCon()->IconValue(), sizeof(simd_t)); + genSetRegToConst(srcXmmReg, loadType, vecCon); } #endif else From 7dfe72091d767b2140c1f0344ec8ae067f778745 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Wed, 3 May 2023 17:50:22 +0200 Subject: [PATCH 10/10] clean up --- src/coreclr/jit/codegen.h | 2 +- src/coreclr/jit/codegenxarch.cpp | 75 ++++++-------------------------- src/coreclr/jit/lowerxarch.cpp | 19 +++----- 3 files changed, 20 insertions(+), 76 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index f038c607c0dff6..070ce82e6ec22f 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -781,7 +781,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree); #if defined(FEATURE_SIMD) - void genSetRegToConst(regNumber targetReg, var_types targetType, simd_t val); + void genSetRegToConst(regNumber targetReg, var_types targetType, simd_t* val); #endif void genCodeForTreeNode(GenTree* treeNode); void genCodeForBinary(GenTreeOp* treeNode); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index f336e9207aced7..1988aa632cf35b 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -426,7 +426,7 @@ void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, // targetType - target's type // simd_t - constant data (its width depends on type) // -void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t val) +void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t* val) { emitter* emit = GetEmitter(); emitAttr attr = emitTypeSize(targetType); @@ -435,7 +435,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t { case TYP_SIMD8: { - simd8_t val8 = *(simd8_t*)&val; + simd8_t val8 = *(simd8_t*)val; if (val8.IsAllBitsSet()) { emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); @@ -453,7 +453,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } case TYP_SIMD12: { - simd12_t val12 = *(simd12_t*)&val; + simd12_t val12 = *(simd12_t*)val; if (val12.IsAllBitsSet()) { emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); @@ -473,7 +473,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } case TYP_SIMD16: { - simd16_t val16 
= *(simd16_t*)&val; + simd16_t val16 = *(simd16_t*)val; if (val16.IsAllBitsSet()) { emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg); @@ -491,7 +491,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } case TYP_SIMD32: { - simd32_t val32 = *(simd32_t*)&val; + simd32_t val32 = *(simd32_t*)val; if (val32.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2)) { emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg); @@ -509,7 +509,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } case TYP_SIMD64: { - simd64_t val64 = *(simd64_t*)&val; + simd64_t val64 = *(simd64_t*)val; if (val64.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) { emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg, @@ -603,7 +603,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre { #if defined(FEATURE_SIMD) GenTreeVecCon* vecCon = tree->AsVecCon(); - genSetRegToConst(vecCon->GetRegNum(), targetType, vecCon->gtSimdVal); + genSetRegToConst(vecCon->GetRegNum(), targetType, &vecCon->gtSimdVal); #else unreached(); #endif @@ -3134,68 +3134,20 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) assert(size <= INT32_MAX); assert(dstOffset < (INT32_MAX - static_cast(size))); +#ifdef FEATURE_SIMD if (willUseSimdMov) { regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT); - - unsigned regSize = compiler->roundDownSIMDSize(size); + unsigned regSize = compiler->roundDownSIMDSize(size); if (size < ZMM_RECOMMENDED_THRESHOLD) { // Involve ZMM only for large data due to possible downclocking. regSize = min(regSize, YMM_REGSIZE_BYTES); } - - if (src->gtSkipReloadOrCopy()->IsIntegralConst(0)) - { - // If the source is constant 0 then always use xorps, it's faster - // than copying the constant from a GPR to a XMM register. - emit->emitIns_SIMD_R_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg, srcXmmReg); - } -#ifdef FEATURE_SIMD - else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) - { - // Populate a constant vector from the fill value and save it - // to the data section so we can load it by address. - assert(regSize <= ZMM_REGSIZE_BYTES); - - var_types loadType = compiler->getSIMDTypeForSize(regSize); - simd_t vecCon; - memset(&vecCon, (uint8_t)src->AsIntCon()->IconValue(), sizeof(simd_t)); - genSetRegToConst(srcXmmReg, loadType, vecCon); - } -#endif - else - { - // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk. - regSize = min(regSize, YMM_REGSIZE_BYTES); - - if (compiler->compOpportunisticallyDependsOn(InstructionSet_Vector512)) - { - emit->emitIns_R_R(INS_vpbroadcastd_gpr, EA_ATTR(regSize), srcXmmReg, srcIntReg); - } - else if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2)) - { - emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false); - emit->emitIns_R_R(INS_vpbroadcastd, EA_ATTR(regSize), srcXmmReg, srcXmmReg); - } - else - { - emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false); - - emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg); - -#ifdef TARGET_X86 - // For x86, we need one more to convert it from 8 bytes to 16 bytes. 
- emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg); -#endif - - if (regSize == YMM_REGSIZE_BYTES) - { - // Extend the bytes in the lower lanes to the upper lanes - emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1); - } - } - } + var_types loadType = compiler->getSIMDTypeForSize(regSize); + simd_t vecCon; + memset(&vecCon, (uint8_t)src->AsIntCon()->IconValue(), sizeof(simd_t)); + genSetRegToConst(srcXmmReg, loadType, &vecCon); instruction simdMov = simdUnalignedMovIns(); unsigned bytesWritten = 0; @@ -3239,6 +3191,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) size = 0; } } +#endif // FEATURE_SIMD assert((srcIntReg != REG_NA) || (size == 0)); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 7576f694ec6f57..d780773abba69a 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -336,27 +336,18 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; - const bool canUseSimd = !blkNode->IsOnHeapAndContainsReferences(); + const bool canUseSimd = !blkNode->IsOnHeapAndContainsReferences() && comp->IsBaselineSimdIsaSupported(); if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, canUseSimd)) { // It turns out we can't use SIMD so the default threshold is too big goto TOO_BIG_TO_UNROLL; } - - bool willUseSimd = canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); - if (willUseSimd) + if (canUseSimd && (size >= XMM_REGSIZE_BYTES)) { -// We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value -// just to handle the remainder when we can do that with an overlapped SIMD load). -#ifdef FEATURE_SIMD + // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register + // with a fill value just to handle the remainder when we can do that with + // an overlapped SIMD load). src->SetContained(); -#else - if (fill == 0) - { - // When we don't have FEATURE_SIMD we can only handle zeroing - src->SetContained(); - } -#endif } else if (fill == 0) {
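
The snippet below is a standalone C++ sketch of the unrolled-memset strategy these patches converge on: broadcast the fill byte into one vector-sized constant, store it at full vector width while a whole vector still fits, then cover whatever is left with a final store that deliberately overlaps bytes already written. It is not JIT code — FillBlockUnrolled and VEC_SIZE are made-up names for illustration, the width is pinned to 16 bytes instead of the XMM/YMM/ZMM size the JIT picks via roundDownSIMDSize/roundUpSIMDSize, and plain memcpy stands in for the unaligned SIMD moves emitted by emitSimdMovs().

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for the vector width the JIT actually selects (XMM/YMM/ZMM).
constexpr size_t VEC_SIZE = 16;

// Sketch of the unrolled initblk: whole-"vector" stores followed by one
// overlapped store that finishes the tail without falling back to scalar moves.
static void FillBlockUnrolled(uint8_t* dst, uint8_t fill, size_t size)
{
    // Mirrors the willUseSimdMov precondition (size >= XMM_REGSIZE_BYTES);
    // the overlapped tail store is only legal when at least one full vector fits.
    assert(size >= VEC_SIZE);

    // "Broadcast" the fill byte into a vector-sized constant, analogous to
    // memset-ing a simd_t and materializing it in srcXmmReg.
    uint8_t vec[VEC_SIZE];
    std::memset(vec, fill, VEC_SIZE);

    size_t offset = 0;

    // Main loop: full-width stores while a whole vector still fits.
    while (offset + VEC_SIZE <= size)
    {
        std::memcpy(dst + offset, vec, VEC_SIZE);
        offset += VEC_SIZE;
    }

    // Remainder: rewind so the last store ends exactly at dst + size,
    // overlapping bytes that already hold the fill value.
    if (offset < size)
    {
        std::memcpy(dst + (size - VEC_SIZE), vec, VEC_SIZE);
    }
}

int main()
{
    uint8_t buf[45];
    FillBlockUnrolled(buf, 0xAB, sizeof(buf));

    for (uint8_t b : buf)
    {
        if (b != 0xAB)
        {
            std::puts("mismatch");
            return 1;
        }
    }
    std::puts("ok");
    return 0;
}

In the JIT the same shape appears as the emitSimdMovs() loop plus the "rewind dstOffset" overlapped store for the remainder, with roundUpSIMDSize choosing the smallest vector that still covers the tail; keeping the fill value in a vector constant is also why lowering can contain the source operand instead of occupying a GPR just for the remainder.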