From a4ee9948883bbef580d54241d4200a0989e1f94a Mon Sep 17 00:00:00 2001 From: EgorBo Date: Fri, 28 Apr 2023 03:29:16 +0200 Subject: [PATCH 01/10] Clean up BLK unrolling on xarch --- src/coreclr/jit/codegenxarch.cpp | 65 ++++++++++++++++++-------------- src/coreclr/jit/lowerxarch.cpp | 46 ++++++++-------------- 2 files changed, 53 insertions(+), 58 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index a864529c9e0045..fe1cbfbf4b6b81 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3153,8 +3153,6 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) } else { - // If src is contained then it must be 0. - assert(src->IsIntegralConst(0)); assert(willUseSimdMov); #ifdef TARGET_AMD64 assert(size >= XMM_REGSIZE_BYTES); @@ -3187,6 +3185,35 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg); zeroing = true; } + else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) + { + ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; + CORINFO_FIELD_HANDLE hnd = nullptr; + if (regSize == XMM_REGSIZE_BYTES) + { + simd16_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd16_t)); + hnd = emit->emitSimd16Const(constValue); + } + else if (regSize == YMM_REGSIZE_BYTES) + { + simd32_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd32_t)); + hnd = emit->emitSimd32Const(constValue); + } + else if (regSize == ZMM_REGSIZE_BYTES) + { + simd64_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd64_t)); + hnd = emit->emitSimd64Const(constValue); + } + else + { + // Unexpected regSize + unreached(); + } + emit->emitIns_R_C(ins_Load(TYP_SIMD32), EA_ATTR(regSize), srcXmmReg, hnd, 0); + } else { // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk. 
@@ -3237,38 +3264,20 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) emitSimdMovs(); dstOffset += regSize; bytesWritten += regSize; - - if (!zeroing) - { - assert(regSize <= YMM_REGSIZE_BYTES); - } - - if (!zeroing && regSize == YMM_REGSIZE_BYTES && size - bytesWritten < YMM_REGSIZE_BYTES) - { - regSize = XMM_REGSIZE_BYTES; - } } size -= bytesWritten; - // Handle the remainder by overlapping with previously processed data (only for zeroing) - if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES)) + // Handle the remainder by overlapping with previously processed data + if ((size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES)) { - if (isPow2(size) && (size <= REGSIZE_BYTES)) - { - // For sizes like 1,2,4 and 8 we delegate handling to normal stores - // because that will be a single instruction that is smaller than SIMD mov - } - else - { - // Get optimal register size to cover the whole remainder (with overlapping) - regSize = compiler->roundUpSIMDSize(size); + // Get optimal register size to cover the whole remainder (with overlapping) + regSize = compiler->roundUpSIMDSize(size); - // Rewind dstOffset so we can fit a vector for the while remainder - dstOffset -= (regSize - size); - emitSimdMovs(); - size = 0; - } + // Rewind dstOffset so we can fit a vector for the while remainder + dstOffset -= (regSize - size); + emitSimdMovs(); + size = 0; } } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index ca3c64dd2ff24a..871c4b716b8a90 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -337,38 +337,24 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; - if (fill == 0) + const bool canUseSimd = !blkNode->IsOnHeapAndContainsReferences(); + if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, canUseSimd)) { - if (size >= XMM_REGSIZE_BYTES) - { - const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences(); -#ifdef TARGET_AMD64 + // It turns out we can't use SIMD so the default threshold is too big + goto TOO_BIG_TO_UNROLL; + } - bool willUseOnlySimdMov = size % XMM_REGSIZE_BYTES == 0; - if (!willUseOnlySimdMov) - { - // If we have a remainder we still might only use SIMD to process it (via overlapping) - // unless it's more efficient to do that via scalar op (for sizes 1,2,4 and 8) - const unsigned remainder = size % XMM_REGSIZE_BYTES; - if (!isPow2(remainder) || (remainder > REGSIZE_BYTES)) - { - willUseOnlySimdMov = true; - } - } -#else - const bool willUseOnlySimdMov = (size % 8 == 0); -#endif - if (willUseOnlySimdMov && canUse16BytesSimdMov) - { - src->SetContained(); - } - else if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, - /*canUseSimd*/ canUse16BytesSimdMov)) - { - // It turns out we can't use SIMD so the default threshold is too big - goto TOO_BIG_TO_UNROLL; - } - } + const bool willUseSimd = + canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); + if (willUseSimd) + { + // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value + // just to handle the remainder when we can do that with an overlapped SIMD load). + src->SetContained(); + } + else if (fill == 0) + { + // Leave as is - zero doesn't shouldn't be contained when we don't use SIMD. 
} #ifdef TARGET_AMD64 else if (size >= REGSIZE_BYTES) From 0463672f27f762d2944f5b67b7e1209e4a746ebd Mon Sep 17 00:00:00 2001 From: EgorBo Date: Fri, 28 Apr 2023 03:57:17 +0200 Subject: [PATCH 02/10] fix build --- src/coreclr/jit/codegenxarch.cpp | 2 -- src/coreclr/jit/lowerxarch.cpp | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index fe1cbfbf4b6b81..2f9414359eab9f 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3177,13 +3177,11 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) regSize = min(regSize, YMM_REGSIZE_BYTES); } - bool zeroing = false; if (src->gtSkipReloadOrCopy()->IsIntegralConst(0)) { // If the source is constant 0 then always use xorps, it's faster // than copying the constant from a GPR to a XMM register. emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg); - zeroing = true; } else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 871c4b716b8a90..c00b1302aa6356 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -345,7 +345,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) } const bool willUseSimd = - canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); + canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); if (willUseSimd) { // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value From c5cc7fefc76daa3aad9f7da82eead4484296dd6c Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Fri, 28 Apr 2023 11:52:15 +0200 Subject: [PATCH 03/10] Update codegenxarch.cpp --- src/coreclr/jit/codegenxarch.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 2f9414359eab9f..32f938671859ee 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3185,32 +3185,36 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) } else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { - ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; - CORINFO_FIELD_HANDLE hnd = nullptr; + ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; + CORINFO_FIELD_HANDLE hnd = nullptr; + var_types loadType = TYP_UNDEF; if (regSize == XMM_REGSIZE_BYTES) { simd16_t constValue; memset(&constValue, (uint8_t)fill, sizeof(simd16_t)); - hnd = emit->emitSimd16Const(constValue); + hnd = emit->emitSimd16Const(constValue); + loadType = TYP_SIMD16; } else if (regSize == YMM_REGSIZE_BYTES) { simd32_t constValue; memset(&constValue, (uint8_t)fill, sizeof(simd32_t)); - hnd = emit->emitSimd32Const(constValue); + hnd = emit->emitSimd32Const(constValue); + loadType = TYP_SIMD32; } else if (regSize == ZMM_REGSIZE_BYTES) { simd64_t constValue; memset(&constValue, (uint8_t)fill, sizeof(simd64_t)); - hnd = emit->emitSimd64Const(constValue); + hnd = emit->emitSimd64Const(constValue); + loadType = TYP_SIMD64; } else { // Unexpected regSize unreached(); } - emit->emitIns_R_C(ins_Load(TYP_SIMD32), EA_ATTR(regSize), srcXmmReg, hnd, 0); + emit->emitIns_R_C(ins_Load(loadType), EA_ATTR(regSize), srcXmmReg, hnd, 0); } else { From c6d174a79c87349969dbfdcb818bc2246bab4ced Mon Sep 17 00:00:00 2001 From: EgorBo Date: Fri, 28 Apr 2023 23:54:41 +0200 Subject: [PATCH 04/10] fix 32bit --- src/coreclr/jit/lowerxarch.cpp | 8 ++++++-- 1 file 
changed, 6 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index c00b1302aa6356..cee10204552fcc 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -344,8 +344,12 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) goto TOO_BIG_TO_UNROLL; } - const bool willUseSimd = - canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); + bool willUseSimd = canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); +#ifndef TARGET_AMD64 + // TODO-CQ: Current codegen logic relies on size being a multiple of 8 + // to unroll for 32-bit targets. + willUseSimd = willUseSimd && (size % 8) == 0; +#endif if (willUseSimd) { // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value From e95ad54e14c15ba5d62d4d6363278d2774cc738d Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 29 Apr 2023 00:07:25 +0200 Subject: [PATCH 05/10] Simplify --- src/coreclr/jit/codegenxarch.cpp | 37 ++++++-------------------------- 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 32f938671859ee..6793347f729992 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3185,36 +3185,13 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) } else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { - ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; - CORINFO_FIELD_HANDLE hnd = nullptr; - var_types loadType = TYP_UNDEF; - if (regSize == XMM_REGSIZE_BYTES) - { - simd16_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd16_t)); - hnd = emit->emitSimd16Const(constValue); - loadType = TYP_SIMD16; - } - else if (regSize == YMM_REGSIZE_BYTES) - { - simd32_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd32_t)); - hnd = emit->emitSimd32Const(constValue); - loadType = TYP_SIMD32; - } - else if (regSize == ZMM_REGSIZE_BYTES) - { - simd64_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd64_t)); - hnd = emit->emitSimd64Const(constValue); - loadType = TYP_SIMD64; - } - else - { - // Unexpected regSize - unreached(); - } - emit->emitIns_R_C(ins_Load(loadType), EA_ATTR(regSize), srcXmmReg, hnd, 0); + // Populate a constant vector from the fill value and save it + // to the data section so we can load it by address. + assert(regSize <= ZMM_REGSIZE_BYTES); + simd64_t constValue; + memset(&constValue, (uint8_t)(src->AsIntCon()->IconValue() & 0xFF), sizeof(simd64_t)); + var_types loadType = compiler->getSIMDTypeForSize(regSize); + genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue)); } else { From 2c7c62f398663570138966deb354a91eed0124bf Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 29 Apr 2023 00:42:39 +0200 Subject: [PATCH 06/10] fix build --- src/coreclr/jit/codegenxarch.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 6793347f729992..6def77b643f475 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3183,6 +3183,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) // than copying the constant from a GPR to a XMM register. 
emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg); } +#ifdef FEATURE_SIMD else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { // Populate a constant vector from the fill value and save it @@ -3193,6 +3194,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) var_types loadType = compiler->getSIMDTypeForSize(regSize); genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue)); } +#endif else { // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk. From 6add0913676d0ecb20525819417e3ecf980dc988 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 29 Apr 2023 12:03:10 +0200 Subject: [PATCH 07/10] revert some changes --- src/coreclr/jit/codegenxarch.cpp | 37 ++++++++++++++++++++++++++------ src/coreclr/jit/lowerxarch.cpp | 2 +- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 6def77b643f475..8c8af5faee92c1 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3183,18 +3183,41 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) // than copying the constant from a GPR to a XMM register. emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg); } -#ifdef FEATURE_SIMD else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { // Populate a constant vector from the fill value and save it // to the data section so we can load it by address. - assert(regSize <= ZMM_REGSIZE_BYTES); - simd64_t constValue; - memset(&constValue, (uint8_t)(src->AsIntCon()->IconValue() & 0xFF), sizeof(simd64_t)); - var_types loadType = compiler->getSIMDTypeForSize(regSize); - genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue)); + ssize_t fill = src->AsIntCon()->IconValue(); + CORINFO_FIELD_HANDLE hnd = nullptr; + var_types loadType = TYP_UNDEF; + if (regSize == XMM_REGSIZE_BYTES) + { + simd16_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd16_t)); + hnd = emit->emitSimd16Const(constValue); + loadType = TYP_SIMD16; + } + else if (regSize == YMM_REGSIZE_BYTES) + { + simd32_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd32_t)); + hnd = emit->emitSimd32Const(constValue); + loadType = TYP_SIMD32; + } + else if (regSize == ZMM_REGSIZE_BYTES) + { + simd64_t constValue; + memset(&constValue, (uint8_t)fill, sizeof(simd64_t)); + hnd = emit->emitSimd64Const(constValue); + loadType = TYP_SIMD64; + } + else + { + // Unexpected regSize + unreached(); + } + emit->emitIns_R_C(ins_Load(loadType), EA_ATTR(regSize), srcXmmReg, hnd, 0); } -#endif else { // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk. diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index cee10204552fcc..61641da6cf430c 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -347,7 +347,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) bool willUseSimd = canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); #ifndef TARGET_AMD64 // TODO-CQ: Current codegen logic relies on size being a multiple of 8 - // to unroll for 32-bit targets. 
+ // to unroll for 32-bit targets (for INS_movq) willUseSimd = willUseSimd && (size % 8) == 0; #endif if (willUseSimd) From b97e09f9707aaa8604861558a529d4d1181718c6 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Tue, 2 May 2023 22:04:05 +0200 Subject: [PATCH 08/10] clean up --- src/coreclr/jit/codegenxarch.cpp | 62 ++++++-------------------------- src/coreclr/jit/lowerxarch.cpp | 20 ++++++----- 2 files changed, 21 insertions(+), 61 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 6c251529c1f147..0e82149b9f8456 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3134,17 +3134,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) // INITBLK zeroes a struct that contains GC pointers and can be observed by // other threads (i.e. when dstAddr is not an address of a local). // For example, this can happen when initializing a struct field of an object. - const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences(); - -#ifdef TARGET_AMD64 - // On Amd64 the JIT will not use SIMD stores for such structs and instead - // will always allocate a GP register for src node. - const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES); -#else - // On X86 the JIT will use movq for structs that are larger than 16 bytes - // since it is more beneficial than using two mov-s from a GP register. - const bool willUseSimdMov = (size >= 16); -#endif + const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences() && compiler->IsBaselineSimdIsaSupported(); + const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES); if (!src->isContained()) { @@ -3153,11 +3144,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) else { assert(willUseSimdMov); -#ifdef TARGET_AMD64 assert(size >= XMM_REGSIZE_BYTES); -#else - assert(size % 8 == 0); -#endif } emitter* emit = GetEmitter(); @@ -3182,41 +3169,18 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) // than copying the constant from a GPR to a XMM register. emit->emitIns_SIMD_R_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg, srcXmmReg); } +#ifdef FEATURE_SIMD else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) { // Populate a constant vector from the fill value and save it // to the data section so we can load it by address. 
- ssize_t fill = src->AsIntCon()->IconValue(); - CORINFO_FIELD_HANDLE hnd = nullptr; - var_types loadType = TYP_UNDEF; - if (regSize == XMM_REGSIZE_BYTES) - { - simd16_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd16_t)); - hnd = emit->emitSimd16Const(constValue); - loadType = TYP_SIMD16; - } - else if (regSize == YMM_REGSIZE_BYTES) - { - simd32_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd32_t)); - hnd = emit->emitSimd32Const(constValue); - loadType = TYP_SIMD32; - } - else if (regSize == ZMM_REGSIZE_BYTES) - { - simd64_t constValue; - memset(&constValue, (uint8_t)fill, sizeof(simd64_t)); - hnd = emit->emitSimd64Const(constValue); - loadType = TYP_SIMD64; - } - else - { - // Unexpected regSize - unreached(); - } - emit->emitIns_R_C(ins_Load(loadType), EA_ATTR(regSize), srcXmmReg, hnd, 0); + assert(regSize <= ZMM_REGSIZE_BYTES); + simd64_t constValue; + memset(&constValue, (uint8_t)(src->AsIntCon()->IconValue() & 0xFF), sizeof(simd64_t)); + var_types loadType = compiler->getSIMDTypeForSize(regSize); + genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue)); } +#endif else { // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk. @@ -3267,15 +3231,9 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) while (bytesWritten < size) { -#ifdef TARGET_X86 - if (!canUse16BytesSimdMov || (bytesWritten + regSize > size)) - { - simdMov = INS_movq; - regSize = 8; - } -#endif if (bytesWritten + regSize > size) { + // We have a remainder that is smaller than regSize. break; } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 61641da6cf430c..7576f694ec6f57 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -327,7 +327,6 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) } else { - // The fill value of an initblk is interpreted to hold a // value of (unsigned int8) however a constant of any size // may practically reside on the evaluation stack. So extract @@ -345,20 +344,23 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) } bool willUseSimd = canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); -#ifndef TARGET_AMD64 - // TODO-CQ: Current codegen logic relies on size being a multiple of 8 - // to unroll for 32-bit targets (for INS_movq) - willUseSimd = willUseSimd && (size % 8) == 0; -#endif if (willUseSimd) { - // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value - // just to handle the remainder when we can do that with an overlapped SIMD load). +// We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value +// just to handle the remainder when we can do that with an overlapped SIMD load). +#ifdef FEATURE_SIMD src->SetContained(); +#else + if (fill == 0) + { + // When we don't have FEATURE_SIMD we can only handle zeroing + src->SetContained(); + } +#endif } else if (fill == 0) { - // Leave as is - zero doesn't shouldn't be contained when we don't use SIMD. + // Leave as is - zero shouldn't be contained when we don't use SIMD. 
} #ifdef TARGET_AMD64 else if (size >= REGSIZE_BYTES) From 13cf9e099cb9ca953b53439d4186a53322a6592d Mon Sep 17 00:00:00 2001 From: EgorBo Date: Wed, 3 May 2023 16:54:40 +0200 Subject: [PATCH 09/10] address feedback --- src/coreclr/jit/codegen.h | 3 + src/coreclr/jit/codegenxarch.cpp | 269 +++++++++++++++---------------- 2 files changed, 129 insertions(+), 143 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 94d67a06ace088..f038c607c0dff6 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -780,6 +780,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #endif void genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree); +#if defined(FEATURE_SIMD) + void genSetRegToConst(regNumber targetReg, var_types targetType, simd_t val); +#endif void genCodeForTreeNode(GenTree* treeNode); void genCodeForBinary(GenTreeOp* treeNode); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 0e82149b9f8456..f336e9207aced7 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -417,6 +417,123 @@ void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, regSet.verifyRegUsed(reg); } +#if defined(FEATURE_SIMD) +//---------------------------------------------------------------------------------- +// genSetRegToConst: generate code to set target SIMD register to a given constant value +// +// Arguments: +// targetReg - target SIMD register +// targetType - target's type +// simd_t - constant data (its width depends on type) +// +void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t val) +{ + emitter* emit = GetEmitter(); + emitAttr attr = emitTypeSize(targetType); + + switch (targetType) + { + case TYP_SIMD8: + { + simd8_t val8 = *(simd8_t*)&val; + if (val8.IsAllBitsSet()) + { + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); + } + else if (val8.IsZero()) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg); + } + else + { + CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(val8); + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + } + break; + } + case TYP_SIMD12: + { + simd12_t val12 = *(simd12_t*)&val; + if (val12.IsAllBitsSet()) + { + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); + } + else if (val12.IsZero()) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg); + } + else + { + simd16_t val16 = {}; + memcpy(&val16, &val12, sizeof(val12)); + CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16); + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + } + break; + } + case TYP_SIMD16: + { + simd16_t val16 = *(simd16_t*)&val; + if (val16.IsAllBitsSet()) + { + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg); + } + else if (val16.IsZero()) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); + } + else + { + CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16); + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + } + break; + } + case TYP_SIMD32: + { + simd32_t val32 = *(simd32_t*)&val; + if (val32.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg); + } + else if (val32.IsZero()) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); + } + 
else + { + CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(val32); + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + } + break; + } + case TYP_SIMD64: + { + simd64_t val64 = *(simd64_t*)&val; + if (val64.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg, + static_cast(0xFF)); + } + else if (val64.IsZero()) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); + } + else + { + CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(val64); + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + } + break; + } + default: + { + unreached(); + } + } +} +#endif // FEATURE_SIMD + /*********************************************************************************** * * Generate code to set a register 'targetReg' of type 'targetType' to the constant @@ -484,147 +601,12 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre case GT_CNS_VEC: { - GenTreeVecCon* vecCon = tree->AsVecCon(); - - emitter* emit = GetEmitter(); - emitAttr attr = emitTypeSize(targetType); - - if (vecCon->IsAllBitsSet()) - { - switch (attr) - { - case EA_8BYTE: - case EA_16BYTE: - { - emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); - return; - } - -#if defined(FEATURE_SIMD) - case EA_32BYTE: - { - if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2)) - { - emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg); - return; - } - break; - } - - case EA_64BYTE: - { - assert(compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)); - emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg, - static_cast(0xFF)); - return; - } -#endif // FEATURE_SIMD - - default: - { - unreached(); - } - } - } - - if (vecCon->IsZero()) - { - switch (attr) - { - case EA_8BYTE: - case EA_16BYTE: - { - emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg); - return; - } - - case EA_32BYTE: - { - if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)) - { - emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); - return; - } - break; - } - - case EA_64BYTE: - { - if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) - { - emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); - return; - } - break; - } - - default: - { - unreached(); - } - } - } - - switch (tree->TypeGet()) - { #if defined(FEATURE_SIMD) - case TYP_SIMD8: - { - simd8_t constValue; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd8_t)); - - CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(constValue); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); - break; - } - - case TYP_SIMD12: - { - simd16_t constValue = {}; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd12_t)); - - CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(constValue); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); - break; - } - - case TYP_SIMD16: - { - simd16_t constValue; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd16_t)); - - CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(constValue); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); - break; - } - - case TYP_SIMD32: - { - simd32_t constValue; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd32_t)); - - CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(constValue); - 
emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); - break; - } - - case TYP_SIMD64: - { - simd64_t constValue; - memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd64_t)); - - CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(constValue); - emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); - break; - } -#endif // FEATURE_SIMD - - default: - { - unreached(); - } - } - + GenTreeVecCon* vecCon = tree->AsVecCon(); + genSetRegToConst(vecCon->GetRegNum(), targetType, vecCon->gtSimdVal); +#else + unreached(); +#endif break; } @@ -3175,10 +3157,11 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) // Populate a constant vector from the fill value and save it // to the data section so we can load it by address. assert(regSize <= ZMM_REGSIZE_BYTES); - simd64_t constValue; - memset(&constValue, (uint8_t)(src->AsIntCon()->IconValue() & 0xFF), sizeof(simd64_t)); + var_types loadType = compiler->getSIMDTypeForSize(regSize); - genSetRegToConst(srcXmmReg, loadType, compiler->gtNewVconNode(loadType, &constValue)); + simd_t vecCon; + memset(&vecCon, (uint8_t)src->AsIntCon()->IconValue(), sizeof(simd_t)); + genSetRegToConst(srcXmmReg, loadType, vecCon); } #endif else From 7dfe72091d767b2140c1f0344ec8ae067f778745 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Wed, 3 May 2023 17:50:22 +0200 Subject: [PATCH 10/10] clean up --- src/coreclr/jit/codegen.h | 2 +- src/coreclr/jit/codegenxarch.cpp | 75 ++++++-------------------------- src/coreclr/jit/lowerxarch.cpp | 19 +++----- 3 files changed, 20 insertions(+), 76 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index f038c607c0dff6..070ce82e6ec22f 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -781,7 +781,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree); #if defined(FEATURE_SIMD) - void genSetRegToConst(regNumber targetReg, var_types targetType, simd_t val); + void genSetRegToConst(regNumber targetReg, var_types targetType, simd_t* val); #endif void genCodeForTreeNode(GenTree* treeNode); void genCodeForBinary(GenTreeOp* treeNode); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index f336e9207aced7..1988aa632cf35b 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -426,7 +426,7 @@ void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, // targetType - target's type // simd_t - constant data (its width depends on type) // -void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t val) +void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t* val) { emitter* emit = GetEmitter(); emitAttr attr = emitTypeSize(targetType); @@ -435,7 +435,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t { case TYP_SIMD8: { - simd8_t val8 = *(simd8_t*)&val; + simd8_t val8 = *(simd8_t*)val; if (val8.IsAllBitsSet()) { emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); @@ -453,7 +453,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } case TYP_SIMD12: { - simd12_t val12 = *(simd12_t*)&val; + simd12_t val12 = *(simd12_t*)val; if (val12.IsAllBitsSet()) { emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg); @@ -473,7 +473,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } case TYP_SIMD16: { - simd16_t val16 
= *(simd16_t*)&val; + simd16_t val16 = *(simd16_t*)val; if (val16.IsAllBitsSet()) { emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg); @@ -491,7 +491,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } case TYP_SIMD32: { - simd32_t val32 = *(simd32_t*)&val; + simd32_t val32 = *(simd32_t*)val; if (val32.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2)) { emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg); @@ -509,7 +509,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t } case TYP_SIMD64: { - simd64_t val64 = *(simd64_t*)&val; + simd64_t val64 = *(simd64_t*)val; if (val64.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) { emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg, @@ -603,7 +603,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre { #if defined(FEATURE_SIMD) GenTreeVecCon* vecCon = tree->AsVecCon(); - genSetRegToConst(vecCon->GetRegNum(), targetType, vecCon->gtSimdVal); + genSetRegToConst(vecCon->GetRegNum(), targetType, &vecCon->gtSimdVal); #else unreached(); #endif @@ -3134,68 +3134,20 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) assert(size <= INT32_MAX); assert(dstOffset < (INT32_MAX - static_cast(size))); +#ifdef FEATURE_SIMD if (willUseSimdMov) { regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT); - - unsigned regSize = compiler->roundDownSIMDSize(size); + unsigned regSize = compiler->roundDownSIMDSize(size); if (size < ZMM_RECOMMENDED_THRESHOLD) { // Involve ZMM only for large data due to possible downclocking. regSize = min(regSize, YMM_REGSIZE_BYTES); } - - if (src->gtSkipReloadOrCopy()->IsIntegralConst(0)) - { - // If the source is constant 0 then always use xorps, it's faster - // than copying the constant from a GPR to a XMM register. - emit->emitIns_SIMD_R_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg, srcXmmReg); - } -#ifdef FEATURE_SIMD - else if (src->gtSkipReloadOrCopy()->IsIntegralConst()) - { - // Populate a constant vector from the fill value and save it - // to the data section so we can load it by address. - assert(regSize <= ZMM_REGSIZE_BYTES); - - var_types loadType = compiler->getSIMDTypeForSize(regSize); - simd_t vecCon; - memset(&vecCon, (uint8_t)src->AsIntCon()->IconValue(), sizeof(simd_t)); - genSetRegToConst(srcXmmReg, loadType, vecCon); - } -#endif - else - { - // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk. - regSize = min(regSize, YMM_REGSIZE_BYTES); - - if (compiler->compOpportunisticallyDependsOn(InstructionSet_Vector512)) - { - emit->emitIns_R_R(INS_vpbroadcastd_gpr, EA_ATTR(regSize), srcXmmReg, srcIntReg); - } - else if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2)) - { - emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false); - emit->emitIns_R_R(INS_vpbroadcastd, EA_ATTR(regSize), srcXmmReg, srcXmmReg); - } - else - { - emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false); - - emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg); - -#ifdef TARGET_X86 - // For x86, we need one more to convert it from 8 bytes to 16 bytes. 
- emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg); -#endif - - if (regSize == YMM_REGSIZE_BYTES) - { - // Extend the bytes in the lower lanes to the upper lanes - emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1); - } - } - } + var_types loadType = compiler->getSIMDTypeForSize(regSize); + simd_t vecCon; + memset(&vecCon, (uint8_t)src->AsIntCon()->IconValue(), sizeof(simd_t)); + genSetRegToConst(srcXmmReg, loadType, &vecCon); instruction simdMov = simdUnalignedMovIns(); unsigned bytesWritten = 0; @@ -3239,6 +3191,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) size = 0; } } +#endif // FEATURE_SIMD assert((srcIntReg != REG_NA) || (size == 0)); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 7576f694ec6f57..d780773abba69a 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -336,27 +336,18 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) ssize_t fill = src->AsIntCon()->IconValue() & 0xFF; - const bool canUseSimd = !blkNode->IsOnHeapAndContainsReferences(); + const bool canUseSimd = !blkNode->IsOnHeapAndContainsReferences() && comp->IsBaselineSimdIsaSupported(); if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, canUseSimd)) { // It turns out we can't use SIMD so the default threshold is too big goto TOO_BIG_TO_UNROLL; } - - bool willUseSimd = canUseSimd && (size >= XMM_REGSIZE_BYTES) && comp->IsBaselineSimdIsaSupported(); - if (willUseSimd) + if (canUseSimd && (size >= XMM_REGSIZE_BYTES)) { -// We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register with a fill value -// just to handle the remainder when we can do that with an overlapped SIMD load). -#ifdef FEATURE_SIMD + // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register + // with a fill value just to handle the remainder when we can do that with + // an overlapped SIMD load). src->SetContained(); -#else - if (fill == 0) - { - // When we don't have FEATURE_SIMD we can only handle zeroing - src->SetContained(); - } -#endif } else if (fill == 0) {
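
The snippet below is a standalone C++ sketch of the unrolled-memset strategy these patches converge on: broadcast the fill byte into one vector-sized constant, store it at full vector width while a whole vector still fits, then cover whatever is left with a final store that deliberately overlaps bytes already written. It is not JIT code — FillBlockUnrolled and VEC_SIZE are made-up names for illustration, the width is pinned to 16 bytes instead of the XMM/YMM/ZMM size the JIT picks via roundDownSIMDSize/roundUpSIMDSize, and plain memcpy stands in for the unaligned SIMD moves emitted by emitSimdMovs().

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for the vector width the JIT actually selects (XMM/YMM/ZMM).
constexpr size_t VEC_SIZE = 16;

// Sketch of the unrolled initblk: whole-"vector" stores followed by one
// overlapped store that finishes the tail without falling back to scalar moves.
static void FillBlockUnrolled(uint8_t* dst, uint8_t fill, size_t size)
{
    // Mirrors the willUseSimdMov precondition (size >= XMM_REGSIZE_BYTES);
    // the overlapped tail store is only legal when at least one full vector fits.
    assert(size >= VEC_SIZE);

    // "Broadcast" the fill byte into a vector-sized constant, analogous to
    // memset-ing a simd_t and materializing it in srcXmmReg.
    uint8_t vec[VEC_SIZE];
    std::memset(vec, fill, VEC_SIZE);

    size_t offset = 0;

    // Main loop: full-width stores while a whole vector still fits.
    while (offset + VEC_SIZE <= size)
    {
        std::memcpy(dst + offset, vec, VEC_SIZE);
        offset += VEC_SIZE;
    }

    // Remainder: rewind so the last store ends exactly at dst + size,
    // overlapping bytes that already hold the fill value.
    if (offset < size)
    {
        std::memcpy(dst + (size - VEC_SIZE), vec, VEC_SIZE);
    }
}

int main()
{
    uint8_t buf[45];
    FillBlockUnrolled(buf, 0xAB, sizeof(buf));

    for (uint8_t b : buf)
    {
        if (b != 0xAB)
        {
            std::puts("mismatch");
            return 1;
        }
    }
    std::puts("ok");
    return 0;
}

In the JIT the same shape appears as the emitSimdMovs() loop plus the "rewind dstOffset" overlapped store for the remainder, with roundUpSIMDSize choosing the smallest vector that still covers the tail; keeping the fill value in a vector constant is also why lowering can contain the source operand instead of occupying a GPR just for the remainder.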