From 6c9a07a209aa036f158f5f57d4e5d9fa679a8f24 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Thu, 27 May 2021 11:12:40 -0700 Subject: [PATCH 1/3] Construct ASG(OBJ(addr), 0) for structs that have GC fields and ASG(BLK(addr), 0) for other types. This allows the JIT to maintain information about the class layout. --- src/coreclr/jit/importer.cpp | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index f14d8d0cd40d8b..fc8544f9138ba6 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -16286,11 +16286,25 @@ void Compiler::impImportBlockCode(BasicBlock* block) "type operand incompatible with type of address"); } - size = info.compCompHnd->getClassSize(resolvedToken.hClass); // Size - op2 = gtNewIconNode(0); // Value - op1 = impPopStack().val; // Dest - op1 = gtNewBlockVal(op1, size); - op1 = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false); + op2 = gtNewIconNode(0); // Value + op1 = impPopStack().val; // Dest + + if (eeIsValueClass(resolvedToken.hClass)) + { + op1 = gtNewStructVal(resolvedToken.hClass, op1); + if (op1->OperIs(GT_OBJ)) + { + gtSetObjGcInfo(op1->AsObj()); + } + } + else + { + size = info.compCompHnd->getClassSize(resolvedToken.hClass); + assert(size == TARGET_POINTER_SIZE); + op1 = gtNewBlockVal(op1, size); + } + + op1 = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false); goto SPILL_APPEND; case CEE_INITBLK: From d9c78b63e505533605502c9ffb6b5a019bb105b7 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Fri, 28 May 2021 14:04:18 -0700 Subject: [PATCH 2/3] X64: Always use a sequence of mov [m64],r64 when a struct has GC fields and is not guaranteed to be on the stack X86: 1) Use movq [m64],xmm to zero 8-bytes on the stack instead of mov [m32],r32; mov [m32+4],r32 2) Do not use movdqu when a struct has GC fields and is not guaranteed to be on the stack --- src/coreclr/jit/codegenxarch.cpp | 57 +++++++++++++++++++++++++------- src/coreclr/jit/gentree.h | 7 ++++ src/coreclr/jit/lowerxarch.cpp | 9 +++-- src/coreclr/jit/lsraxarch.cpp | 2 +- 4 files changed, 60 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 7fd371e103e194..e43fbe22a3f800 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2752,26 +2752,42 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) src = src->AsUnOp()->gtGetOp1(); } + unsigned size = node->GetLayout()->GetSize(); + + // An SSE mov that stores data larger than 8 bytes may be implemented using + // multiple memory accesses. Therefore, the JIT must not use such stores + // when INITBLK zeroes a struct that contains GC pointers and can be visible + // to other threads (i.e. the struct is not located on the stack). + // Therefore, on x64 the JIT will not use SIMD stores for such structs and + // will allocate a GP register for the node. + // On x86 the JIT will use 8 bytes stores (movq) for structs that are + // larger than 16 bytes. + const bool canUse16BytesSimdMov = !node->MustUsePointerSizeAtomicStores(); + if (!src->isContained()) { srcIntReg = genConsumeReg(src); } else { - // If src is contained then it must be 0 and the size must be a multiple - // of XMM_REGSIZE_BYTES so initialization can use only SSE2 instructions. + // If src is contained then it must be 0. 
assert(src->IsIntegralConst(0)); - assert((node->GetLayout()->GetSize() % XMM_REGSIZE_BYTES) == 0); + +#ifdef TARGET_AMD64 + assert(canUse16BytesSimdMov); + assert(size % 16 == 0); +#else + assert(size >= XMM_REGSIZE_BYTES); + assert(size % 8 == 0); +#endif } emitter* emit = GetEmitter(); - unsigned size = node->GetLayout()->GetSize(); assert(size <= INT32_MAX); assert(dstOffset < (INT32_MAX - static_cast(size))); - // Fill as much as possible using SSE2 stores. - if (size >= XMM_REGSIZE_BYTES) + if (AMD64_ONLY(canUse16BytesSimdMov &&)(size >= XMM_REGSIZE_BYTES)) { regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT); @@ -2791,9 +2807,25 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) #endif } - instruction simdMov = simdUnalignedMovIns(); - for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize) + instruction simdMov = simdUnalignedMovIns(); + unsigned regSize = XMM_REGSIZE_BYTES; + unsigned bytesWritten = 0; + + while (bytesWritten < size) { +#ifdef TARGET_X86 + if (!canUse16BytesSimdMov || (bytesWritten + regSize > size)) + { + simdMov = INS_movq; + regSize = 8; + } +#endif + if (bytesWritten + regSize > size) + { + assert(srcIntReg != REG_NA); + break; + } + if (dstLclNum != BAD_VAR_NUM) { emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset); @@ -2803,11 +2835,12 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale, dstOffset); } + + dstOffset += regSize; + bytesWritten += regSize; } - // TODO-CQ-XArch: On x86 we could initialize 8 byte at once by using MOVQ instead of two 4 byte MOV stores. - // On x64 it may also be worth zero initializing a 4/8 byte remainder using MOVD/MOVQ, that avoids the need - // to allocate a GPR just for the remainder. + size -= bytesWritten; } // Fill the remainder using normal stores. @@ -4604,7 +4637,7 @@ void CodeGen::genCodeForIndexAddr(GenTreeIndexAddr* node) // The VM doesn't allow such large array elements but let's be sure. noway_assert(scale <= INT32_MAX); #else // !TARGET_64BIT - tmpReg = node->GetSingleTempReg(); + tmpReg = node->GetSingleTempReg(); #endif // !TARGET_64BIT GetEmitter()->emitIns_R_I(emitter::inst3opImulForReg(tmpReg), EA_PTRSIZE, indexReg, diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index ef65f7d7aae051..c3a09c6d94e172 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -5686,6 +5686,13 @@ struct GenTreeBlk : public GenTreeIndir bool gtBlkOpGcUnsafe; #endif +#ifdef TARGET_XARCH + bool MustUsePointerSizeAtomicStores() + { + return (m_layout != nullptr) && m_layout->HasGCPtr() && !Addr()->OperIsLocalAddr(); + } +#endif + GenTreeBlk(genTreeOps oper, var_types type, GenTree* addr, ClassLayout* layout) : GenTreeIndir(oper, type, addr, nullptr) , m_layout(layout) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 4d4253006047cd..d4dae28195e08a 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -218,9 +218,14 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) { // If the size is multiple of XMM register size there's no need to load 0 in a GPR, // codegen will use xorps to generate 0 directly in the temporary XMM register. 
- if ((size % XMM_REGSIZE_BYTES) == 0) + if (size >= XMM_REGSIZE_BYTES) { - src->SetContained(); + const bool canUse16BytesSimdMov = !blkNode->MustUsePointerSizeAtomicStores(); + + if (canUse16BytesSimdMov && (size % 16 == 0)X86_ONLY(|| (size % 8 == 0))) + { + src->SetContained(); + } } } #ifdef TARGET_AMD64 diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 1cd81124d05663..ec10bc3c0eb867 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1297,7 +1297,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) switch (blkNode->gtBlkOpKind) { case GenTreeBlk::BlkOpKindUnroll: - if (size >= XMM_REGSIZE_BYTES) + if (AMD64_ONLY(!blkNode->MustUsePointerSizeAtomicStores() &&)(size >= XMM_REGSIZE_BYTES)) { buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); SetContainsAVXFlags(); From cfceb5f6b999f8d2845bf585d59b3eed2df34ea7 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 14 Jun 2021 16:29:07 -0700 Subject: [PATCH 3/3] Address code review --- src/coreclr/jit/codegenxarch.cpp | 31 ++++++++++++++++++------------- src/coreclr/jit/gentree.h | 2 +- src/coreclr/jit/lowerxarch.cpp | 12 +++++++----- src/coreclr/jit/lsraxarch.cpp | 14 +++++++++++--- 4 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e43fbe22a3f800..2bd0142381f628 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2754,15 +2754,22 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) unsigned size = node->GetLayout()->GetSize(); - // An SSE mov that stores data larger than 8 bytes may be implemented using - // multiple memory accesses. Therefore, the JIT must not use such stores - // when INITBLK zeroes a struct that contains GC pointers and can be visible - // to other threads (i.e. the struct is not located on the stack). - // Therefore, on x64 the JIT will not use SIMD stores for such structs and - // will allocate a GP register for the node. - // On x86 the JIT will use 8 bytes stores (movq) for structs that are - // larger than 16 bytes. - const bool canUse16BytesSimdMov = !node->MustUsePointerSizeAtomicStores(); + // An SSE mov that accesses data larger than 8 bytes may be implemented using + // multiple memory accesses. Hence, the JIT must not use such stores when + // INITBLK zeroes a struct that contains GC pointers and can be observed by + // other threads (i.e. when dstAddr is not an address of a local). + // For example, this can happen when initializing a struct field of an object. + const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences(); + +#ifdef TARGET_AMD64 + // On Amd64 the JIT will not use SIMD stores for such structs and instead + // will always allocate a GP register for src node. + const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES); +#else + // On X86 the JIT will use movq for structs that are larger than 16 bytes + // since it is more beneficial than using two mov-s from a GP register. + const bool willUseSimdMov = (size >= 16); +#endif if (!src->isContained()) { @@ -2772,12 +2779,10 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) { // If src is contained then it must be 0. 
assert(src->IsIntegralConst(0)); - + assert(willUseSimdMov); #ifdef TARGET_AMD64 - assert(canUse16BytesSimdMov); assert(size % 16 == 0); #else - assert(size >= XMM_REGSIZE_BYTES); assert(size % 8 == 0); #endif } @@ -2787,7 +2792,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) assert(size <= INT32_MAX); assert(dstOffset < (INT32_MAX - static_cast(size))); - if (AMD64_ONLY(canUse16BytesSimdMov &&)(size >= XMM_REGSIZE_BYTES)) + if (willUseSimdMov) { regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT); diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index c3a09c6d94e172..e2a5045e9069b4 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -5687,7 +5687,7 @@ struct GenTreeBlk : public GenTreeIndir #endif #ifdef TARGET_XARCH - bool MustUsePointerSizeAtomicStores() + bool IsOnHeapAndContainsReferences() { return (m_layout != nullptr) && m_layout->HasGCPtr() && !Addr()->OperIsLocalAddr(); } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index d4dae28195e08a..55bfab94f6f5f8 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -216,13 +216,15 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) if (fill == 0) { - // If the size is multiple of XMM register size there's no need to load 0 in a GPR, - // codegen will use xorps to generate 0 directly in the temporary XMM register. if (size >= XMM_REGSIZE_BYTES) { - const bool canUse16BytesSimdMov = !blkNode->MustUsePointerSizeAtomicStores(); - - if (canUse16BytesSimdMov && (size % 16 == 0)X86_ONLY(|| (size % 8 == 0))) + const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences(); +#ifdef TARGET_AMD64 + const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0); +#else + const bool willUseOnlySimdMov = (size % 8 == 0); +#endif + if (willUseOnlySimdMov) { src->SetContained(); } diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index ec10bc3c0eb867..aa05361d115406 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1069,7 +1069,7 @@ int LinearScan::BuildCall(GenTreeCall* call) // The return value will be on the X87 stack, and we will need to move it. dstCandidates = allRegs(registerType); #else // !TARGET_X86 - dstCandidates = RBM_FLOATRET; + dstCandidates = RBM_FLOATRET; #endif // !TARGET_X86 } else if (registerType == TYP_LONG) @@ -1297,7 +1297,14 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) switch (blkNode->gtBlkOpKind) { case GenTreeBlk::BlkOpKindUnroll: - if (AMD64_ONLY(!blkNode->MustUsePointerSizeAtomicStores() &&)(size >= XMM_REGSIZE_BYTES)) + { +#ifdef TARGET_AMD64 + const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences(); + const bool willUseSimdMov = canUse16BytesSimdMov && (size >= 16); +#else + const bool willUseSimdMov = (size >= 16); +#endif + if (willUseSimdMov) { buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); SetContainsAVXFlags(); @@ -1310,7 +1317,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) srcRegMask = allByteRegs(); } #endif - break; + } + break; case GenTreeBlk::BlkOpKindRepInstr: dstAddrRegMask = RBM_RDI;
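
The rule these changes implement, per the comment added in the codegen hunk above: an SSE store that accesses more than 8 bytes may be split into multiple memory accesses, so zeroing a heap-visible struct that contains object references with 16-byte SIMD stores could let a concurrent GC observe a torn pointer. Pointer-sized stores avoid that, while stack locals (visible to the GC only through the slots the JIT reports) can still be zeroed with wide SIMD stores. Below is a minimal standalone sketch of that decision for the x64 case; the type, field, and function names are illustrative assumptions and are not taken from the JIT sources, though the check mirrors the `IsOnHeapAndContainsReferences`/`willUseSimdMov` logic in the diff.

```cpp
#include <cstddef>

// Illustrative only -- these names are not from the JIT sources.
struct BlockZeroInfo
{
    size_t sizeInBytes;   // number of bytes INITBLK has to zero
    bool   hasGCPointers; // the struct layout contains object references
    bool   isStackLocal;  // the destination address is a local on the stack
};

// On the heap, object references must be written with pointer-sized stores so a
// concurrent GC never reads a partially written pointer; for stack locals, wider
// (16-byte) SIMD stores are safe and are used once the block is large enough.
inline bool CanUse16ByteSimdZeroing(const BlockZeroInfo& info)
{
    const bool heapVisibleWithReferences = info.hasGCPointers && !info.isStackLocal;
    return !heapVisibleWithReferences && (info.sizeInBytes >= 16);
}
```

On x64 this is the split the patch makes: when the check fails, the unroll falls back to pointer-sized `mov [m64], r64` stores from a GP register. On x86 the pointer slots are only 4 bytes, so the patch keeps 8-byte `movq` stores from an XMM register for the GC-tracked case and prefers them over pairs of 4-byte `mov` stores.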