From 6c9a07a209aa036f158f5f57d4e5d9fa679a8f24 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Thu, 27 May 2021 11:12:40 -0700 Subject: [PATCH 1/3] Construct ASG(OBJ(addr), 0) for structs that have GC fields and ASG(BLK(addr), 0) for other types. This allows the JIT to maintain information about the class layout. --- src/coreclr/jit/importer.cpp | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index f14d8d0cd40d8b..fc8544f9138ba6 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -16286,11 +16286,25 @@ void Compiler::impImportBlockCode(BasicBlock* block) "type operand incompatible with type of address"); } - size = info.compCompHnd->getClassSize(resolvedToken.hClass); // Size - op2 = gtNewIconNode(0); // Value - op1 = impPopStack().val; // Dest - op1 = gtNewBlockVal(op1, size); - op1 = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false); + op2 = gtNewIconNode(0); // Value + op1 = impPopStack().val; // Dest + + if (eeIsValueClass(resolvedToken.hClass)) + { + op1 = gtNewStructVal(resolvedToken.hClass, op1); + if (op1->OperIs(GT_OBJ)) + { + gtSetObjGcInfo(op1->AsObj()); + } + } + else + { + size = info.compCompHnd->getClassSize(resolvedToken.hClass); + assert(size == TARGET_POINTER_SIZE); + op1 = gtNewBlockVal(op1, size); + } + + op1 = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false); goto SPILL_APPEND; case CEE_INITBLK: From d9c78b63e505533605502c9ffb6b5a019bb105b7 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Fri, 28 May 2021 14:04:18 -0700 Subject: [PATCH 2/3] X64: Always use a sequence of mov [m64],r64 when a struct has GC fields and is not guaranteed to be on the stack X86: 1) Use movq [m64],xmm to zero 8-bytes on the stack instead of mov [m32],r32; mov [m32+4],r32 2) Do not use movdqu when a struct has GC fields and is not guaranteed to be on the stack --- src/coreclr/jit/codegenxarch.cpp | 57 +++++++++++++++++++++++++------- src/coreclr/jit/gentree.h | 7 ++++ src/coreclr/jit/lowerxarch.cpp | 9 +++-- src/coreclr/jit/lsraxarch.cpp | 2 +- 4 files changed, 60 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 7fd371e103e194..e43fbe22a3f800 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2752,26 +2752,42 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) src = src->AsUnOp()->gtGetOp1(); } + unsigned size = node->GetLayout()->GetSize(); + + // An SSE mov that stores data larger than 8 bytes may be implemented using + // multiple memory accesses. Therefore, the JIT must not use such stores + // when INITBLK zeroes a struct that contains GC pointers and can be visible + // to other threads (i.e. the struct is not located on the stack). + // Therefore, on x64 the JIT will not use SIMD stores for such structs and + // will allocate a GP register for the node. + // On x86 the JIT will use 8 bytes stores (movq) for structs that are + // larger than 16 bytes. + const bool canUse16BytesSimdMov = !node->MustUsePointerSizeAtomicStores(); + if (!src->isContained()) { srcIntReg = genConsumeReg(src); } else { - // If src is contained then it must be 0 and the size must be a multiple - // of XMM_REGSIZE_BYTES so initialization can use only SSE2 instructions. + // If src is contained then it must be 0. 
assert(src->IsIntegralConst(0)); - assert((node->GetLayout()->GetSize() % XMM_REGSIZE_BYTES) == 0); + +#ifdef TARGET_AMD64 + assert(canUse16BytesSimdMov); + assert(size % 16 == 0); +#else + assert(size >= XMM_REGSIZE_BYTES); + assert(size % 8 == 0); +#endif } emitter* emit = GetEmitter(); - unsigned size = node->GetLayout()->GetSize(); assert(size <= INT32_MAX); assert(dstOffset < (INT32_MAX - static_cast(size))); - // Fill as much as possible using SSE2 stores. - if (size >= XMM_REGSIZE_BYTES) + if (AMD64_ONLY(canUse16BytesSimdMov &&)(size >= XMM_REGSIZE_BYTES)) { regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT); @@ -2791,9 +2807,25 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) #endif } - instruction simdMov = simdUnalignedMovIns(); - for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize) + instruction simdMov = simdUnalignedMovIns(); + unsigned regSize = XMM_REGSIZE_BYTES; + unsigned bytesWritten = 0; + + while (bytesWritten < size) { +#ifdef TARGET_X86 + if (!canUse16BytesSimdMov || (bytesWritten + regSize > size)) + { + simdMov = INS_movq; + regSize = 8; + } +#endif + if (bytesWritten + regSize > size) + { + assert(srcIntReg != REG_NA); + break; + } + if (dstLclNum != BAD_VAR_NUM) { emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset); @@ -2803,11 +2835,12 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale, dstOffset); } + + dstOffset += regSize; + bytesWritten += regSize; } - // TODO-CQ-XArch: On x86 we could initialize 8 byte at once by using MOVQ instead of two 4 byte MOV stores. - // On x64 it may also be worth zero initializing a 4/8 byte remainder using MOVD/MOVQ, that avoids the need - // to allocate a GPR just for the remainder. + size -= bytesWritten; } // Fill the remainder using normal stores. @@ -4604,7 +4637,7 @@ void CodeGen::genCodeForIndexAddr(GenTreeIndexAddr* node) // The VM doesn't allow such large array elements but let's be sure. noway_assert(scale <= INT32_MAX); #else // !TARGET_64BIT - tmpReg = node->GetSingleTempReg(); + tmpReg = node->GetSingleTempReg(); #endif // !TARGET_64BIT GetEmitter()->emitIns_R_I(emitter::inst3opImulForReg(tmpReg), EA_PTRSIZE, indexReg, diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index ef65f7d7aae051..c3a09c6d94e172 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -5686,6 +5686,13 @@ struct GenTreeBlk : public GenTreeIndir bool gtBlkOpGcUnsafe; #endif +#ifdef TARGET_XARCH + bool MustUsePointerSizeAtomicStores() + { + return (m_layout != nullptr) && m_layout->HasGCPtr() && !Addr()->OperIsLocalAddr(); + } +#endif + GenTreeBlk(genTreeOps oper, var_types type, GenTree* addr, ClassLayout* layout) : GenTreeIndir(oper, type, addr, nullptr) , m_layout(layout) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 4d4253006047cd..d4dae28195e08a 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -218,9 +218,14 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) { // If the size is multiple of XMM register size there's no need to load 0 in a GPR, // codegen will use xorps to generate 0 directly in the temporary XMM register. 
- if ((size % XMM_REGSIZE_BYTES) == 0) + if (size >= XMM_REGSIZE_BYTES) { - src->SetContained(); + const bool canUse16BytesSimdMov = !blkNode->MustUsePointerSizeAtomicStores(); + + if (canUse16BytesSimdMov && (size % 16 == 0)X86_ONLY(|| (size % 8 == 0))) + { + src->SetContained(); + } } } #ifdef TARGET_AMD64 diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 1cd81124d05663..ec10bc3c0eb867 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1297,7 +1297,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) switch (blkNode->gtBlkOpKind) { case GenTreeBlk::BlkOpKindUnroll: - if (size >= XMM_REGSIZE_BYTES) + if (AMD64_ONLY(!blkNode->MustUsePointerSizeAtomicStores() &&)(size >= XMM_REGSIZE_BYTES)) { buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); SetContainsAVXFlags(); From cfceb5f6b999f8d2845bf585d59b3eed2df34ea7 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 14 Jun 2021 16:29:07 -0700 Subject: [PATCH 3/3] Address code review --- src/coreclr/jit/codegenxarch.cpp | 31 ++++++++++++++++++------------- src/coreclr/jit/gentree.h | 2 +- src/coreclr/jit/lowerxarch.cpp | 12 +++++++----- src/coreclr/jit/lsraxarch.cpp | 14 +++++++++++--- 4 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e43fbe22a3f800..2bd0142381f628 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2754,15 +2754,22 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) unsigned size = node->GetLayout()->GetSize(); - // An SSE mov that stores data larger than 8 bytes may be implemented using - // multiple memory accesses. Therefore, the JIT must not use such stores - // when INITBLK zeroes a struct that contains GC pointers and can be visible - // to other threads (i.e. the struct is not located on the stack). - // Therefore, on x64 the JIT will not use SIMD stores for such structs and - // will allocate a GP register for the node. - // On x86 the JIT will use 8 bytes stores (movq) for structs that are - // larger than 16 bytes. - const bool canUse16BytesSimdMov = !node->MustUsePointerSizeAtomicStores(); + // An SSE mov that accesses data larger than 8 bytes may be implemented using + // multiple memory accesses. Hence, the JIT must not use such stores when + // INITBLK zeroes a struct that contains GC pointers and can be observed by + // other threads (i.e. when dstAddr is not an address of a local). + // For example, this can happen when initializing a struct field of an object. + const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences(); + +#ifdef TARGET_AMD64 + // On Amd64 the JIT will not use SIMD stores for such structs and instead + // will always allocate a GP register for src node. + const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES); +#else + // On X86 the JIT will use movq for structs that are larger than 16 bytes + // since it is more beneficial than using two mov-s from a GP register. + const bool willUseSimdMov = (size >= 16); +#endif if (!src->isContained()) { @@ -2772,12 +2779,10 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) { // If src is contained then it must be 0. 
assert(src->IsIntegralConst(0)); - + assert(willUseSimdMov); #ifdef TARGET_AMD64 - assert(canUse16BytesSimdMov); assert(size % 16 == 0); #else - assert(size >= XMM_REGSIZE_BYTES); assert(size % 8 == 0); #endif } @@ -2787,7 +2792,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) assert(size <= INT32_MAX); assert(dstOffset < (INT32_MAX - static_cast(size))); - if (AMD64_ONLY(canUse16BytesSimdMov &&)(size >= XMM_REGSIZE_BYTES)) + if (willUseSimdMov) { regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT); diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index c3a09c6d94e172..e2a5045e9069b4 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -5687,7 +5687,7 @@ struct GenTreeBlk : public GenTreeIndir #endif #ifdef TARGET_XARCH - bool MustUsePointerSizeAtomicStores() + bool IsOnHeapAndContainsReferences() { return (m_layout != nullptr) && m_layout->HasGCPtr() && !Addr()->OperIsLocalAddr(); } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index d4dae28195e08a..55bfab94f6f5f8 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -216,13 +216,15 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) if (fill == 0) { - // If the size is multiple of XMM register size there's no need to load 0 in a GPR, - // codegen will use xorps to generate 0 directly in the temporary XMM register. if (size >= XMM_REGSIZE_BYTES) { - const bool canUse16BytesSimdMov = !blkNode->MustUsePointerSizeAtomicStores(); - - if (canUse16BytesSimdMov && (size % 16 == 0)X86_ONLY(|| (size % 8 == 0))) + const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences(); +#ifdef TARGET_AMD64 + const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0); +#else + const bool willUseOnlySimdMov = (size % 8 == 0); +#endif + if (willUseOnlySimdMov) { src->SetContained(); } diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index ec10bc3c0eb867..aa05361d115406 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1069,7 +1069,7 @@ int LinearScan::BuildCall(GenTreeCall* call) // The return value will be on the X87 stack, and we will need to move it. dstCandidates = allRegs(registerType); #else // !TARGET_X86 - dstCandidates = RBM_FLOATRET; + dstCandidates = RBM_FLOATRET; #endif // !TARGET_X86 } else if (registerType == TYP_LONG) @@ -1297,7 +1297,14 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) switch (blkNode->gtBlkOpKind) { case GenTreeBlk::BlkOpKindUnroll: - if (AMD64_ONLY(!blkNode->MustUsePointerSizeAtomicStores() &&)(size >= XMM_REGSIZE_BYTES)) + { +#ifdef TARGET_AMD64 + const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences(); + const bool willUseSimdMov = canUse16BytesSimdMov && (size >= 16); +#else + const bool willUseSimdMov = (size >= 16); +#endif + if (willUseSimdMov) { buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); SetContainsAVXFlags(); @@ -1310,7 +1317,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) srcRegMask = allByteRegs(); } #endif - break; + } + break; case GenTreeBlk::BlkOpKindRepInstr: dstAddrRegMask = RBM_RDI;
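
The rule these changes implement, per the comment added in the codegen hunk above: an SSE store that accesses more than 8 bytes may be split into multiple memory accesses, so zeroing a heap-visible struct that contains object references with 16-byte SIMD stores could let a concurrent GC observe a torn pointer. Pointer-sized stores avoid that, while stack locals (visible to the GC only through the slots the JIT reports) can still be zeroed with wide SIMD stores. Below is a minimal standalone sketch of that decision for the x64 case; the type, field, and function names are illustrative assumptions and are not taken from the JIT sources, though the check mirrors the `IsOnHeapAndContainsReferences`/`willUseSimdMov` logic in the diff.

```cpp
#include <cstddef>

// Illustrative only -- these names are not from the JIT sources.
struct BlockZeroInfo
{
    size_t sizeInBytes;   // number of bytes INITBLK has to zero
    bool   hasGCPointers; // the struct layout contains object references
    bool   isStackLocal;  // the destination address is a local on the stack
};

// On the heap, object references must be written with pointer-sized stores so a
// concurrent GC never reads a partially written pointer; for stack locals, wider
// (16-byte) SIMD stores are safe and are used once the block is large enough.
inline bool CanUse16ByteSimdZeroing(const BlockZeroInfo& info)
{
    const bool heapVisibleWithReferences = info.hasGCPointers && !info.isStackLocal;
    return !heapVisibleWithReferences && (info.sizeInBytes >= 16);
}
```

On x64 this is the split the patch makes: when the check fails, the unroll falls back to pointer-sized `mov [m64], r64` stores from a GP register. On x86 the pointer slots are only 4 bytes, so the patch keeps 8-byte `movq` stores from an XMM register for the GC-tracked case and prefers them over pairs of 4-byte `mov` stores.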