diff --git a/docs/design/coreclr/botr/clr-abi.md b/docs/design/coreclr/botr/clr-abi.md index bda2f1d1d3621d..b1680651465775 100644 --- a/docs/design/coreclr/botr/clr-abi.md +++ b/docs/design/coreclr/botr/clr-abi.md @@ -207,7 +207,7 @@ For non-rude thread abort, the VM walks the stack, running any catch handler tha For example: -``` +```cs try { // try 1 try { // try 2 System.Threading.Thread.CurrentThread.Abort(); @@ -223,7 +223,7 @@ L: In this case, if the address returned in catch 2 corresponding to label L is outside try 1, then the ThreadAbortException re-raised by the VM will not be caught by catch 1, as is expected. The JIT needs to insert a block such that this is the effective code generation: -``` +```cs try { // try 1 try { // try 2 System.Threading.Thread.CurrentThread.Abort(); @@ -240,7 +240,7 @@ L: Similarly, the automatic re-raise address for a ThreadAbortException can't be within a finally handler, or the VM will abort the re-raise and swallow the exception. This can happen due to call-to-finally thunks marked as "cloned finally", as described above. For example (this is pseudo-assembly-code, not C#): -``` +```cs try { // try 1 try { // try 2 System.Threading.Thread.CurrentThread.Abort(); @@ -256,7 +256,7 @@ L: This would generate something like: -``` +```asm // beginning of 'try 1' // beginning of 'try 2' System.Threading.Thread.CurrentThread.Abort(); @@ -281,7 +281,7 @@ Finally1: Note that the JIT must already insert a "step" block so the finally will be called. However, this isn't sufficient to support ThreadAbortException processing, because "L1" is marked as "cloned finally". In this case, the JIT must insert another step block that is within "try 1" but outside the cloned finally block, that will allow for correct re-raise semantics. For example: -``` +```asm // beginning of 'try 1' // beginning of 'try 2' System.Threading.Thread.CurrentThread.Abort(); @@ -399,7 +399,7 @@ To implement this requirement, for any function with EH, we create a frame-local Note that the since a slot on x86 is 4 bytes, the minimum size is 16 bytes. The idea is to have 1 slot for each handler that could be possibly be invoked at the same time. For example, for: -``` +```cs try { ... } catch { @@ -419,7 +419,7 @@ When calling a finally, we set the appropriate level to 0xFC (aka "finally call" Thus, calling a finally from JIT generated code looks like: -``` +```asm mov dword ptr [L_02+0x4 ebp-10H], 0 // This must happen before the 0xFC is written mov dword ptr [L_02+0x8 ebp-0CH], 252 // 0xFC push G_M52300_IG07 @@ -430,7 +430,7 @@ In this case, `G_M52300_IG07` is not the address after the 'jmp', so a simple 'c The code this finally returns to looks like this: -``` +```asm mov dword ptr [L_02+0x8 ebp-0CH], 0 jmp SHORT G_M52300_IG05 ``` @@ -479,7 +479,7 @@ Because a main function body will **always** be on the stack when one of its fun There is one "corner case" in the VM implementation of WantsReportOnlyLeaf model that has implications for the code the JIT is allowed to generate. Consider this function with nested exception handling: -``` +```cs public void runtest() { try { try { @@ -804,3 +804,29 @@ In addition to the usual registers it also preserves all float registers and `rc `CORINFO_HELP_DISPATCH_INDIRECT_CALL` takes the call address in `rax` and it reserves the right to use and trash `r10` and `r11`. The JIT uses the dispatch helper on x64 whenever possible as it is expected that the code size benefits outweighs the less accurate branch prediction. 
 However, note that the use of `r11` in the dispatcher makes it incompatible with VSD calls where the JIT must fall back to the validator and a manual call.
+
+# Notes on Memset/Memcpy
+
+Generally, `memset` and `memcpy` do not provide any guarantees of atomicity. This implies that they should only be used when the memory being modified by `memset`/`memcpy` is not observable by any other thread (including the GC), or when there are no atomicity requirements according to our [Memory Model](../../specs/Memory-model.md). This is especially important when we modify heap memory containing managed pointers - those must be updated atomically, e.g. using a pointer-sized `mov` instruction (managed pointers are always aligned) - see [Atomic Memory Access](../../specs/Memory-model.md#Atomic-memory-accesses). Note that "update" here means "set to zero"; storing a non-null managed pointer additionally requires a write barrier.
+
+Examples:
+
+```cs
+struct MyStruct
+{
+    long a;
+    string b;
+}
+
+void Test1(ref MyStruct m)
+{
+    // We're not allowed to use memset here - the target may be observable
+    // by other threads and by the GC.
+    m = default;
+}
+
+MyStruct Test2()
+{
+    // We can use memset here - the return buffer is not yet observable.
+    return default;
+}
+```
\ No newline at end of file
diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 19a7b901d7c672..22f4e58b094a2c 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -1240,6 +1240,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #ifndef TARGET_X86
     void genCodeForInitBlkHelper(GenTreeBlk* initBlkNode);
 #endif
+    void genCodeForInitBlkLoop(GenTreeBlk* initBlkNode);
     void genCodeForInitBlkRepStos(GenTreeBlk* initBlkNode);
     void genCodeForInitBlkUnroll(GenTreeBlk* initBlkNode);
     void genJumpTable(GenTree* tree);
diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index d19f7ca5657d5a..cb04cf702f2b94 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -3243,6 +3243,72 @@ void CodeGen::genCodeForInitBlkHelper(GenTreeBlk* initBlkNode)
     genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN);
 }
 
+//------------------------------------------------------------------------
+// genCodeForInitBlkLoop - Generate code for an InitBlk using an inlined for-loop.
+//    Used when the size is too big to unroll and we're not allowed to call
+//    memset due to atomicity requirements.
+//
+// Arguments:
+//    initBlkNode - the GT_STORE_BLK node
+//
+void CodeGen::genCodeForInitBlkLoop(GenTreeBlk* initBlkNode)
+{
+    GenTree* const dstNode = initBlkNode->Addr();
+    genConsumeReg(dstNode);
+    const regNumber dstReg = dstNode->GetRegNum();
+
+#ifndef TARGET_ARM64
+    GenTree* const zeroNode = initBlkNode->Data();
+    genConsumeReg(zeroNode);
+    const regNumber zeroReg = zeroNode->GetRegNum();
+#else
+    const regNumber zeroReg = REG_ZR;
+#endif
+
+    if (initBlkNode->IsVolatile())
+    {
+        // issue a full memory barrier before a volatile initBlock operation
+        instGen_MemoryBarrier();
+    }
+
+    //   str  xzr, [dstReg]
+    //   mov  offsetReg, #(size - 8)
+    // .LOOP:
+    //   str  xzr, [dstReg, offsetReg]
+    //   subs offsetReg, offsetReg, #8
+    //   bne  .LOOP
+
+    const unsigned size = initBlkNode->GetLayout()->GetSize();
+    assert((size >= TARGET_POINTER_SIZE) && ((size % TARGET_POINTER_SIZE) == 0));
+
+    // The loop counts down - that makes it smaller. The first pointer-sized slot is
+    // zeroed before the loop (the loop itself never touches it); that store also acts
+    // as a null check, otherwise the first iteration would access
+    // "null + potentially large offset" and hit an AV.
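+    // Each iteration zeroes exactly one pointer-sized, aligned slot with a single store,
+    // so any GC references inside the block stay valid at every point - see the
+    // "Notes on Memset/Memcpy" section in clr-abi.md for why memset cannot be used here.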
+    GetEmitter()->emitIns_R_R(INS_str, EA_PTRSIZE, zeroReg, dstReg);
+    if (size > TARGET_POINTER_SIZE)
+    {
+        // Extend the liveness of dstReg in case the store above was its last use.
+        gcInfo.gcMarkRegPtrVal(dstReg, dstNode->TypeGet());
+
+        const regNumber offsetReg = initBlkNode->GetSingleTempReg();
+        instGen_Set_Reg_To_Imm(EA_PTRSIZE, offsetReg, size - TARGET_POINTER_SIZE);
+
+        BasicBlock* loop = genCreateTempLabel();
+        genDefineTempLabel(loop);
+
+        GetEmitter()->emitIns_R_R_R(INS_str, EA_PTRSIZE, zeroReg, dstReg, offsetReg);
+#ifdef TARGET_ARM64
+        GetEmitter()->emitIns_R_R_I(INS_subs, EA_PTRSIZE, offsetReg, offsetReg, TARGET_POINTER_SIZE);
+#else
+        GetEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, offsetReg, offsetReg, TARGET_POINTER_SIZE, INS_FLAGS_SET);
+#endif
+        inst_JMP(EJ_ne, loop);
+
+        gcInfo.gcMarkRegSetNpt(genRegMask(dstReg));
+    }
+}
+
 //------------------------------------------------------------------------
 // genCall: Produce code for a GT_CALL node
 //
@@ -4513,6 +4579,11 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
             genCodeForCpObj(blkOp->AsBlk());
             break;
 
+        case GenTreeBlk::BlkOpKindLoop:
+            assert(!isCopyBlk);
+            genCodeForInitBlkLoop(blkOp);
+            break;
+
         case GenTreeBlk::BlkOpKindHelper:
             assert(!blkOp->gtBlkOpGcUnsafe);
             if (isCopyBlk)
diff --git a/src/coreclr/jit/codegenloongarch64.cpp b/src/coreclr/jit/codegenloongarch64.cpp
index b97efae40c4ae8..1311ccc082e800 100644
--- a/src/coreclr/jit/codegenloongarch64.cpp
+++ b/src/coreclr/jit/codegenloongarch64.cpp
@@ -6351,6 +6351,54 @@ void CodeGen::genCodeForInitBlkHelper(GenTreeBlk* initBlkNode)
     genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN);
 }
 
+//------------------------------------------------------------------------
+// genCodeForInitBlkLoop - Generate code for an InitBlk using an inlined for-loop.
+//    Used when the size is too big to unroll and we're not allowed to call
+//    memset due to atomicity requirements.
+//
+// Arguments:
+//    initBlkNode - the GT_STORE_BLK node
+//
+void CodeGen::genCodeForInitBlkLoop(GenTreeBlk* initBlkNode)
+{
+    GenTree* const dstNode = initBlkNode->Addr();
+    genConsumeReg(dstNode);
+    const regNumber dstReg = dstNode->GetRegNum();
+
+    if (initBlkNode->IsVolatile())
+    {
+        // issue a full memory barrier before a volatile initBlock operation
+        instGen_MemoryBarrier();
+    }
+
+    const unsigned size = initBlkNode->GetLayout()->GetSize();
+    assert((size >= TARGET_POINTER_SIZE) && ((size % TARGET_POINTER_SIZE) == 0));
+
+    // The loop counts down - that makes it smaller. The first pointer-sized slot is
+    // zeroed before the loop (the loop itself never touches it); that store also acts
+    // as a null check, otherwise the first iteration would access
+    // "null + potentially large offset" and hit an AV.
+    GetEmitter()->emitIns_R_R_I(INS_st_d, EA_PTRSIZE, REG_R0, dstReg, 0);
+    if (size > TARGET_POINTER_SIZE)
+    {
+        // Extend the liveness of dstReg in case the store above was its last use.
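+        // (Without this, the GC would stop tracking dstReg's register after its last
+        // use even though the loop below still dereferences it.)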
+        gcInfo.gcMarkRegPtrVal(dstReg, dstNode->TypeGet());
+
+        const regNumber offsetReg = initBlkNode->GetSingleTempReg();
+        instGen_Set_Reg_To_Imm(EA_PTRSIZE, offsetReg, size - TARGET_POINTER_SIZE);
+
+        // loop begin:
+        //   *(dstReg + offsetReg) = 0
+        GetEmitter()->emitIns_R_R_R(INS_stx_d, EA_PTRSIZE, REG_R0, dstReg, offsetReg);
+        //   offsetReg = offsetReg - 8
+        GetEmitter()->emitIns_R_R_I(INS_addi_d, EA_PTRSIZE, offsetReg, offsetReg, -8);
+        //   if (offsetReg != 0) goto loop;
+        GetEmitter()->emitIns_R_I(INS_bnez, EA_8BYTE, offsetReg, -2 << 2);
+
+        gcInfo.gcMarkRegSetNpt(genRegMask(dstReg));
+    }
+}
+
 // Generate code for a load from some address + offset
 //   base: tree node which can be either a local address or arbitrary node
 //   offset: distance from the base from which to load
@@ -7262,6 +7310,11 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
             genCodeForCpObj(blkOp->AsBlk());
             break;
 
+        case GenTreeBlk::BlkOpKindLoop:
+            assert(!isCopyBlk);
+            genCodeForInitBlkLoop(blkOp);
+            break;
+
         case GenTreeBlk::BlkOpKindHelper:
             if (isCopyBlk)
             {
diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp
index 9b8d1b94dcb43c..7e6f71ec6c460b 100644
--- a/src/coreclr/jit/codegenriscv64.cpp
+++ b/src/coreclr/jit/codegenriscv64.cpp
@@ -6029,6 +6029,61 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
     }
 }
 
+//------------------------------------------------------------------------
+// genCodeForInitBlkLoop - Generate code for an InitBlk using an inlined for-loop.
+//    Used when the size is too big to unroll and we're not allowed to call
+//    memset due to atomicity requirements.
+//
+// Arguments:
+//    initBlkNode - the GT_STORE_BLK node
+//
+void CodeGen::genCodeForInitBlkLoop(GenTreeBlk* initBlkNode)
+{
+    GenTree* const dstNode = initBlkNode->Addr();
+    genConsumeReg(dstNode);
+    const regNumber dstReg = dstNode->GetRegNum();
+
+    if (initBlkNode->IsVolatile())
+    {
+        // issue a full memory barrier before a volatile initBlock operation
+        instGen_MemoryBarrier();
+    }
+
+    const unsigned size = initBlkNode->GetLayout()->GetSize();
+    assert((size >= TARGET_POINTER_SIZE) && ((size % TARGET_POINTER_SIZE) == 0));
+
+    // The loop counts down - that makes it smaller. The first pointer-sized slot is
+    // zeroed before the loop (the loop itself never touches it); that store also acts
+    // as a null check, otherwise the first iteration would access
+    // "null + potentially large offset" and hit an AV.
+    GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, dstReg, 0);
+    if (size > TARGET_POINTER_SIZE)
+    {
+        // Extend the liveness of dstReg in case the store above was its last use.
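+        // (Below, tempReg becomes "dstReg + size - 8" and is walked down to dstReg;
+        // it is not reported to the GC, hence the no-GC region around the loop.)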
+        gcInfo.gcMarkRegPtrVal(dstReg, dstNode->TypeGet());
+
+        const regNumber tempReg = initBlkNode->GetSingleTempReg();
+        instGen_Set_Reg_To_Imm(EA_PTRSIZE, tempReg, size - TARGET_POINTER_SIZE);
+
+        // tempReg = dstReg + tempReg (a new interior pointer, but in a nongc region)
+        GetEmitter()->emitIns_R_R_R(INS_add, EA_PTRSIZE, tempReg, dstReg, tempReg);
+
+        BasicBlock* loop = genCreateTempLabel();
+        genDefineTempLabel(loop);
+        GetEmitter()->emitDisableGC(); // TODO: add gcinfo to tempReg and remove nogc
+
+        // *tempReg = 0
+        GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, tempReg, 0);
+        // tempReg = tempReg - 8
+        GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, tempReg, tempReg, -8);
+        // if (tempReg != dstReg) goto loop;
+        GetEmitter()->emitIns_J(INS_bne, loop, (int)tempReg | ((int)dstReg << 5));
+        GetEmitter()->emitEnableGC();
+
+        gcInfo.gcMarkRegSetNpt(genRegMask(dstReg));
+    }
+}
+
 //------------------------------------------------------------------------
 // genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call
 //
@@ -6918,6 +6973,11 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
             genCodeForCpObj(blkOp->AsBlk());
             break;
 
+        case GenTreeBlk::BlkOpKindLoop:
+            assert(!isCopyBlk);
+            genCodeForInitBlkLoop(blkOp);
+            break;
+
         case GenTreeBlk::BlkOpKindHelper:
             if (isCopyBlk)
             {
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 43553c42018482..4cc21b385fce07 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -3035,6 +3035,11 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
             genCodeForCpObj(storeBlkNode->AsBlk());
             break;
 
+        case GenTreeBlk::BlkOpKindLoop:
+            assert(!isCopyBlk);
+            genCodeForInitBlkLoop(storeBlkNode);
+            break;
+
 #ifdef TARGET_AMD64
         case GenTreeBlk::BlkOpKindHelper:
             assert(!storeBlkNode->gtBlkOpGcUnsafe);
@@ -3313,6 +3318,60 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 #endif
 }
 
+//------------------------------------------------------------------------
+// genCodeForInitBlkLoop - Generate code for an InitBlk using an inlined for-loop.
+//    Used when the size is too big to unroll and we're not allowed to call
+//    memset due to atomicity requirements.
+//
+// Arguments:
+//    initBlkNode - the GT_STORE_BLK node
+//
+void CodeGen::genCodeForInitBlkLoop(GenTreeBlk* initBlkNode)
+{
+    GenTree* const dstNode  = initBlkNode->Addr();
+    GenTree* const zeroNode = initBlkNode->Data();
+
+    genConsumeReg(dstNode);
+    genConsumeReg(zeroNode);
+
+    const regNumber dstReg  = dstNode->GetRegNum();
+    const regNumber zeroReg = zeroNode->GetRegNum();
+
+    //   xor  zeroReg, zeroReg
+    //   mov  qword ptr [dstReg], zeroReg
+    //   mov  offsetReg, #(size - 8)
+    // .LOOP:
+    //   mov  qword ptr [dstReg + offsetReg], zeroReg
+    //   sub  offsetReg, 8
+    //   jne  .LOOP
+
+    const unsigned size = initBlkNode->GetLayout()->GetSize();
+    assert((size >= TARGET_POINTER_SIZE) && ((size % TARGET_POINTER_SIZE) == 0));
+
+    // The loop counts down - that makes it smaller. The first pointer-sized slot is
+    // zeroed before the loop (the loop itself never touches it); that store also acts
+    // as a null check, otherwise the first iteration would access
+    // "null + potentially large offset" and hit an AV.
+    GetEmitter()->emitIns_AR_R(INS_mov, EA_PTRSIZE, zeroReg, dstReg, 0);
+    if (size > TARGET_POINTER_SIZE)
+    {
+        // Extend the liveness of dstReg in case the store above was its last use.
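+        // (offsetReg counts down from size - 8 to 0; the loop exits once SUB sets ZF.)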
+ gcInfo.gcMarkRegPtrVal(dstReg, dstNode->TypeGet()); + + const regNumber offsetReg = initBlkNode->GetSingleTempReg(); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, offsetReg, size - TARGET_POINTER_SIZE); + + BasicBlock* loop = genCreateTempLabel(); + genDefineTempLabel(loop); + + GetEmitter()->emitIns_ARX_R(INS_mov, EA_PTRSIZE, zeroReg, dstReg, offsetReg, 1, 0); + GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, offsetReg, TARGET_POINTER_SIZE); + inst_JMP(EJ_jne, loop); + + gcInfo.gcMarkRegSetNpt(genRegMask(dstReg)); + } +} + #ifdef TARGET_AMD64 //------------------------------------------------------------------------ // genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index ec7ece185b9873..646137496cfbef 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10396,6 +10396,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX STRESS_MODE(IF_CONVERSION_COST) \ STRESS_MODE(IF_CONVERSION_INNER_LOOPS) \ STRESS_MODE(POISON_IMPLICIT_BYREFS) \ + STRESS_MODE(STORE_BLOCK_UNROLLING) \ STRESS_MODE(COUNT) enum compStressArea diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 712591868c30bf..c10bfe58500e4c 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -12486,6 +12486,11 @@ void Compiler::gtDispTree(GenTree* tree, printf(" (Helper)"); break; #endif + + case GenTreeBlk::BlkOpKindLoop: + printf(" (Loop)"); + break; + default: unreached(); } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 6f47496c201a27..fd0406270db04c 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -7330,6 +7330,7 @@ struct GenTreeBlk : public GenTreeIndir #ifdef TARGET_XARCH BlkOpKindRepInstr, #endif + BlkOpKindLoop, BlkOpKindUnroll, BlkOpKindUnrollMemmove, } gtBlkOpKind; @@ -7338,12 +7339,20 @@ struct GenTreeBlk : public GenTreeIndir bool gtBlkOpGcUnsafe; #endif -#ifdef TARGET_XARCH + bool ContainsReferences() + { + return (m_layout != nullptr) && m_layout->HasGCPtr(); + } + bool IsOnHeapAndContainsReferences() { - return (m_layout != nullptr) && m_layout->HasGCPtr() && !Addr()->OperIs(GT_LCL_ADDR); + return ContainsReferences() && !Addr()->OperIs(GT_LCL_ADDR); + } + + bool IsZeroingGcPointersOnHeap() + { + return OperIs(GT_STORE_BLK) && Data()->IsIntegralConst(0) && IsOnHeapAndContainsReferences(); } -#endif GenTreeBlk(genTreeOps oper, var_types type, GenTree* addr, ClassLayout* layout) : GenTreeIndir(oper, type, addr, nullptr) @@ -7354,6 +7363,10 @@ struct GenTreeBlk : public GenTreeIndir GenTreeBlk(genTreeOps oper, var_types type, GenTree* addr, GenTree* data, ClassLayout* layout) : GenTreeIndir(oper, type, addr, data) { + if (data->IsIntegralConst(0)) + { + data->gtFlags |= GTF_DONT_CSE; + } Initialize(layout); } diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 3263d8c4ba308a..72f18320642be6 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -8865,6 +8865,15 @@ void Lowering::LowerBlockStoreCommon(GenTreeBlk* blkNode) { assert(blkNode->OperIs(GT_STORE_BLK, GT_STORE_DYN_BLK)); + if (blkNode->ContainsReferences() && !blkNode->OperIsCopyBlkOp()) + { + // Make sure we don't use GT_STORE_DYN_BLK + assert(blkNode->OperIs(GT_STORE_BLK)); + + // and we only zero it (and that zero is better to be not hoisted/CSE'd) + assert(blkNode->Data()->IsIntegralConst(0)); + } + // Lose the type information stored in the source - we no 
longer need it. if (blkNode->Data()->OperIs(GT_BLK)) { diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 421a46a12ad550..20eea0e31e7a22 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -564,6 +564,16 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) if (blkNode->OperIsInitBlkOp()) { +#ifdef DEBUG + // Use BlkOpKindLoop for more cases under stress mode + if (comp->compStressCompile(Compiler::STRESS_STORE_BLOCK_UNROLLING, 50) && blkNode->OperIs(GT_STORE_BLK) && + ((blkNode->GetLayout()->GetSize() % TARGET_POINTER_SIZE) == 0) && src->IsIntegralConst(0)) + { + blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindLoop; + return; + } +#endif + if (src->OperIs(GT_INIT_VAL)) { src->SetContained(); @@ -608,6 +618,15 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr); } + else if (blkNode->IsZeroingGcPointersOnHeap()) + { + blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindLoop; +#ifdef TARGET_ARM64 + // On ARM64 we can just use REG_ZR instead of having to load + // the constant into a real register like on ARM32. + src->SetContained(); +#endif + } else { blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; diff --git a/src/coreclr/jit/lowerloongarch64.cpp b/src/coreclr/jit/lowerloongarch64.cpp index 8ddccb2ae800a4..5110442eda10d1 100644 --- a/src/coreclr/jit/lowerloongarch64.cpp +++ b/src/coreclr/jit/lowerloongarch64.cpp @@ -326,6 +326,12 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr); } + else if (blkNode->IsZeroingGcPointersOnHeap()) + { + blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindLoop; + // We're going to use REG_R0 for zero + src->SetContained(); + } else { blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; diff --git a/src/coreclr/jit/lowerriscv64.cpp b/src/coreclr/jit/lowerriscv64.cpp index 3674cd2075a394..4f60458fd25161 100644 --- a/src/coreclr/jit/lowerriscv64.cpp +++ b/src/coreclr/jit/lowerriscv64.cpp @@ -274,6 +274,12 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr); } + else if (blkNode->IsZeroingGcPointersOnHeap()) + { + blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindLoop; + // We're going to use REG_R0 for zero + src->SetContained(); + } else { blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index ee5a6a1eabce67..31e2a55e1b4862 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -332,6 +332,16 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) if (blkNode->OperIsInitBlkOp()) { +#ifdef DEBUG + // Use BlkOpKindLoop for more cases under stress mode + if (comp->compStressCompile(Compiler::STRESS_STORE_BLOCK_UNROLLING, 50) && blkNode->OperIs(GT_STORE_BLK) && + ((blkNode->GetLayout()->GetSize() % TARGET_POINTER_SIZE) == 0) && src->IsIntegralConst(0)) + { + blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindLoop; + return; + } +#endif + if (src->OperIs(GT_INIT_VAL)) { src->SetContained(); @@ -397,10 +407,12 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) { TOO_BIG_TO_UNROLL: #ifdef TARGET_AMD64 - blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; + blkNode->gtBlkOpKind = + blkNode->IsZeroingGcPointersOnHeap() ? 
GenTreeBlk::BlkOpKindLoop : GenTreeBlk::BlkOpKindHelper; #else // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86 - blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; + blkNode->gtBlkOpKind = + blkNode->IsZeroingGcPointersOnHeap() ? GenTreeBlk::BlkOpKindLoop : GenTreeBlk::BlkOpKindRepInstr; #endif } } diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index cbdeebb7814a53..8b1305caec52ae 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -624,6 +624,11 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) #endif // TARGET_ARM64 break; + case GenTreeBlk::BlkOpKindLoop: + // Needed for offsetReg + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + break; + case GenTreeBlk::BlkOpKindHelper: assert(!src->isContained()); dstAddrRegMask = RBM_ARG_0; diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index f3aaba0847f689..824555ed1840e9 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -968,6 +968,7 @@ regMaskTP LinearScan::getKillSetForBlockStore(GenTreeBlk* blkNode) #endif case GenTreeBlk::BlkOpKindUnrollMemmove: case GenTreeBlk::BlkOpKindUnroll: + case GenTreeBlk::BlkOpKindLoop: case GenTreeBlk::BlkOpKindInvalid: // for these 'gtBlkOpKind' kinds, we leave 'killMask' = RBM_NONE break; diff --git a/src/coreclr/jit/lsraloongarch64.cpp b/src/coreclr/jit/lsraloongarch64.cpp index 418ba90d9479b5..67b27aa51300c8 100644 --- a/src/coreclr/jit/lsraloongarch64.cpp +++ b/src/coreclr/jit/lsraloongarch64.cpp @@ -1099,6 +1099,11 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) } break; + case GenTreeBlk::BlkOpKindLoop: + // Needed for offsetReg + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + break; + case GenTreeBlk::BlkOpKindHelper: assert(!src->isContained()); dstAddrRegMask = RBM_ARG_0; diff --git a/src/coreclr/jit/lsrariscv64.cpp b/src/coreclr/jit/lsrariscv64.cpp index ebad7b7b7bbc97..cc4351a5a254eb 100644 --- a/src/coreclr/jit/lsrariscv64.cpp +++ b/src/coreclr/jit/lsrariscv64.cpp @@ -1247,6 +1247,11 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) } break; + case GenTreeBlk::BlkOpKindLoop: + // Needed for tempReg + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + break; + case GenTreeBlk::BlkOpKindHelper: assert(!src->isContained()); dstAddrRegMask = RBM_ARG_0; diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 322b6e4ef02c33..4b2a38ea130e36 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1394,6 +1394,11 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) sizeRegMask = RBM_RCX; break; + case GenTreeBlk::BlkOpKindLoop: + // Needed for offsetReg + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + break; + #ifdef TARGET_AMD64 case GenTreeBlk::BlkOpKindHelper: dstAddrRegMask = RBM_ARG_0; diff --git a/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.cs b/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.cs new file mode 100644 index 00000000000000..0a8719a2430a3b --- /dev/null +++ b/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.cs @@ -0,0 +1,56 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
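+//
+// Verifies that zeroing large structs containing GC references does not go through the
+// (non-atomic) CORINFO_HELP_MEMSET helper - the X64-NOT/ARM64-NOT disasm checks below
+// assert that the helper is never called for these stores.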
+ +using System; +using System.Runtime.CompilerServices; +using Xunit; + +public class StructWithGC_Zeroing +{ + [Fact] + public static void StructZeroingShouldNotUseMemset() + { + LargeStructWithGC ls1 = default; + ls1.str = "hello1"; + ls1.z2 = long.MaxValue; + ZeroIt(ref ls1); + + LargeStructWithGC2 ls2 = default; + ls2.str = "hello2"; + ls2.z1 = long.MinValue; + ZeroIt2(ref ls2); + + if (ls1.str != null || ls2.str != null || ls1.z2 != 0 || ls2.z1 != 0) + throw new InvalidOperationException("should be zeroed"); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ZeroIt(ref LargeStructWithGC s) + { + // X64-NOT: CORINFO_HELP_MEMSET + // ARM64-NOT: CORINFO_HELP_MEMSET + s = default; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ZeroIt2(ref LargeStructWithGC2 s) + { + // X64-NOT: CORINFO_HELP_MEMSET + // ARM64-NOT: CORINFO_HELP_MEMSET + s = default; + } + + struct LargeStructWithGC // 360 bytes (64-bit) + { + public string str; + public long b1, c1, d1, e1, f1, g1, h1, i1, j1, k1, l1, m1, n1, o1, p1, r1, s1, t1, u1, v1, w1, z1; + public long b2, c2, d2, e2, f2, g2, h2, i2, j2, k2, l2, m2, n2, o2, p2, r2, s2, t2, u2, v2, w2, z2; + } + + unsafe struct LargeStructWithGC2 // 4184 bytes (64-bit) + { + public fixed byte data[4000]; + public string str; + public long b1, c1, d1, e1, f1, g1, h1, i1, j1, k1, l1, m1, n1, o1, p1, r1, s1, t1, u1, v1, w1, z1; + } +} diff --git a/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.csproj b/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.csproj new file mode 100644 index 00000000000000..7aa59749804e49 --- /dev/null +++ b/src/tests/JIT/opt/Structs/StructWithGC_Zeroing.csproj @@ -0,0 +1,13 @@ + + + true + true + None + True + + + + true + + +