From 21ab197b597bc5d707e40a2bc9777bf39ee5ba43 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Fri, 28 Jan 2022 15:43:20 -0800 Subject: [PATCH 1/5] Add LCLHEAP_UNROLL_LIMIT in src/coreclr/jit/targetarm64.h --- src/coreclr/jit/targetarm64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h index 4cc6b63f73009f..a4ef3a96782f72 100644 --- a/src/coreclr/jit/targetarm64.h +++ b/src/coreclr/jit/targetarm64.h @@ -15,6 +15,7 @@ #define CPBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack) #define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk #define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack) + #define LCLHEAP_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required) #ifdef FEATURE_SIMD #define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned From b4ce7949e83e091e2929c079fd55508f4fc13038 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Fri, 28 Jan 2022 15:49:51 -0800 Subject: [PATCH 2/5] Couple small optimizations for genLclHeap: 1. When not required to zero the allocated space for local heap (for sizes up to 64 bytes) do not zero. 2. For sizes less than one PAGE_SIZE and when the size is an encodable offset use ldp tmpReg, xzr, [sp], #-amount that does probing at [sp] and allocates the space at the same time. 3. Allow non-loop zeroing (i.e. unrolled sequence) for sizes up to 128 bytes (i.e. up to LCLHEAP_UNROLL_LIMIT) 4. Do such zeroing in ascending order of effective address. 
--- src/coreclr/jit/codegenarm64.cpp | 66 ++++++++++++++++++++++++-------- src/coreclr/jit/lsraarm64.cpp | 13 +++---- 2 files changed, 54 insertions(+), 25 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 595e2a232e5415..08542f6ce2ccd1 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2311,34 +2311,66 @@ void CodeGen::genLclHeap(GenTree* tree) // We should reach here only for non-zero, constant size allocations. assert(amount > 0); + const int storePairRegsWritesBytes = 2 * REGSIZE_BYTES; + // For small allocations we will generate up to four stp instructions, to zero 16 to 64 bytes. - static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2)); - assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time - size_t stpCount = amount / (REGSIZE_BYTES * 2); - if (stpCount <= 4) + static_assert_no_msg(STACK_ALIGN == storePairRegsWritesBytes); + assert(amount % storePairRegsWritesBytes == 0); // stp stores two registers at a time + + if (compiler->info.compInitMem) { - while (stpCount != 0) + if (amount <= LCLHEAP_UNROLL_LIMIT) { - // We can use pre-indexed addressing. - // stp ZR, ZR, [SP, #-16]! // STACK_ALIGN is 16 - GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX); - stpCount -= 1; - } + // The following zeroes the last 16 bytes and probes the page containing [sp, #16] address. + // stp xzr, xzr, [sp, #-16]! + GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -storePairRegsWritesBytes, + INS_OPTS_PRE_INDEX); - lastTouchDelta = 0; + if (amount > storePairRegsWritesBytes) + { + // The following sets SP to its final value and zeroes the first 16 bytes of the allocated space. + // stp xzr, xzr, [sp, #-amount+16]! 
+ const ssize_t finalSpDelta = (ssize_t)amount - storePairRegsWritesBytes; + GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -finalSpDelta, + INS_OPTS_PRE_INDEX); + + // The following zeroes the remaining space in [finalSp+16, initialSp-16) interval + // using a sequence of stp instruction with unsigned offset. + for (ssize_t offset = storePairRegsWritesBytes; offset < (ssize_t)amount - storePairRegsWritesBytes; + offset += storePairRegsWritesBytes) + { + // stp xzr, xzr, [sp, #offset] + GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, offset); + } + } - goto ALLOC_DONE; + lastTouchDelta = 0; + + goto ALLOC_DONE; + } } - else if (!compiler->info.compInitMem && (amount < compiler->eeGetPageSize())) // must be < not <= + else if (amount < compiler->eeGetPageSize()) // must be < not <= { // Since the size is less than a page, simply adjust the SP value. // The SP might already be in the guard page, so we must touch it BEFORE // the alloc, not after. - // ldr wz, [SP, #0] - GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SP, 0); - - genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg()); + if (emitter::canEncodeLoadOrStorePairOffset(amount, EA_8BYTE)) + { + // The following probes the page and allocates the local heap. + // ldp tmpReg, xzr, [sp], #-amount + // Note that behaviour of ldp where two source registers are the same is unpredictable. 
+                const regNumber tmpReg = targetReg; +                GetEmitter()->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, REG_ZR, REG_SPBASE, -(ssize_t)amount, +                                            INS_OPTS_POST_INDEX); +            } +            else +            { +                // ldr wzr, [sp] +                // sub sp, sp, #amount +                GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, amount); +                genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg()); +            } lastTouchDelta = amount; diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index f18099f4f573e5..3091700df42226 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -543,14 +543,14 @@ int LinearScan::BuildNode(GenTree* tree) { assert(dstCount == 1); - // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp): + // Need a variable number of temp regs (see genLclHeap() in codegenarm64.cpp): // Here '-' means don't care. // // Size? Init Memory? # temp regs // 0 - 0 - // const and <=6 ptr words - 0 + // const and <=UnrollLimit - 0 // const and 6 ptr words Yes 0 + // >UnrollLimit Yes 0 // Non-const Yes 0 // Non-const No 2 // @@ -569,12 +569,9 @@ int LinearScan::BuildNode(GenTree* tree) // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size. // This should also help in debugging as we can examine the original size specified with // localloc. - sizeVal = AlignUp(sizeVal, STACK_ALIGN); - size_t stpCount = sizeVal / (REGSIZE_BYTES * 2); + sizeVal = AlignUp(sizeVal, STACK_ALIGN); - // For small allocations up to 4 'stp' instructions (i.e. 
16 to 64 bytes of localloc) - // - if (stpCount <= 4) + if (sizeVal <= LCLHEAP_UNROLL_LIMIT) { // Need no internal registers } From 36e1fe5c5d9e70ce5e392a04f9edf112dd50bb37 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Sat, 5 Feb 2022 17:39:23 -0800 Subject: [PATCH 3/5] Implement optimization for lclHeap sizes that fit into ldr (immediate) post-index range and fix an error in src/coreclr/jit/codegenarm64.cpp --- src/coreclr/jit/codegenarm64.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 08542f6ce2ccd1..616710d68c4e3d 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2355,7 +2355,16 @@ void CodeGen::genLclHeap(GenTree* tree) // The SP might already be in the guard page, so we must touch it BEFORE // the alloc, not after. - if (emitter::canEncodeLoadOrStorePairOffset(amount, EA_8BYTE)) + // Note that we check against the lower boundary of the post-index immediate range [-256, 256) + // since the offset is -amount. + const bool canEncodeLoadRegPostIndexOffset = amount <= 256; + + if (canEncodeLoadRegPostIndexOffset) + { + GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, -(ssize_t)amount, + INS_OPTS_POST_INDEX); + } + else if (emitter::canEncodeLoadOrStorePairOffset(-(ssize_t)amount, EA_8BYTE)) { // The following probes the page and allocates the local heap. 
// ldp tmpReg, xzr, [sp], #-amount From c84bcfb46b622db2c377f5518c9beda0bb709558 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Sat, 5 Feb 2022 17:39:51 -0800 Subject: [PATCH 4/5] Clarify the comment in src/coreclr/jit/codegenarm64.cpp --- src/coreclr/jit/codegenarm64.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 616710d68c4e3d..bbb0c1a24a61c1 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2368,7 +2368,8 @@ void CodeGen::genLclHeap(GenTree* tree) { // The following probes the page and allocates the local heap. // ldp tmpReg, xzr, [sp], #-amount - // Note that behaviour of ldp where two source registers are the same is unpredictable. + // Note that we cannot use ldp xzr, xzr since + // the behaviour of ldp where two source registers are the same is unpredictable. const regNumber tmpReg = targetReg; GetEmitter()->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, REG_ZR, REG_SPBASE, -(ssize_t)amount, INS_OPTS_POST_INDEX); From 6be9207661d616ea951d8f99811b66423cd62e7a Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Sat, 5 Feb 2022 17:40:51 -0800 Subject: [PATCH 5/5] Address feedback in src/coreclr/jit/codegenarm64.cpp --- src/coreclr/jit/codegenarm64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index bbb0c1a24a61c1..edc756f18e04cc 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2336,7 +2336,7 @@ void CodeGen::genLclHeap(GenTree* tree) // The following zeroes the remaining space in [finalSp+16, initialSp-16) interval // using a sequence of stp instruction with unsigned offset. 
- for (ssize_t offset = storePairRegsWritesBytes; offset < (ssize_t)amount - storePairRegsWritesBytes; + for (ssize_t offset = storePairRegsWritesBytes; offset < finalSpDelta; offset += storePairRegsWritesBytes) { // stp xzr, xzr, [sp, #offset]