From 046d9276d6df88deaa5b462b1ea03235f7f746f2 Mon Sep 17 00:00:00 2001 From: RalfKornmannEnvision Date: Wed, 2 Sep 2020 15:14:26 +0200 Subject: [PATCH] ARM64: optimize PREPARE_EXTERNAL_VAR (#21) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are multiple cases where we need not the pointer to an external variable but the value that is stored at this location. So far this was done with the PREPARE_EXTERNAL_VAR followed by an ldr x?, [x?]. The PREPARE_EXTERNAL_VAR macro needs two instructions (adrp + add). As the ldr instruction supports an offset, we can eliminate the add for this use case. The two new macros PREPARE_EXTERNAL_VAR_INDIRECT and PREPARE_EXTERNAL_VAR_INDIRECT_W make use of this. --- src/Native/Runtime/arm64/ExceptionHandling.S | 3 +- .../Runtime/arm64/InteropThunksHelpers.S | 3 +- src/Native/Runtime/arm64/PInvoke.S | 12 +++---- src/Native/Runtime/arm64/StubDispatch.S | 3 +- src/Native/Runtime/arm64/WriteBarriers.S | 34 +++++++------------ .../Runtime/unix/unixasmmacrosarm64.inc | 11 ++++++ 6 files changed, 30 insertions(+), 36 deletions(-) diff --git a/src/Native/Runtime/arm64/ExceptionHandling.S b/src/Native/Runtime/arm64/ExceptionHandling.S index 660820cde15..bb3f2bd9ae0 100644 --- a/src/Native/Runtime/arm64/ExceptionHandling.S +++ b/src/Native/Runtime/arm64/ExceptionHandling.S @@ -465,9 +465,8 @@ PopExInfoLoop: DonePopping: str x3, [x1, #OFFSETOF__Thread__m_pExInfoStackHead] // store the new head on the Thread - PREPARE_EXTERNAL_VAR RhpTrapThreads, x3 + PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 3 - ldr w3, [x3] tbz x3, #TrapThreadsFlags_AbortInProgress_Bit, NoAbort ldr x3, [sp, #rsp_offset_is_not_handling_thread_abort] diff --git a/src/Native/Runtime/arm64/InteropThunksHelpers.S b/src/Native/Runtime/arm64/InteropThunksHelpers.S index 8c6e4198bd7..34ffd58b42d 100644 --- a/src/Native/Runtime/arm64/InteropThunksHelpers.S +++ b/src/Native/Runtime/arm64/InteropThunksHelpers.S @@ -40,8 +40,7 @@ 
POINTER_SIZE = 0x08 // IntPtr RhGetCommonStubAddress() // LEAF_ENTRY RhGetCommonStubAddress, _TEXT - adrp x0, RhCommonStub - add x0, x0, :lo12:RhCommonStub + PREPARE_EXTERNAL_VAR RhCommonStub, x0 ret LEAF_END RhGetCommonStubAddress, _TEXT diff --git a/src/Native/Runtime/arm64/PInvoke.S b/src/Native/Runtime/arm64/PInvoke.S index d3a2accd33d..ad10690c4f5 100644 --- a/src/Native/Runtime/arm64/PInvoke.S +++ b/src/Native/Runtime/arm64/PInvoke.S @@ -118,9 +118,8 @@ Done: PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, #-0x10 - PREPARE_EXTERNAL_VAR RhpTrapThreads, x10 + PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 10 - ldr w10, [x10] tbz x10, #TrapThreadsFlags_TrapThreads_Bit, NoWait bl RhpWaitForGCNoAbort NoWait: @@ -181,9 +180,8 @@ ThreadAttached: str xzr, [x10, #OFFSETOF__Thread__m_pTransitionFrame] dmb ish - PREPARE_EXTERNAL_VAR RhpTrapThreads, x11 + PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 11 - ldr w11, [x11] tbnz x11, #TrapThreadsFlags_TrapThreads_Bit, TrapThread ret @@ -294,9 +292,8 @@ NESTED_ENTRY RhpPInvoke, _TEXT, NoHandler str x10, [x0, #OFFSETOF__PInvokeTransitionFrame__m_pThread] str x0, [x10, #OFFSETOF__Thread__m_pTransitionFrame] - PREPARE_EXTERNAL_VAR RhpTrapThreads, x9 + PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 9 - ldr w9, [x9] cbnz w9, InvokeRareTrapThread // TrapThreadsFlags_None = 0 ret @@ -310,9 +307,8 @@ LEAF_ENTRY RhpPInvokeReturn, _TEXT mov x10, 0 str x10, [x9, #OFFSETOF__Thread__m_pTransitionFrame] - PREPARE_EXTERNAL_VAR RhpTrapThreads, x9 + PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 9 - ldr w9, [x9] cbnz w9, 0f // TrapThreadsFlags_None = 0 ret 0: diff --git a/src/Native/Runtime/arm64/StubDispatch.S b/src/Native/Runtime/arm64/StubDispatch.S index dd820f1fc81..6548dee6031 100644 --- a/src/Native/Runtime/arm64/StubDispatch.S +++ b/src/Native/Runtime/arm64/StubDispatch.S @@ -99,8 +99,7 @@ // Calling convention of the universal thunk is: // xip0: contains target address for the thunk to call // xip1: contains parameter of the thunks target 
- adrp xip0, RhpCidResolve - add xip0, xip0, :lo12:RhpCidResolve + PREPARE_EXTERNAL_VAR RhpCidResolve, xip0 mov xip1, x11 b RhpUniversalTransition_DebugStepTailCall LEAF_END RhpInterfaceDispatchSlow, _TEXT diff --git a/src/Native/Runtime/arm64/WriteBarriers.S b/src/Native/Runtime/arm64/WriteBarriers.S index 3c0642857d5..bbb4885ec0a 100644 --- a/src/Native/Runtime/arm64/WriteBarriers.S +++ b/src/Native/Runtime/arm64/WriteBarriers.S @@ -28,8 +28,7 @@ .macro UPDATE_GC_SHADOW destReg, refReg // If g_GCShadow is 0, don't perform the check. - PREPARE_EXTERNAL_VAR g_GCShadow, X9 - ldr x9, [x9] + PREPARE_EXTERNAL_VAR_INDIRECT g_GCShadow, X9 cbz x9, 1f // Save destReg since we're about to modify it (and we need the original value both within the macro and @@ -37,17 +36,14 @@ mov x10, \destReg // Transform destReg into the equivalent address in the shadow heap. - PREPARE_EXTERNAL_VAR g_lowest_address, X9 - ldr x9, [x9] + PREPARE_EXTERNAL_VAR_INDIRECT g_lowest_address, X9 subs \destReg, \destReg, x9 blt 0f - PREPARE_EXTERNAL_VAR g_GCShadow, X9 - ldr x9, [x9] + PREPARE_EXTERNAL_VAR_INDIRECT g_GCShadow, X9 add \destReg, \destReg, x9 - PREPARE_EXTERNAL_VAR g_GCShadowEnd, X9 - ldr x9, [x9] + PREPARE_EXTERNAL_VAR_INDIRECT g_GCShadowEnd, X9 cmp \destReg, x9 bgt 0f @@ -109,8 +105,8 @@ #ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP // Update the write watch table if necessary - PREPARE_EXTERNAL_VAR g_write_watch_table, x\trash - ldr x\trash, [x\trash] + PREPARE_EXTERNAL_VAR_INDIRECT g_write_watch_table, x\trash + cbz x\trash, 2f add x\trash, x\trash, \destReg, lsr #0xc // SoftwareWriteWatch::AddressToTableByteIndexShift ldrb w17, [x\trash] @@ -122,20 +118,17 @@ 2: // We can skip the card table write if the reference is to // an object not on the epehemeral segment. 
- PREPARE_EXTERNAL_VAR g_ephemeral_low, x\trash - ldr x\trash, [x\trash] + PREPARE_EXTERNAL_VAR_INDIRECT g_ephemeral_low, x\trash cmp \refReg, x\trash blt 0f - PREPARE_EXTERNAL_VAR g_ephemeral_high, x\trash - ldr x\trash, [x\trash] + PREPARE_EXTERNAL_VAR_INDIRECT g_ephemeral_high, x\trash cmp \refReg, x\trash bge 0f // Set this objects card, if it has not already been set. - PREPARE_EXTERNAL_VAR g_card_table, x\trash - ldr x\trash, [x\trash] + PREPARE_EXTERNAL_VAR_INDIRECT g_card_table, x\trash add \trash2, x\trash, \destReg, lsr #11 // Check that this card has not already been written. Avoiding useless writes is a big win on @@ -149,8 +142,7 @@ #ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES // Check if we need to update the card bundle table - PREPARE_EXTERNAL_VAR g_card_bundle_table, x\trash - ldr x\trash, [x\trash] + PREPARE_EXTERNAL_VAR_INDIRECT g_card_bundle_table, x\trash add \trash2, x\trash, \destReg, lsr #21 ldrb w\trash, [\trash2] cmp x\trash, 0xFF @@ -178,13 +170,11 @@ // The "check" of this checked write barrier - is destReg // within the heap? if no, early out. 
- PREPARE_EXTERNAL_VAR g_lowest_address, x\trash - ldr x\trash, [x\trash] + PREPARE_EXTERNAL_VAR_INDIRECT g_lowest_address, x\trash cmp \destReg, x\trash blt 0f - PREPARE_EXTERNAL_VAR g_highest_address, x\trash - ldr x\trash, [x\trash] + PREPARE_EXTERNAL_VAR_INDIRECT g_highest_address, x\trash cmp \destReg, x\trash bgt 0f diff --git a/src/Native/Runtime/unix/unixasmmacrosarm64.inc b/src/Native/Runtime/unix/unixasmmacrosarm64.inc index 52f3e0e6f6c..262d88df109 100644 --- a/src/Native/Runtime/unix/unixasmmacrosarm64.inc +++ b/src/Native/Runtime/unix/unixasmmacrosarm64.inc @@ -46,6 +46,17 @@ C_FUNC(\Name): add \HelperReg, \HelperReg, :lo12:C_FUNC(\Name) .endm +.macro PREPARE_EXTERNAL_VAR_INDIRECT Name, HelperReg + adrp \HelperReg, C_FUNC(\Name) + ldr \HelperReg, [\HelperReg, :lo12:C_FUNC(\Name)] +.endm + +.macro PREPARE_EXTERNAL_VAR_INDIRECT_W Name, HelperReg + adrp x\HelperReg, C_FUNC(\Name) + ldr w\HelperReg, [x\HelperReg, :lo12:C_FUNC(\Name)] +.endm + + .macro PROLOG_STACK_ALLOC Size sub sp, sp, \Size .cfi_adjust_cfa_offset \Size