From 3a931a8d6ecb38704326b99545c6416f9b88a420 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 3 Jul 2024 23:22:14 +0000 Subject: [PATCH 1/3] Enable TLS on linux/arm64 only for static resolver --- src/coreclr/vm/arm64/asmhelpers.S | 14 ++++++++++++++ src/coreclr/vm/threadstatics.cpp | 19 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/coreclr/vm/arm64/asmhelpers.S b/src/coreclr/vm/arm64/asmhelpers.S index e163d807404e18..ebfefd693f0744 100644 --- a/src/coreclr/vm/arm64/asmhelpers.S +++ b/src/coreclr/vm/arm64/asmhelpers.S @@ -794,4 +794,18 @@ LEAF_ENTRY GetThreadStaticsVariableOffset, _TEXT EPILOG_RETURN LEAF_END GetThreadStaticsVariableOffset, _TEXT // ------------------------------------------------------------------ + +// ------------------------------------------------------------------ +// size_t GetTLSResolverAddress() + +// Helper to get the TLS resolver address. This will be then used to determine if we have a static or dynamic resolver. +LEAF_ENTRY GetTLSResolverAddress, _TEXT + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -32 + adrp x0, :tlsdesc:t_ThreadStatics + ldr x1, [x0, #:tlsdesc_lo12:t_ThreadStatics] + mov x0, x1 + EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 32 + EPILOG_RETURN +LEAF_END GetTLSResolverAddress, _TEXT +// ------------------------------------------------------------------ #endif // !TARGET_OSX diff --git a/src/coreclr/vm/threadstatics.cpp b/src/coreclr/vm/threadstatics.cpp index 0fd8f389f1a7df..448058132c767e 100644 --- a/src/coreclr/vm/threadstatics.cpp +++ b/src/coreclr/vm/threadstatics.cpp @@ -784,6 +784,8 @@ void FreeTLSIndicesForLoaderAllocator(LoaderAllocator *pLoaderAllocator) static void* GetTlsIndexObjectAddress(); +extern "C" size_t GetTLSResolverAddress(); + bool CanJITOptimizeTLSAccess() { LIMITED_METHOD_CONTRACT; @@ -799,6 +801,23 @@ bool CanJITOptimizeTLSAccess() // Optimization is disabled for FreeBSD/arm64 #elif defined(FEATURE_INTERPRETER) // Optimization is disabled when interpreter may be used +#elif 
defined(TARGET_UNIX) && defined(TARGET_ARM64) + // Optimization is enabled for linux/arm64 only for static resolver. + // For static resolver, the TP offset is same for all threads. + // For dynamic resolver, TP offset returned is that of a JIT thread and + // will be different for the executing thread. + uint32_t* resolverAddress = reinterpret_cast<uint32_t*>(GetTLSResolverAddress()); + if ( + // nop or hint 32 + ((resolverAddress[0] == 0xd503201f) || (resolverAddress[0] == 0xd503241f)) && + // ldr x0, [x0, #8] + (resolverAddress[1] == 0xf9400400) && + // ret + (resolverAddress[2] == 0xd65f03c0) + ) + { + optimizeThreadStaticAccess = true; + } #else optimizeThreadStaticAccess = true; #if !defined(TARGET_OSX) && defined(TARGET_UNIX) && defined(TARGET_AMD64) From 6ecf11fec6c07aefc61f3647a3b3e358e35b6008 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 3 Jul 2024 23:34:49 +0000 Subject: [PATCH 2/3] Wrap GetTLSResolverAddress() in ifdef --- src/coreclr/vm/threadstatics.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/coreclr/vm/threadstatics.cpp b/src/coreclr/vm/threadstatics.cpp index 448058132c767e..0ccfe1216deefa 100644 --- a/src/coreclr/vm/threadstatics.cpp +++ b/src/coreclr/vm/threadstatics.cpp @@ -784,7 +784,9 @@ void FreeTLSIndicesForLoaderAllocator(LoaderAllocator *pLoaderAllocator) static void* GetTlsIndexObjectAddress(); +#if !defined(TARGET_OSX) && defined(TARGET_UNIX) && defined(TARGET_ARM64) extern "C" size_t GetTLSResolverAddress(); +#endif // !TARGET_OSX && TARGET_UNIX && TARGET_ARM64 bool CanJITOptimizeTLSAccess() { @@ -801,7 +803,7 @@ bool CanJITOptimizeTLSAccess() // Optimization is disabled for FreeBSD/arm64 #elif defined(FEATURE_INTERPRETER) // Optimization is disabled when interpreter may be used -#elif defined(TARGET_UNIX) && defined(TARGET_ARM64) +#elif !defined(TARGET_OSX) && defined(TARGET_UNIX) && defined(TARGET_ARM64) // Optimization is enabled for linux/arm64 only for static resolver.
// For static resolver, the TP offset is same for all threads. // For dynamic resolver, TP offset returned is that of a JIT thread and From 67cf8a88ea972556b4a73d3df27673b9027cbac6 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 5 Jul 2024 15:46:55 +0000 Subject: [PATCH 3/3] add check to skip nop for older resolver --- src/coreclr/vm/threadstatics.cpp | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/coreclr/vm/threadstatics.cpp b/src/coreclr/vm/threadstatics.cpp index 0ccfe1216deefa..e745131a4e097b 100644 --- a/src/coreclr/vm/threadstatics.cpp +++ b/src/coreclr/vm/threadstatics.cpp @@ -806,20 +806,31 @@ bool CanJITOptimizeTLSAccess() #elif !defined(TARGET_OSX) && defined(TARGET_UNIX) && defined(TARGET_ARM64) // Optimization is enabled for linux/arm64 only for static resolver. // For static resolver, the TP offset is same for all threads. - // For dynamic resolver, TP offset returned is that of a JIT thread and - // will be different for the executing thread. + // For dynamic resolver, TP offset returned is for the current thread and + // will be different for the other threads. uint32_t* resolverAddress = reinterpret_cast<uint32_t*>(GetTLSResolverAddress()); - if ( + int ip = 0; + if ((resolverAddress[ip] == 0xd503201f) || (resolverAddress[ip] == 0xd503241f)) + { + // nop might not be present in older resolver, so skip it. + // nop or hint 32 - ((resolverAddress[0] == 0xd503201f) || (resolverAddress[0] == 0xd503241f)) && + ip++; + } + + if ( // ldr x0, [x0, #8] - (resolverAddress[1] == 0xf9400400) && + (resolverAddress[ip] == 0xf9400400) && // ret - (resolverAddress[2] == 0xd65f03c0) + (resolverAddress[ip + 1] == 0xd65f03c0) ) { optimizeThreadStaticAccess = true; } + else + { + _ASSERTE(false && "Unexpected code sequence."); + } #else optimizeThreadStaticAccess = true; #if !defined(TARGET_OSX) && defined(TARGET_UNIX) && defined(TARGET_AMD64)