diff --git a/src/mscorlib/shared/System/Threading/SpinWait.cs b/src/mscorlib/shared/System/Threading/SpinWait.cs index d25d54f26f3f..5346e8d17bdc 100644 --- a/src/mscorlib/shared/System/Threading/SpinWait.cs +++ b/src/mscorlib/shared/System/Threading/SpinWait.cs @@ -69,9 +69,26 @@ public struct SpinWait // numbers may seem fairly arbitrary, but were derived with at least some // thought in the design document. I fully expect they will need to change // over time as we gain more experience with performance. - internal const int YIELD_THRESHOLD = 10; // When to switch over to a true yield. - internal const int SLEEP_0_EVERY_HOW_MANY_TIMES = 5; // After how many yields should we Sleep(0)? - internal const int SLEEP_1_EVERY_HOW_MANY_TIMES = 20; // After how many yields should we Sleep(1)? + internal const int YieldThreshold = 10; // When to switch over to a true yield. + private const int Sleep0EveryHowManyYields = 5; // After how many yields should we Sleep(0)? + internal const int DefaultSleep1Threshold = 20; // After how many yields should we Sleep(1) frequently? + + /// + /// A suggested number of spin iterations before doing a proper wait, such as waiting on an event that becomes signaled + /// when the resource becomes available. + /// + /// + /// These numbers were arrived at by experimenting with different numbers in various cases that currently use it. It's + /// only a suggested value and typically works well when the proper wait is something like an event. + /// + /// Spinning less can lead to early waiting and more context switching, spinning more can decrease latency but may use + /// up some CPU time unnecessarily. Depends on the situation too, for instance SemaphoreSlim uses double this number + /// because the waiting there is currently a lot more expensive (involves more spinning, taking a lock, etc.). It also + /// depends on the likelihood of the spin being successful and how long the wait would be but those are not accounted + /// for here. 
+ /// + internal static readonly int SpinCountforSpinBeforeWait = PlatformHelper.IsSingleProcessor ? 1 : 35; + internal const int Sleep1ThresholdForSpinBeforeWait = 40; // should be greater than SpinCountforSpinBeforeWait // The number of times we've spun already. private int _count; @@ -81,7 +98,12 @@ public struct SpinWait /// public int Count { - get { return _count; } + get => _count; + internal set + { + Debug.Assert(value >= 0); + _count = value; + } } /// @@ -94,10 +116,7 @@ public int Count /// On a single-CPU machine, always yields the processor. On machines with /// multiple CPUs, may yield after an unspecified number of calls. /// - public bool NextSpinWillYield - { - get { return _count > YIELD_THRESHOLD || PlatformHelper.IsSingleProcessor; } - } + public bool NextSpinWillYield => _count >= YieldThreshold || PlatformHelper.IsSingleProcessor; /// /// Performs a single spin. @@ -108,7 +127,27 @@ public bool NextSpinWillYield /// public void SpinOnce() { - if (NextSpinWillYield) + SpinOnce(DefaultSleep1Threshold); + } + + internal void SpinOnce(int sleep1Threshold) + { + Debug.Assert(sleep1Threshold >= YieldThreshold || PlatformHelper.IsSingleProcessor); // so that NextSpinWillYield behaves as requested + + // (_count - YieldThreshold) % 2 == 0: The purpose of this check is to interleave Thread.Yield/Sleep(0) with + // Thread.SpinWait. Otherwise, the following issues occur: + // - When there are no threads to switch to, Yield and Sleep(0) become no-op and it turns the spin loop into a + // busy-spin that may quickly reach the max spin count and cause the thread to enter a wait state, or may + // just busy-spin for longer than desired before a Sleep(1). Completing the spin loop too early can cause + // excessive context switching if a wait follows, and entering the Sleep(1) stage too early can cause + // excessive delays.
+ // - If there are multiple threads doing Yield and Sleep(0) (typically from the same spin loop due to + // contention), they may switch between one another, delaying work that can make progress. + if (( + _count >= YieldThreshold && + (_count >= sleep1Threshold || (_count - YieldThreshold) % 2 == 0) + ) || + PlatformHelper.IsSingleProcessor) { // // We must yield. @@ -125,19 +164,21 @@ public void SpinOnce() // configured to use the (default) coarse-grained system timer. // - int yieldsSoFar = (_count >= YIELD_THRESHOLD ? _count - YIELD_THRESHOLD : _count); - - if ((yieldsSoFar % SLEEP_1_EVERY_HOW_MANY_TIMES) == (SLEEP_1_EVERY_HOW_MANY_TIMES - 1)) + if (_count >= sleep1Threshold) { RuntimeThread.Sleep(1); } - else if ((yieldsSoFar % SLEEP_0_EVERY_HOW_MANY_TIMES) == (SLEEP_0_EVERY_HOW_MANY_TIMES - 1)) - { - RuntimeThread.Sleep(0); - } else { - RuntimeThread.Yield(); + int yieldsSoFar = _count >= YieldThreshold ? (_count - YieldThreshold) / 2 : _count; + if ((yieldsSoFar % Sleep0EveryHowManyYields) == (Sleep0EveryHowManyYields - 1)) + { + RuntimeThread.Sleep(0); + } + else + { + RuntimeThread.Yield(); + } } } else @@ -153,11 +194,24 @@ public void SpinOnce() // number of spins we are willing to tolerate to reduce delay to the caller, // since we expect most callers will eventually block anyway. // - RuntimeThread.SpinWait(4 << _count); + // Also, cap the maximum spin count to a value such that many thousands of CPU cycles would not be wasted doing + // the equivalent of YieldProcessor(), as at that point SwitchToThread/Sleep(0) are more likely to be able to + // allow other useful work to run. Long YieldProcessor() loops can help to reduce contention, but Sleep(1) is + // usually better for that. + // + // RuntimeThread.OptimalMaxSpinWaitsPerSpinIteration: + // - See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
+ // + int n = RuntimeThread.OptimalMaxSpinWaitsPerSpinIteration; + if (_count <= 30 && (1 << _count) < n) + { + n = 1 << _count; + } + RuntimeThread.SpinWait(n); } // Finally, increment our spin counter. - _count = (_count == int.MaxValue ? YIELD_THRESHOLD : _count + 1); + _count = (_count == int.MaxValue ? YieldThreshold : _count + 1); } /// @@ -299,9 +353,7 @@ internal static int ProcessorCount /// /// Gets whether the current machine has only a single processor. /// - internal static bool IsSingleProcessor - { - get { return ProcessorCount == 1; } - } + /// This typically does not change on a machine, so it's checked only once. + internal static readonly bool IsSingleProcessor = ProcessorCount == 1; } } diff --git a/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs b/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs index 605f974da0c5..4c67ea3fd625 100644 --- a/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs +++ b/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs @@ -15,6 +15,8 @@ namespace Internal.Runtime.Augments { public class RuntimeThread : CriticalFinalizerObject { + private static int s_optimalMaxSpinWaitsPerSpinIteration; + internal RuntimeThread() { } public static RuntimeThread Create(ThreadStart start) => new Thread(start); @@ -186,6 +188,33 @@ public void DisableComObjectEagerCleanup() private extern bool JoinInternal(int millisecondsTimeout); public static void Sleep(int millisecondsTimeout) => Thread.Sleep(millisecondsTimeout); + + [DllImport(JitHelpers.QCall)] + [SuppressUnmanagedCodeSecurity] + private static extern int GetOptimalMaxSpinWaitsPerSpinIterationInternal(); + + /// + /// Max value to be passed into for optimal delaying. This value is normalized to be + /// appropriate for the processor. 
+ /// + internal static int OptimalMaxSpinWaitsPerSpinIteration + { + get + { + if (s_optimalMaxSpinWaitsPerSpinIteration != 0) + { + return s_optimalMaxSpinWaitsPerSpinIteration; + } + + // This is done lazily because the first call to the function below in the process triggers a measurement that + // takes a nontrivial amount of time. See Thread::InitializeYieldProcessorNormalized(), which describes and + // calculates this value. + s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal(); + Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0); + return s_optimalMaxSpinWaitsPerSpinIteration; + } + } + public static void SpinWait(int iterations) => Thread.SpinWait(iterations); public static bool Yield() => Thread.Yield(); diff --git a/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs b/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs index e39696849989..8a245f060207 100644 --- a/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs +++ b/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs @@ -12,9 +12,6 @@ // // =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -using System; -using System.Threading; -using System.Runtime.InteropServices; using System.Diagnostics; using System.Diagnostics.Contracts; @@ -48,7 +45,6 @@ public class ManualResetEventSlim : IDisposable { // These are the default spin counts we use on single-proc and MP machines. private const int DEFAULT_SPIN_SP = 1; - private const int DEFAULT_SPIN_MP = SpinWait.YIELD_THRESHOLD; private volatile object m_lock; // A lock used for waiting and pulsing. Lazily initialized via EnsureLockObjectCreated() @@ -193,7 +189,7 @@ public ManualResetEventSlim(bool initialState) { // Specify the defualt spin count, and use default spin if we're // on a multi-processor machine. Otherwise, we won't. 
- Initialize(initialState, DEFAULT_SPIN_MP); + Initialize(initialState, SpinWait.SpinCountforSpinBeforeWait); } /// @@ -563,44 +559,19 @@ public bool Wait(int millisecondsTimeout, CancellationToken cancellationToken) bNeedTimeoutAdjustment = true; } - //spin - int HOW_MANY_SPIN_BEFORE_YIELD = 10; - int HOW_MANY_YIELD_EVERY_SLEEP_0 = 5; - int HOW_MANY_YIELD_EVERY_SLEEP_1 = 20; - + // Spin int spinCount = SpinCount; - for (int i = 0; i < spinCount; i++) + var spinner = new SpinWait(); + while (spinner.Count < spinCount) { + spinner.SpinOnce(SpinWait.Sleep1ThresholdForSpinBeforeWait); + if (IsSet) { return true; } - else if (i < HOW_MANY_SPIN_BEFORE_YIELD) - { - if (i == HOW_MANY_SPIN_BEFORE_YIELD / 2) - { - Thread.Yield(); - } - else - { - Thread.SpinWait(4 << i); - } - } - else if (i % HOW_MANY_YIELD_EVERY_SLEEP_1 == 0) - { - Thread.Sleep(1); - } - else if (i % HOW_MANY_YIELD_EVERY_SLEEP_0 == 0) - { - Thread.Sleep(0); - } - else - { - Thread.Yield(); - } - - if (i >= 100 && i % 10 == 0) // check the cancellation token if the user passed a very large spin count + if (spinner.Count >= 100 && spinner.Count % 10 == 0) // check the cancellation token if the user passed a very large spin count cancellationToken.ThrowIfCancellationRequested(); } diff --git a/src/mscorlib/src/System/Threading/SemaphoreSlim.cs b/src/mscorlib/src/System/Threading/SemaphoreSlim.cs index e00a106eb34a..972b21adae3b 100644 --- a/src/mscorlib/src/System/Threading/SemaphoreSlim.cs +++ b/src/mscorlib/src/System/Threading/SemaphoreSlim.cs @@ -342,15 +342,28 @@ public bool Wait(int millisecondsTimeout, CancellationToken cancellationToken) CancellationTokenRegistration cancellationTokenRegistration = cancellationToken.InternalRegisterWithoutEC(s_cancellationTokenCanceledEventHandler, this); try { - // Perf: first spin wait for the count to be positive, but only up to the first planned yield. + // Perf: first spin wait for the count to be positive. 
// This additional amount of spinwaiting in addition // to Monitor.Enter()’s spinwaiting has shown measurable perf gains in test scenarios. // + + // Monitor.Enter followed by Monitor.Wait is much more expensive than waiting on an event as it involves another + // spin, contention, etc. The usual number of spin iterations that would otherwise be used here is doubled to + // lessen that extra expense of doing a proper wait. + int spinCount = SpinWait.SpinCountforSpinBeforeWait * 2; + int sleep1Threshold = SpinWait.Sleep1ThresholdForSpinBeforeWait * 2; + SpinWait spin = new SpinWait(); - while (m_currentCount == 0 && !spin.NextSpinWillYield) + while (spin.Count < spinCount) { - spin.SpinOnce(); + spin.SpinOnce(sleep1Threshold); + + if (m_currentCount != 0) + { + break; + } } + // entering the lock and incrementing waiters must not suffer a thread-abort, else we cannot // clean up m_waitCount correctly, which may lead to deadlock due to non-woken waiters. try { } diff --git a/src/mscorlib/src/System/Threading/SpinLock.cs b/src/mscorlib/src/System/Threading/SpinLock.cs index eee73ce2bf48..dbf2024e5dc9 100644 --- a/src/mscorlib/src/System/Threading/SpinLock.cs +++ b/src/mscorlib/src/System/Threading/SpinLock.cs @@ -65,16 +65,9 @@ public struct SpinLock private volatile int m_owner; - // The multiplier factor for the each spinning iteration - // This number has been chosen after trying different numbers on different CPUs (4, 8 and 16 ) and this provided the best results - private const int SPINNING_FACTOR = 100; - // After how many yields, call Sleep(1) private const int SLEEP_ONE_FREQUENCY = 40; - // After how many yields, call Sleep(0) - private const int SLEEP_ZERO_FREQUENCY = 10; - // After how many yields, check the timeout private const int TIMEOUT_CHECK_FREQUENCY = 10; @@ -347,48 +340,24 @@ private void ContinueTryEnter(int millisecondsTimeout, ref bool lockTaken) else //failed to acquire the lock,then try to update the waiters.
If the waiters count reached the maximum, jsut break the loop to avoid overflow { if ((observedOwner & WAITERS_MASK) != MAXIMUM_WAITERS) + { + // This can still overflow, but maybe there will never be that many waiters turn = (Interlocked.Add(ref m_owner, 2) & WAITERS_MASK) >> 1; + } } - //***Step 2. Spinning //lock acquired failed and waiters updated - int processorCount = PlatformHelper.ProcessorCount; - if (turn < processorCount) - { - int processFactor = 1; - for (int i = 1; i <= turn * SPINNING_FACTOR; i++) - { - Thread.SpinWait((turn + i) * SPINNING_FACTOR * processFactor); - if (processFactor < processorCount) - processFactor++; - observedOwner = m_owner; - if ((observedOwner & LOCK_ANONYMOUS_OWNED) == LOCK_UNOWNED) - { - int newOwner = (observedOwner & WAITERS_MASK) == 0 ? // Gets the number of waiters, if zero - observedOwner | 1 // don't decrement it. just set the lock bit, it is zzero because a previous call of Exit(false) ehich corrupted the waiters - : (observedOwner - 2) | 1; // otherwise decrement the waiters and set the lock bit - Debug.Assert((newOwner & WAITERS_MASK) >= 0); - - if (CompareExchange(ref m_owner, newOwner, observedOwner, ref lockTaken) == observedOwner) - { - return; - } - } - } - // Check the timeout. 
- if (millisecondsTimeout != Timeout.Infinite && TimeoutHelper.UpdateTimeOut(startTime, millisecondsTimeout) <= 0) - { - DecrementWaiters(); - return; - } + //*** Step 2, Spinning and Yielding + var spinner = new SpinWait(); + if (turn > PlatformHelper.ProcessorCount) + { + spinner.Count = SpinWait.YieldThreshold; } - - //*** Step 3, Yielding - //Sleep(1) every 50 yields - int yieldsoFar = 0; while (true) { + spinner.SpinOnce(SLEEP_ONE_FREQUENCY); + observedOwner = m_owner; if ((observedOwner & LOCK_ANONYMOUS_OWNED) == LOCK_UNOWNED) { @@ -403,20 +372,7 @@ private void ContinueTryEnter(int millisecondsTimeout, ref bool lockTaken) } } - if (yieldsoFar % SLEEP_ONE_FREQUENCY == 0) - { - Thread.Sleep(1); - } - else if (yieldsoFar % SLEEP_ZERO_FREQUENCY == 0) - { - Thread.Sleep(0); - } - else - { - Thread.Yield(); - } - - if (yieldsoFar % TIMEOUT_CHECK_FREQUENCY == 0) + if (spinner.Count % TIMEOUT_CHECK_FREQUENCY == 0) { //Check the timeout. if (millisecondsTimeout != Timeout.Infinite && TimeoutHelper.UpdateTimeOut(startTime, millisecondsTimeout) <= 0) @@ -425,8 +381,6 @@ private void ContinueTryEnter(int millisecondsTimeout, ref bool lockTaken) return; } } - - yieldsoFar++; } } diff --git a/src/mscorlib/src/System/Threading/Tasks/Task.cs b/src/mscorlib/src/System/Threading/Tasks/Task.cs index 8e848842ee5e..4882ebf5a87f 100644 --- a/src/mscorlib/src/System/Threading/Tasks/Task.cs +++ b/src/mscorlib/src/System/Threading/Tasks/Task.cs @@ -10,19 +10,14 @@ // // =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -using System; using System.Collections.Generic; using System.Collections.ObjectModel; -using System.Runtime; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using System.Runtime.ExceptionServices; -using System.Security; -using System.Threading; using System.Diagnostics; using System.Diagnostics.Contracts; -using Microsoft.Win32; using System.Diagnostics.Tracing; +using 
System.Runtime.CompilerServices; +using System.Runtime.ExceptionServices; +using Internal.Runtime.Augments; // Disable the "reference to volatile field not treated as volatile" error. #pragma warning disable 0420 @@ -2971,26 +2966,19 @@ private bool SpinWait(int millisecondsTimeout) return false; } - //This code is pretty similar to the custom spinning in MRES except there is no yieling after we exceed the spin count - int spinCount = PlatformHelper.IsSingleProcessor ? 1 : System.Threading.SpinWait.YIELD_THRESHOLD; //spin only once if we are running on a single CPU - for (int i = 0; i < spinCount; i++) + int spinCount = Threading.SpinWait.SpinCountforSpinBeforeWait; + var spinner = new SpinWait(); + while (spinner.Count < spinCount) { + spinner.SpinOnce(Threading.SpinWait.Sleep1ThresholdForSpinBeforeWait); + if (IsCompleted) { return true; } - - if (i == spinCount / 2) - { - Thread.Yield(); - } - else - { - Thread.SpinWait(4 << i); - } } - return IsCompleted; + return false; } /// @@ -3227,7 +3215,7 @@ private void RunContinuations(object continuationObject) // separated out of Fin // Skip synchronous execution of continuations if this task's thread was aborted bool bCanInlineContinuations = !(((m_stateFlags & TASK_STATE_THREAD_WAS_ABORTED) != 0) || - (Thread.CurrentThread.ThreadState == ThreadState.AbortRequested) || + (RuntimeThread.CurrentThread.ThreadState == ThreadState.AbortRequested) || ((m_stateFlags & (int)TaskCreationOptions.RunContinuationsAsynchronously) != 0)); // Handle the single-Action case diff --git a/src/vm/comsynchronizable.cpp b/src/vm/comsynchronizable.cpp index 0554fe338593..8fce346142c4 100644 --- a/src/vm/comsynchronizable.cpp +++ b/src/vm/comsynchronizable.cpp @@ -1624,22 +1624,41 @@ FCIMPL1(FC_BOOL_RET, ThreadNative::IsThreadpoolThread, ThreadBaseObject* thread) } FCIMPLEND +INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration() +{ + QCALL_CONTRACT; + + INT32 optimalMaxNormalizedYieldsPerSpinIteration; + + BEGIN_QCALL; + + 
Thread::EnsureYieldProcessorNormalizedInitialized(); + optimalMaxNormalizedYieldsPerSpinIteration = Thread::GetOptimalMaxNormalizedYieldsPerSpinIteration(); + + END_QCALL; + + return optimalMaxNormalizedYieldsPerSpinIteration; +} FCIMPL1(void, ThreadNative::SpinWait, int iterations) { FCALL_CONTRACT; + if (iterations <= 0) + { + return; + } + // // If we're not going to spin for long, it's ok to remain in cooperative mode. // The threshold is determined by the cost of entering preemptive mode; if we're // spinning for less than that number of cycles, then switching to preemptive - // mode won't help a GC start any faster. That number is right around 1000000 - // on my machine. + // mode won't help a GC start any faster. // - if (iterations <= 1000000) + if (iterations <= 100000 && Thread::IsYieldProcessorNormalizedInitialized()) { - for(int i = 0; i < iterations; i++) - YieldProcessor(); + for (int i = 0; i < iterations; i++) + Thread::YieldProcessorNormalized(); return; } @@ -1649,8 +1668,9 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations) HELPER_METHOD_FRAME_BEGIN_NOPOLL(); GCX_PREEMP(); - for(int i = 0; i < iterations; i++) - YieldProcessor(); + Thread::EnsureYieldProcessorNormalizedInitialized(); + for (int i = 0; i < iterations; i++) + Thread::YieldProcessorNormalized(); HELPER_METHOD_FRAME_END(); } diff --git a/src/vm/comsynchronizable.h b/src/vm/comsynchronizable.h index 00b055c96070..b280c605b89f 100644 --- a/src/vm/comsynchronizable.h +++ b/src/vm/comsynchronizable.h @@ -97,6 +97,7 @@ friend class ThreadBaseObject; UINT64 QCALLTYPE GetProcessDefaultStackSize(); static FCDECL1(INT32, GetManagedThreadId, ThreadBaseObject* th); + static INT32 QCALLTYPE GetOptimalMaxSpinWaitsPerSpinIteration(); static FCDECL1(void, SpinWait, int iterations); static BOOL QCALLTYPE YieldThread(); static FCDECL0(Object*, GetCurrentThread); diff --git a/src/vm/ecalllist.h b/src/vm/ecalllist.h index 876bafc47b3e..f27e2205fe27 100644 --- a/src/vm/ecalllist.h +++ 
b/src/vm/ecalllist.h @@ -709,6 +709,7 @@ FCFuncStart(gRuntimeThreadFuncs) #endif // FEATURE_COMINTEROP FCFuncElement("InterruptInternal", ThreadNative::Interrupt) FCFuncElement("JoinInternal", ThreadNative::Join) + QCFuncElement("GetOptimalMaxSpinWaitsPerSpinIterationInternal", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration) FCFuncEnd() FCFuncStart(gThreadFuncs) diff --git a/src/vm/threads.cpp b/src/vm/threads.cpp index b827140dd49b..abc544338b7c 100644 --- a/src/vm/threads.cpp +++ b/src/vm/threads.cpp @@ -11744,3 +11744,87 @@ ULONGLONG Thread::QueryThreadProcessorUsage() return ullCurrentUsage - ullPreviousUsage; } #endif // FEATURE_APPDOMAIN_RESOURCE_MONITORING + +int Thread::s_yieldsPerNormalizedYield = 0; +int Thread::s_optimalMaxNormalizedYieldsPerSpinIteration = 0; + +static Crst s_initializeYieldProcessorNormalizedCrst(CrstLeafLock); +void Thread::InitializeYieldProcessorNormalized() +{ + LIMITED_METHOD_CONTRACT; + + CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst); + + if (IsYieldProcessorNormalizedInitialized()) + { + return; + } + + // Intel pre-Skylake processor: measured typically 14-17 cycles per yield + // Intel post-Skylake processor: measured typically 125-150 cycles per yield + const int DefaultYieldsPerNormalizedYield = 1; // defaults are for when no measurement is done + const int DefaultOptimalMaxNormalizedYieldsPerSpinIteration = 64; // tuned for pre-Skylake processors, for post-Skylake it should be 7 + const int MeasureDurationMs = 10; + const int MaxYieldsPerNormalizedYield = 10; // measured typically 8-9 on pre-Skylake + const int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake + const int NsPerOptimialMaxSpinIterationDuration = 272; // approx. 
900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake + const int NsPerSecond = 1000 * 1000 * 1000; + + LARGE_INTEGER li; + if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs) + { + // High precision clock not available or clock resolution is too low, resort to defaults + s_yieldsPerNormalizedYield = DefaultYieldsPerNormalizedYield; + s_optimalMaxNormalizedYieldsPerSpinIteration = DefaultOptimalMaxNormalizedYieldsPerSpinIteration; + return; + } + ULONGLONG ticksPerSecond = li.QuadPart; + + // Measure the nanosecond delay per yield + ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs); + unsigned int yieldCount = 0; + QueryPerformanceCounter(&li); + ULONGLONG startTicks = li.QuadPart; + ULONGLONG elapsedTicks; + do + { + for (int i = 0; i < 10; ++i) + { + YieldProcessor(); + } + yieldCount += 10; + + QueryPerformanceCounter(&li); + ULONGLONG nowTicks = li.QuadPart; + elapsedTicks = nowTicks - startTicks; + } while (elapsedTicks < measureDurationTicks); + double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond); + if (nsPerYield < 1) + { + nsPerYield = 1; + } + + // Calculate the number of yields required to span the duration of a normalized yield + int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5); + if (yieldsPerNormalizedYield < 1) + { + yieldsPerNormalizedYield = 1; + } + else if (yieldsPerNormalizedYield > MaxYieldsPerNormalizedYield) + { + yieldsPerNormalizedYield = MaxYieldsPerNormalizedYield; + } + + // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to + // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a + // better job of allowing other work to run. 
+ int optimalMaxNormalizedYieldsPerSpinIteration = + (int)(NsPerOptimialMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5); + if (optimalMaxNormalizedYieldsPerSpinIteration < 1) + { + optimalMaxNormalizedYieldsPerSpinIteration = 1; + } + + s_yieldsPerNormalizedYield = yieldsPerNormalizedYield; + s_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration; +} diff --git a/src/vm/threads.h b/src/vm/threads.h index ad433e765b39..be36fe624e80 100644 --- a/src/vm/threads.h +++ b/src/vm/threads.h @@ -5362,6 +5362,70 @@ class Thread: public IUnknown m_HijackReturnKind = returnKind; } #endif // FEATURE_HIJACK + +private: + static int s_yieldsPerNormalizedYield; + static int s_optimalMaxNormalizedYieldsPerSpinIteration; + +private: + static void InitializeYieldProcessorNormalized(); + +public: + static bool IsYieldProcessorNormalizedInitialized() + { + LIMITED_METHOD_CONTRACT; + return s_yieldsPerNormalizedYield != 0 && s_optimalMaxNormalizedYieldsPerSpinIteration != 0; + } + +public: + static void EnsureYieldProcessorNormalizedInitialized() + { + LIMITED_METHOD_CONTRACT; + + if (!IsYieldProcessorNormalizedInitialized()) + { + InitializeYieldProcessorNormalized(); + } + } + +public: + static int GetOptimalMaxNormalizedYieldsPerSpinIteration() + { + WRAPPER_NO_CONTRACT; + _ASSERTE(IsYieldProcessorNormalizedInitialized()); + + return s_optimalMaxNormalizedYieldsPerSpinIteration; + } + +public: + static void YieldProcessorNormalized() + { + WRAPPER_NO_CONTRACT; + _ASSERTE(IsYieldProcessorNormalizedInitialized()); + + int n = s_yieldsPerNormalizedYield; + while (--n >= 0) + { + YieldProcessor(); + } + } + + static void YieldProcessorNormalizedWithBackOff(unsigned int spinIteration) + { + WRAPPER_NO_CONTRACT; + _ASSERTE(IsYieldProcessorNormalizedInitialized()); + + int n = s_optimalMaxNormalizedYieldsPerSpinIteration; + if (spinIteration <= 30 && (1 << spinIteration) < n) + { + n = 1 << spinIteration; + } + n *= 
s_yieldsPerNormalizedYield; + while (--n >= 0) + { + YieldProcessor(); + } + } }; // End of class Thread