diff --git a/src/mscorlib/shared/System/Threading/SpinWait.cs b/src/mscorlib/shared/System/Threading/SpinWait.cs
index d25d54f26f3f..5346e8d17bdc 100644
--- a/src/mscorlib/shared/System/Threading/SpinWait.cs
+++ b/src/mscorlib/shared/System/Threading/SpinWait.cs
@@ -69,9 +69,26 @@ public struct SpinWait
// numbers may seem fairly arbitrary, but were derived with at least some
// thought in the design document. I fully expect they will need to change
// over time as we gain more experience with performance.
- internal const int YIELD_THRESHOLD = 10; // When to switch over to a true yield.
- internal const int SLEEP_0_EVERY_HOW_MANY_TIMES = 5; // After how many yields should we Sleep(0)?
- internal const int SLEEP_1_EVERY_HOW_MANY_TIMES = 20; // After how many yields should we Sleep(1)?
+ internal const int YieldThreshold = 10; // When to switch over to a true yield.
+ private const int Sleep0EveryHowManyYields = 5; // After how many yields should we Sleep(0)?
+ internal const int DefaultSleep1Threshold = 20; // After how many yields should we Sleep(1) frequently?
+
+ ///
+ /// A suggested number of spin iterations before doing a proper wait, such as waiting on an event that becomes signaled
+ /// when the resource becomes available.
+ ///
+ ///
+ /// These numbers were arrived at by experimenting with different numbers in various cases that currently use it. It's
+ /// only a suggested value and typically works well when the proper wait is something like an event.
+ ///
+ /// Spinning less can lead to early waiting and more context switching, spinning more can decrease latency but may use
+ /// up some CPU time unnecessarily. Depends on the situation too, for instance SemaphoreSlim uses double this number
+ /// because the waiting there is currently a lot more expensive (involves more spinning, taking a lock, etc.). It also
+ /// depends on the likelihood of the spin being successful and how long the wait would be but those are not accounted
+ /// for here.
+ ///
+ internal static readonly int SpinCountforSpinBeforeWait = PlatformHelper.IsSingleProcessor ? 1 : 35;
+ internal const int Sleep1ThresholdForSpinBeforeWait = 40; // should be greater than SpinCountforSpinBeforeWait
// The number of times we've spun already.
private int _count;
@@ -81,7 +98,12 @@ public struct SpinWait
///
public int Count
{
- get { return _count; }
+ get => _count;
+ internal set
+ {
+ Debug.Assert(value >= 0);
+ _count = value;
+ }
}
///
@@ -94,10 +116,7 @@ public int Count
/// On a single-CPU machine, always yields the processor. On machines with
/// multiple CPUs, may yield after an unspecified number of calls.
///
- public bool NextSpinWillYield
- {
- get { return _count > YIELD_THRESHOLD || PlatformHelper.IsSingleProcessor; }
- }
+ public bool NextSpinWillYield => _count >= YieldThreshold || PlatformHelper.IsSingleProcessor;
///
/// Performs a single spin.
@@ -108,7 +127,27 @@ public bool NextSpinWillYield
///
public void SpinOnce()
{
- if (NextSpinWillYield)
+ SpinOnce(DefaultSleep1Threshold);
+ }
+
+ internal void SpinOnce(int sleep1Threshold)
+ {
+ Debug.Assert(sleep1Threshold >= YieldThreshold || PlatformHelper.IsSingleProcessor); // so that NextSpinWillYield behaves as requested
+
+ // (_count - YieldThreshold) % 2 == 0: The purpose of this check is to interleave Thread.Yield/Sleep(0) with
+ // Thread.SpinWait. Otherwise, the following issues occur:
+ // - When there are no threads to switch to, Yield and Sleep(0) become no-op and it turns the spin loop into a
+ // busy-spin that may quickly reach the max spin count and cause the thread to enter a wait state, or may
+ // just busy-spin for longer than desired before a Sleep(1). Completing the spin loop too early can cause
+ // excessive context switching if a wait follows, and entering the Sleep(1) stage too early can cause
+ // excessive delays.
+ // - If there are multiple threads doing Yield and Sleep(0) (typically from the same spin loop due to
+ // contention), they may switch between one another, delaying work that can make progress.
+ if ((
+ _count >= YieldThreshold &&
+ (_count >= sleep1Threshold || (_count - YieldThreshold) % 2 == 0)
+ ) ||
+ PlatformHelper.IsSingleProcessor)
{
//
// We must yield.
@@ -125,19 +164,21 @@ public void SpinOnce()
// configured to use the (default) coarse-grained system timer.
//
- int yieldsSoFar = (_count >= YIELD_THRESHOLD ? _count - YIELD_THRESHOLD : _count);
-
- if ((yieldsSoFar % SLEEP_1_EVERY_HOW_MANY_TIMES) == (SLEEP_1_EVERY_HOW_MANY_TIMES - 1))
+ if (_count >= sleep1Threshold)
{
RuntimeThread.Sleep(1);
}
- else if ((yieldsSoFar % SLEEP_0_EVERY_HOW_MANY_TIMES) == (SLEEP_0_EVERY_HOW_MANY_TIMES - 1))
- {
- RuntimeThread.Sleep(0);
- }
else
{
- RuntimeThread.Yield();
+ int yieldsSoFar = _count >= YieldThreshold ? (_count - YieldThreshold) / 2 : _count;
+ if ((yieldsSoFar % Sleep0EveryHowManyYields) == (Sleep0EveryHowManyYields - 1))
+ {
+ RuntimeThread.Sleep(0);
+ }
+ else
+ {
+ RuntimeThread.Yield();
+ }
}
}
else
@@ -153,11 +194,24 @@ public void SpinOnce()
// number of spins we are willing to tolerate to reduce delay to the caller,
// since we expect most callers will eventually block anyway.
//
- RuntimeThread.SpinWait(4 << _count);
+ // Also, cap the maximum spin count to a value such that many thousands of CPU cycles would not be wasted doing
+ // the equivalent of YieldProcessor(), as at that point SwitchToThread/Sleep(0) are more likely to be able to
+ // allow other useful work to run. Long YieldProcessor() loops can help to reduce contention, but Sleep(1) is
+ // usually better for that.
+ //
+ // RuntimeThread.OptimalMaxSpinWaitsPerSpinIteration:
+ // - See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
+ //
+ int n = RuntimeThread.OptimalMaxSpinWaitsPerSpinIteration;
+ if (_count <= 30 && (1 << _count) < n)
+ {
+ n = 1 << _count;
+ }
+ RuntimeThread.SpinWait(n);
}
// Finally, increment our spin counter.
- _count = (_count == int.MaxValue ? YIELD_THRESHOLD : _count + 1);
+ _count = (_count == int.MaxValue ? YieldThreshold : _count + 1);
}
///
@@ -299,9 +353,7 @@ internal static int ProcessorCount
///
/// Gets whether the current machine has only a single processor.
///
- internal static bool IsSingleProcessor
- {
- get { return ProcessorCount == 1; }
- }
+ /// This typically does not change on a machine, so it's checked only once.
+ internal static readonly bool IsSingleProcessor = ProcessorCount == 1;
}
}
diff --git a/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs b/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs
index 605f974da0c5..4c67ea3fd625 100644
--- a/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs
+++ b/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs
@@ -15,6 +15,8 @@ namespace Internal.Runtime.Augments
{
public class RuntimeThread : CriticalFinalizerObject
{
+ private static int s_optimalMaxSpinWaitsPerSpinIteration;
+
internal RuntimeThread() { }
public static RuntimeThread Create(ThreadStart start) => new Thread(start);
@@ -186,6 +188,33 @@ public void DisableComObjectEagerCleanup()
private extern bool JoinInternal(int millisecondsTimeout);
public static void Sleep(int millisecondsTimeout) => Thread.Sleep(millisecondsTimeout);
+
+ [DllImport(JitHelpers.QCall)]
+ [SuppressUnmanagedCodeSecurity]
+ private static extern int GetOptimalMaxSpinWaitsPerSpinIterationInternal();
+
+ ///
+ /// Max value to be passed into <see cref="SpinWait(int)"/> for optimal delaying. This value is normalized to be
+ /// appropriate for the processor.
+ ///
+ internal static int OptimalMaxSpinWaitsPerSpinIteration
+ {
+ get
+ {
+ if (s_optimalMaxSpinWaitsPerSpinIteration != 0)
+ {
+ return s_optimalMaxSpinWaitsPerSpinIteration;
+ }
+
+ // This is done lazily because the first call to the function below in the process triggers a measurement that
+ // takes a nontrivial amount of time. See Thread::InitializeYieldProcessorNormalized(), which describes and
+ // calculates this value.
+ s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal();
+ Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0);
+ return s_optimalMaxSpinWaitsPerSpinIteration;
+ }
+ }
+
public static void SpinWait(int iterations) => Thread.SpinWait(iterations);
public static bool Yield() => Thread.Yield();
diff --git a/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs b/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs
index e39696849989..8a245f060207 100644
--- a/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs
+++ b/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs
@@ -12,9 +12,6 @@
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-using System;
-using System.Threading;
-using System.Runtime.InteropServices;
using System.Diagnostics;
using System.Diagnostics.Contracts;
@@ -48,7 +45,6 @@ public class ManualResetEventSlim : IDisposable
{
// These are the default spin counts we use on single-proc and MP machines.
private const int DEFAULT_SPIN_SP = 1;
- private const int DEFAULT_SPIN_MP = SpinWait.YIELD_THRESHOLD;
private volatile object m_lock;
// A lock used for waiting and pulsing. Lazily initialized via EnsureLockObjectCreated()
@@ -193,7 +189,7 @@ public ManualResetEventSlim(bool initialState)
{
// Specify the defualt spin count, and use default spin if we're
// on a multi-processor machine. Otherwise, we won't.
- Initialize(initialState, DEFAULT_SPIN_MP);
+ Initialize(initialState, SpinWait.SpinCountforSpinBeforeWait);
}
///
@@ -563,44 +559,19 @@ public bool Wait(int millisecondsTimeout, CancellationToken cancellationToken)
bNeedTimeoutAdjustment = true;
}
- //spin
- int HOW_MANY_SPIN_BEFORE_YIELD = 10;
- int HOW_MANY_YIELD_EVERY_SLEEP_0 = 5;
- int HOW_MANY_YIELD_EVERY_SLEEP_1 = 20;
-
+ // Spin
int spinCount = SpinCount;
- for (int i = 0; i < spinCount; i++)
+ var spinner = new SpinWait();
+ while (spinner.Count < spinCount)
{
+ spinner.SpinOnce(SpinWait.Sleep1ThresholdForSpinBeforeWait);
+
if (IsSet)
{
return true;
}
- else if (i < HOW_MANY_SPIN_BEFORE_YIELD)
- {
- if (i == HOW_MANY_SPIN_BEFORE_YIELD / 2)
- {
- Thread.Yield();
- }
- else
- {
- Thread.SpinWait(4 << i);
- }
- }
- else if (i % HOW_MANY_YIELD_EVERY_SLEEP_1 == 0)
- {
- Thread.Sleep(1);
- }
- else if (i % HOW_MANY_YIELD_EVERY_SLEEP_0 == 0)
- {
- Thread.Sleep(0);
- }
- else
- {
- Thread.Yield();
- }
-
- if (i >= 100 && i % 10 == 0) // check the cancellation token if the user passed a very large spin count
+ if (spinner.Count >= 100 && spinner.Count % 10 == 0) // check the cancellation token if the user passed a very large spin count
cancellationToken.ThrowIfCancellationRequested();
}
diff --git a/src/mscorlib/src/System/Threading/SemaphoreSlim.cs b/src/mscorlib/src/System/Threading/SemaphoreSlim.cs
index e00a106eb34a..972b21adae3b 100644
--- a/src/mscorlib/src/System/Threading/SemaphoreSlim.cs
+++ b/src/mscorlib/src/System/Threading/SemaphoreSlim.cs
@@ -342,15 +342,28 @@ public bool Wait(int millisecondsTimeout, CancellationToken cancellationToken)
CancellationTokenRegistration cancellationTokenRegistration = cancellationToken.InternalRegisterWithoutEC(s_cancellationTokenCanceledEventHandler, this);
try
{
- // Perf: first spin wait for the count to be positive, but only up to the first planned yield.
+ // Perf: first spin wait for the count to be positive.
// This additional amount of spinwaiting in addition
// to Monitor.Enter()’s spinwaiting has shown measurable perf gains in test scenarios.
//
+
+ // Monitor.Enter followed by Monitor.Wait is much more expensive than waiting on an event as it involves another
+ // spin, contention, etc. The usual number of spin iterations that would otherwise be used here is doubled to
+ // lessen that extra expense of doing a proper wait.
+ int spinCount = SpinWait.SpinCountforSpinBeforeWait * 2;
+ int sleep1Threshold = SpinWait.Sleep1ThresholdForSpinBeforeWait * 2;
+
SpinWait spin = new SpinWait();
- while (m_currentCount == 0 && !spin.NextSpinWillYield)
+ while (spin.Count < spinCount) // bounded spin: after spinCount iterations, stop and fall through to the Monitor-based wait
{
- spin.SpinOnce();
+ spin.SpinOnce(sleep1Threshold);
+
+ if (m_currentCount != 0)
+ {
+ break;
+ }
}
+
// entering the lock and incrementing waiters must not suffer a thread-abort, else we cannot
// clean up m_waitCount correctly, which may lead to deadlock due to non-woken waiters.
try { }
diff --git a/src/mscorlib/src/System/Threading/SpinLock.cs b/src/mscorlib/src/System/Threading/SpinLock.cs
index eee73ce2bf48..dbf2024e5dc9 100644
--- a/src/mscorlib/src/System/Threading/SpinLock.cs
+++ b/src/mscorlib/src/System/Threading/SpinLock.cs
@@ -65,16 +65,9 @@ public struct SpinLock
private volatile int m_owner;
- // The multiplier factor for the each spinning iteration
- // This number has been chosen after trying different numbers on different CPUs (4, 8 and 16 ) and this provided the best results
- private const int SPINNING_FACTOR = 100;
-
// After how many yields, call Sleep(1)
private const int SLEEP_ONE_FREQUENCY = 40;
- // After how many yields, call Sleep(0)
- private const int SLEEP_ZERO_FREQUENCY = 10;
-
// After how many yields, check the timeout
private const int TIMEOUT_CHECK_FREQUENCY = 10;
@@ -347,48 +340,24 @@ private void ContinueTryEnter(int millisecondsTimeout, ref bool lockTaken)
else //failed to acquire the lock,then try to update the waiters. If the waiters count reached the maximum, jsut break the loop to avoid overflow
{
if ((observedOwner & WAITERS_MASK) != MAXIMUM_WAITERS)
+ {
+ // This can still overflow, but maybe there will never be that many waiters
turn = (Interlocked.Add(ref m_owner, 2) & WAITERS_MASK) >> 1;
+ }
}
- //***Step 2. Spinning
//lock acquired failed and waiters updated
- int processorCount = PlatformHelper.ProcessorCount;
- if (turn < processorCount)
- {
- int processFactor = 1;
- for (int i = 1; i <= turn * SPINNING_FACTOR; i++)
- {
- Thread.SpinWait((turn + i) * SPINNING_FACTOR * processFactor);
- if (processFactor < processorCount)
- processFactor++;
- observedOwner = m_owner;
- if ((observedOwner & LOCK_ANONYMOUS_OWNED) == LOCK_UNOWNED)
- {
- int newOwner = (observedOwner & WAITERS_MASK) == 0 ? // Gets the number of waiters, if zero
- observedOwner | 1 // don't decrement it. just set the lock bit, it is zzero because a previous call of Exit(false) ehich corrupted the waiters
- : (observedOwner - 2) | 1; // otherwise decrement the waiters and set the lock bit
- Debug.Assert((newOwner & WAITERS_MASK) >= 0);
-
- if (CompareExchange(ref m_owner, newOwner, observedOwner, ref lockTaken) == observedOwner)
- {
- return;
- }
- }
- }
- // Check the timeout.
- if (millisecondsTimeout != Timeout.Infinite && TimeoutHelper.UpdateTimeOut(startTime, millisecondsTimeout) <= 0)
- {
- DecrementWaiters();
- return;
- }
+ //*** Step 2, Spinning and Yielding
+ var spinner = new SpinWait();
+ if (turn > PlatformHelper.ProcessorCount)
+ {
+ spinner.Count = SpinWait.YieldThreshold;
}
-
- //*** Step 3, Yielding
- //Sleep(1) every 50 yields
- int yieldsoFar = 0;
while (true)
{
+ spinner.SpinOnce(SLEEP_ONE_FREQUENCY);
+
observedOwner = m_owner;
if ((observedOwner & LOCK_ANONYMOUS_OWNED) == LOCK_UNOWNED)
{
@@ -403,20 +372,7 @@ private void ContinueTryEnter(int millisecondsTimeout, ref bool lockTaken)
}
}
- if (yieldsoFar % SLEEP_ONE_FREQUENCY == 0)
- {
- Thread.Sleep(1);
- }
- else if (yieldsoFar % SLEEP_ZERO_FREQUENCY == 0)
- {
- Thread.Sleep(0);
- }
- else
- {
- Thread.Yield();
- }
-
- if (yieldsoFar % TIMEOUT_CHECK_FREQUENCY == 0)
+ if (spinner.Count % TIMEOUT_CHECK_FREQUENCY == 0)
{
//Check the timeout.
if (millisecondsTimeout != Timeout.Infinite && TimeoutHelper.UpdateTimeOut(startTime, millisecondsTimeout) <= 0)
@@ -425,8 +381,6 @@ private void ContinueTryEnter(int millisecondsTimeout, ref bool lockTaken)
return;
}
}
-
- yieldsoFar++;
}
}
diff --git a/src/mscorlib/src/System/Threading/Tasks/Task.cs b/src/mscorlib/src/System/Threading/Tasks/Task.cs
index 8e848842ee5e..4882ebf5a87f 100644
--- a/src/mscorlib/src/System/Threading/Tasks/Task.cs
+++ b/src/mscorlib/src/System/Threading/Tasks/Task.cs
@@ -10,19 +10,14 @@
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
-using System.Runtime;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-using System.Runtime.ExceptionServices;
-using System.Security;
-using System.Threading;
using System.Diagnostics;
using System.Diagnostics.Contracts;
-using Microsoft.Win32;
using System.Diagnostics.Tracing;
+using System.Runtime.CompilerServices;
+using System.Runtime.ExceptionServices;
+using Internal.Runtime.Augments;
// Disable the "reference to volatile field not treated as volatile" error.
#pragma warning disable 0420
@@ -2971,26 +2966,19 @@ private bool SpinWait(int millisecondsTimeout)
return false;
}
- //This code is pretty similar to the custom spinning in MRES except there is no yieling after we exceed the spin count
- int spinCount = PlatformHelper.IsSingleProcessor ? 1 : System.Threading.SpinWait.YIELD_THRESHOLD; //spin only once if we are running on a single CPU
- for (int i = 0; i < spinCount; i++)
+ int spinCount = Threading.SpinWait.SpinCountforSpinBeforeWait;
+ var spinner = new SpinWait();
+ while (spinner.Count < spinCount)
{
+ spinner.SpinOnce(Threading.SpinWait.Sleep1ThresholdForSpinBeforeWait);
+
if (IsCompleted)
{
return true;
}
-
- if (i == spinCount / 2)
- {
- Thread.Yield();
- }
- else
- {
- Thread.SpinWait(4 << i);
- }
}
- return IsCompleted;
+ return false;
}
///
@@ -3227,7 +3215,7 @@ private void RunContinuations(object continuationObject) // separated out of Fin
// Skip synchronous execution of continuations if this task's thread was aborted
bool bCanInlineContinuations = !(((m_stateFlags & TASK_STATE_THREAD_WAS_ABORTED) != 0) ||
- (Thread.CurrentThread.ThreadState == ThreadState.AbortRequested) ||
+ (RuntimeThread.CurrentThread.ThreadState == ThreadState.AbortRequested) ||
((m_stateFlags & (int)TaskCreationOptions.RunContinuationsAsynchronously) != 0));
// Handle the single-Action case
diff --git a/src/vm/comsynchronizable.cpp b/src/vm/comsynchronizable.cpp
index 0554fe338593..8fce346142c4 100644
--- a/src/vm/comsynchronizable.cpp
+++ b/src/vm/comsynchronizable.cpp
@@ -1624,22 +1624,41 @@ FCIMPL1(FC_BOOL_RET, ThreadNative::IsThreadpoolThread, ThreadBaseObject* thread)
}
FCIMPLEND
+INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration()
+{
+ QCALL_CONTRACT;
+
+ INT32 optimalMaxNormalizedYieldsPerSpinIteration;
+
+ BEGIN_QCALL;
+
+ Thread::EnsureYieldProcessorNormalizedInitialized();
+ optimalMaxNormalizedYieldsPerSpinIteration = Thread::GetOptimalMaxNormalizedYieldsPerSpinIteration();
+
+ END_QCALL;
+
+ return optimalMaxNormalizedYieldsPerSpinIteration;
+}
FCIMPL1(void, ThreadNative::SpinWait, int iterations)
{
FCALL_CONTRACT;
+ if (iterations <= 0)
+ {
+ return;
+ }
+
//
// If we're not going to spin for long, it's ok to remain in cooperative mode.
// The threshold is determined by the cost of entering preemptive mode; if we're
// spinning for less than that number of cycles, then switching to preemptive
- // mode won't help a GC start any faster. That number is right around 1000000
- // on my machine.
+ // mode won't help a GC start any faster.
//
- if (iterations <= 1000000)
+ if (iterations <= 100000 && Thread::IsYieldProcessorNormalizedInitialized())
{
- for(int i = 0; i < iterations; i++)
- YieldProcessor();
+ for (int i = 0; i < iterations; i++)
+ Thread::YieldProcessorNormalized();
return;
}
@@ -1649,8 +1668,9 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations)
HELPER_METHOD_FRAME_BEGIN_NOPOLL();
GCX_PREEMP();
- for(int i = 0; i < iterations; i++)
- YieldProcessor();
+ Thread::EnsureYieldProcessorNormalizedInitialized();
+ for (int i = 0; i < iterations; i++)
+ Thread::YieldProcessorNormalized();
HELPER_METHOD_FRAME_END();
}
diff --git a/src/vm/comsynchronizable.h b/src/vm/comsynchronizable.h
index 00b055c96070..b280c605b89f 100644
--- a/src/vm/comsynchronizable.h
+++ b/src/vm/comsynchronizable.h
@@ -97,6 +97,7 @@ friend class ThreadBaseObject;
UINT64 QCALLTYPE GetProcessDefaultStackSize();
static FCDECL1(INT32, GetManagedThreadId, ThreadBaseObject* th);
+ static INT32 QCALLTYPE GetOptimalMaxSpinWaitsPerSpinIteration();
static FCDECL1(void, SpinWait, int iterations);
static BOOL QCALLTYPE YieldThread();
static FCDECL0(Object*, GetCurrentThread);
diff --git a/src/vm/ecalllist.h b/src/vm/ecalllist.h
index 876bafc47b3e..f27e2205fe27 100644
--- a/src/vm/ecalllist.h
+++ b/src/vm/ecalllist.h
@@ -709,6 +709,7 @@ FCFuncStart(gRuntimeThreadFuncs)
#endif // FEATURE_COMINTEROP
FCFuncElement("InterruptInternal", ThreadNative::Interrupt)
FCFuncElement("JoinInternal", ThreadNative::Join)
+ QCFuncElement("GetOptimalMaxSpinWaitsPerSpinIterationInternal", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
FCFuncEnd()
FCFuncStart(gThreadFuncs)
diff --git a/src/vm/threads.cpp b/src/vm/threads.cpp
index b827140dd49b..abc544338b7c 100644
--- a/src/vm/threads.cpp
+++ b/src/vm/threads.cpp
@@ -11744,3 +11744,87 @@ ULONGLONG Thread::QueryThreadProcessorUsage()
return ullCurrentUsage - ullPreviousUsage;
}
#endif // FEATURE_APPDOMAIN_RESOURCE_MONITORING
+
+int Thread::s_yieldsPerNormalizedYield = 0;
+int Thread::s_optimalMaxNormalizedYieldsPerSpinIteration = 0;
+
+static Crst s_initializeYieldProcessorNormalizedCrst(CrstLeafLock);
+void Thread::InitializeYieldProcessorNormalized()
+{
+ LIMITED_METHOD_CONTRACT;
+
+ CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
+
+ if (IsYieldProcessorNormalizedInitialized())
+ {
+ return;
+ }
+
+ // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
+ // Intel post-Skylake processor: measured typically 125-150 cycles per yield
+ const int DefaultYieldsPerNormalizedYield = 1; // defaults are for when no measurement is done
+ const int DefaultOptimalMaxNormalizedYieldsPerSpinIteration = 64; // tuned for pre-Skylake processors, for post-Skylake it should be 7
+ const int MeasureDurationMs = 10;
+ const int MaxYieldsPerNormalizedYield = 10; // measured typically 8-9 on pre-Skylake
+ const int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
+ const int NsPerOptimialMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
+ const int NsPerSecond = 1000 * 1000 * 1000;
+
+ LARGE_INTEGER li;
+ if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs)
+ {
+ // High precision clock not available or clock resolution is too low, resort to defaults
+ s_yieldsPerNormalizedYield = DefaultYieldsPerNormalizedYield;
+ s_optimalMaxNormalizedYieldsPerSpinIteration = DefaultOptimalMaxNormalizedYieldsPerSpinIteration;
+ return;
+ }
+ ULONGLONG ticksPerSecond = li.QuadPart;
+
+ // Measure the nanosecond delay per yield
+ ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
+ unsigned int yieldCount = 0;
+ QueryPerformanceCounter(&li);
+ ULONGLONG startTicks = li.QuadPart;
+ ULONGLONG elapsedTicks;
+ do
+ {
+ for (int i = 0; i < 10; ++i)
+ {
+ YieldProcessor();
+ }
+ yieldCount += 10;
+
+ QueryPerformanceCounter(&li);
+ ULONGLONG nowTicks = li.QuadPart;
+ elapsedTicks = nowTicks - startTicks;
+ } while (elapsedTicks < measureDurationTicks);
+ double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
+ if (nsPerYield < 1)
+ {
+ nsPerYield = 1;
+ }
+
+ // Calculate the number of yields required to span the duration of a normalized yield
+ int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
+ if (yieldsPerNormalizedYield < 1)
+ {
+ yieldsPerNormalizedYield = 1;
+ }
+ else if (yieldsPerNormalizedYield > MaxYieldsPerNormalizedYield)
+ {
+ yieldsPerNormalizedYield = MaxYieldsPerNormalizedYield;
+ }
+
+ // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
+ // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
+ // better job of allowing other work to run.
+ int optimalMaxNormalizedYieldsPerSpinIteration =
+ (int)(NsPerOptimialMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
+ if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
+ {
+ optimalMaxNormalizedYieldsPerSpinIteration = 1;
+ }
+
+ s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
+ s_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
+}
diff --git a/src/vm/threads.h b/src/vm/threads.h
index ad433e765b39..be36fe624e80 100644
--- a/src/vm/threads.h
+++ b/src/vm/threads.h
@@ -5362,6 +5362,70 @@ class Thread: public IUnknown
m_HijackReturnKind = returnKind;
}
#endif // FEATURE_HIJACK
+
+private:
+ static int s_yieldsPerNormalizedYield;
+ static int s_optimalMaxNormalizedYieldsPerSpinIteration;
+
+private:
+ static void InitializeYieldProcessorNormalized();
+
+public:
+ static bool IsYieldProcessorNormalizedInitialized()
+ {
+ LIMITED_METHOD_CONTRACT;
+ return s_yieldsPerNormalizedYield != 0 && s_optimalMaxNormalizedYieldsPerSpinIteration != 0;
+ }
+
+public:
+ static void EnsureYieldProcessorNormalizedInitialized()
+ {
+ LIMITED_METHOD_CONTRACT;
+
+ if (!IsYieldProcessorNormalizedInitialized())
+ {
+ InitializeYieldProcessorNormalized();
+ }
+ }
+
+public:
+ static int GetOptimalMaxNormalizedYieldsPerSpinIteration()
+ {
+ WRAPPER_NO_CONTRACT;
+ _ASSERTE(IsYieldProcessorNormalizedInitialized());
+
+ return s_optimalMaxNormalizedYieldsPerSpinIteration;
+ }
+
+public:
+ static void YieldProcessorNormalized()
+ {
+ WRAPPER_NO_CONTRACT;
+ _ASSERTE(IsYieldProcessorNormalizedInitialized());
+
+ int n = s_yieldsPerNormalizedYield;
+ while (--n >= 0)
+ {
+ YieldProcessor();
+ }
+ }
+
+ static void YieldProcessorNormalizedWithBackOff(unsigned int spinIteration)
+ {
+ WRAPPER_NO_CONTRACT;
+ _ASSERTE(IsYieldProcessorNormalizedInitialized());
+
+ int n = s_optimalMaxNormalizedYieldsPerSpinIteration;
+ if (spinIteration <= 30 && (1 << spinIteration) < n)
+ {
+ n = 1 << spinIteration;
+ }
+ n *= s_yieldsPerNormalizedYield;
+ while (--n >= 0)
+ {
+ YieldProcessor();
+ }
+ }
};
// End of class Thread