diff --git a/src/mscorlib/shared/System/Threading/SpinWait.cs b/src/mscorlib/shared/System/Threading/SpinWait.cs
index d25d54f26f3f..5346e8d17bdc 100644
--- a/src/mscorlib/shared/System/Threading/SpinWait.cs
+++ b/src/mscorlib/shared/System/Threading/SpinWait.cs
@@ -69,9 +69,26 @@ public struct SpinWait
         // numbers may seem fairly arbitrary, but were derived with at least some
         // thought in the design document.  I fully expect they will need to change
         // over time as we gain more experience with performance.
-        internal const int YIELD_THRESHOLD = 10; // When to switch over to a true yield.
-        internal const int SLEEP_0_EVERY_HOW_MANY_TIMES = 5; // After how many yields should we Sleep(0)?
-        internal const int SLEEP_1_EVERY_HOW_MANY_TIMES = 20; // After how many yields should we Sleep(1)?
+        internal const int YieldThreshold = 10; // When to switch over to a true yield.
+        private const int Sleep0EveryHowManyYields = 5; // After how many yields should we Sleep(0)?
+        internal const int DefaultSleep1Threshold = 20; // After how many yields should we Sleep(1) frequently?
+
+        /// <summary>
+        /// A suggested number of spin iterations before doing a proper wait, such as waiting on an event that becomes signaled
+        /// when the resource becomes available.
+        /// </summary>
+        /// <remarks>
+        /// These numbers were arrived at by experimenting with different numbers in various cases that currently use it. It's
+        /// only a suggested value and typically works well when the proper wait is something like an event.
+        /// 
+        /// Spinning less can lead to early waiting and more context switching, spinning more can decrease latency but may use
+        /// up some CPU time unnecessarily. Depends on the situation too, for instance SemaphoreSlim uses double this number
+        /// because the waiting there is currently a lot more expensive (involves more spinning, taking a lock, etc.). It also
+        /// depends on the likelihood of the spin being successful and how long the wait would be but those are not accounted
+        /// for here.
+        /// </remarks>
+        internal static readonly int SpinCountforSpinBeforeWait = PlatformHelper.IsSingleProcessor ? 1 : 35;
+        internal const int Sleep1ThresholdForSpinBeforeWait = 40; // should be greater than SpinCountforSpinBeforeWait
 
         // The number of times we've spun already.
         private int _count;
@@ -81,7 +98,12 @@ public struct SpinWait
         /// </summary>
         public int Count
         {
-            get { return _count; }
+            get => _count;
+            internal set
+            {
+                Debug.Assert(value >= 0);
+                _count = value;
+            }
         }
 
         /// <summary>
@@ -94,10 +116,7 @@ public int Count
         /// On a single-CPU machine, <see cref="SpinOnce"/> always yields the processor. On machines with
         /// multiple CPUs, <see cref="SpinOnce"/> may yield after an unspecified number of calls.
         /// </remarks>
-        public bool NextSpinWillYield
-        {
-            get { return _count > YIELD_THRESHOLD || PlatformHelper.IsSingleProcessor; }
-        }
+        public bool NextSpinWillYield => _count >= YieldThreshold || PlatformHelper.IsSingleProcessor;
 
         /// <summary>
         /// Performs a single spin.
@@ -108,7 +127,27 @@ public bool NextSpinWillYield
         /// </remarks>
         public void SpinOnce()
         {
-            if (NextSpinWillYield)
+            SpinOnce(DefaultSleep1Threshold);
+        }
+
+        internal void SpinOnce(int sleep1Threshold)
+        {
+            Debug.Assert(sleep1Threshold >= YieldThreshold || PlatformHelper.IsSingleProcessor); // so that NextSpinWillYield behaves as requested
+
+            // (_count - YieldThreshold) % 2 == 0: The purpose of this check is to interleave Thread.Yield/Sleep(0) with
+            // Thread.SpinWait. Otherwise, the following issues occur:
+            //   - When there are no threads to switch to, Yield and Sleep(0) become no-op and it turns the spin loop into a
+            //     busy-spin that may quickly reach the max spin count and cause the thread to enter a wait state, or may
+            //     just busy-spin for longer than desired before a Sleep(1). Completing the spin loop too early can cause
+            //     excessive context switching if a wait follows, and entering the Sleep(1) stage too early can cause
+            //     excessive delays.
+            //   - If there are multiple threads doing Yield and Sleep(0) (typically from the same spin loop due to
+            //     contention), they may switch between one another, delaying work that can make progress.
+            if ((
+                    _count >= YieldThreshold &&
+                    (_count >= sleep1Threshold || (_count - YieldThreshold) % 2 == 0)
+                ) ||
+                PlatformHelper.IsSingleProcessor)
             {
                 //
                 // We must yield.
@@ -125,19 +164,21 @@ public void SpinOnce()
                 // configured to use the (default) coarse-grained system timer.
                 //
 
-                int yieldsSoFar = (_count >= YIELD_THRESHOLD ? _count - YIELD_THRESHOLD : _count);
-
-                if ((yieldsSoFar % SLEEP_1_EVERY_HOW_MANY_TIMES) == (SLEEP_1_EVERY_HOW_MANY_TIMES - 1))
+                if (_count >= sleep1Threshold)
                 {
                     RuntimeThread.Sleep(1);
                 }
-                else if ((yieldsSoFar % SLEEP_0_EVERY_HOW_MANY_TIMES) == (SLEEP_0_EVERY_HOW_MANY_TIMES - 1))
-                {
-                    RuntimeThread.Sleep(0);
-                }
                 else
                 {
-                    RuntimeThread.Yield();
+                    int yieldsSoFar = _count >= YieldThreshold ? (_count - YieldThreshold) / 2 : _count;
+                    if ((yieldsSoFar % Sleep0EveryHowManyYields) == (Sleep0EveryHowManyYields - 1))
+                    {
+                        RuntimeThread.Sleep(0);
+                    }
+                    else
+                    {
+                        RuntimeThread.Yield();
+                    }
                 }
             }
             else
@@ -153,11 +194,24 @@ public void SpinOnce()
                 // number of spins we are willing to tolerate to reduce delay to the caller,
                 // since we expect most callers will eventually block anyway.
                 //
-                RuntimeThread.SpinWait(4 << _count);
+                // Also, cap the maximum spin count to a value such that many thousands of CPU cycles would not be wasted doing
+                // the equivalent of YieldProcessor(), as at that point SwitchToThread/Sleep(0) are more likely to be able to
+                // allow other useful work to run. Long YieldProcessor() loops can help to reduce contention, but Sleep(1) is
+                // usually better for that.
+                //
+                // RuntimeThread.OptimalMaxSpinWaitsPerSpinIteration:
+                //   - See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
+                //
+                int n = RuntimeThread.OptimalMaxSpinWaitsPerSpinIteration;
+                if (_count <= 30 && (1 << _count) < n)
+                {
+                    n = 1 << _count;
+                }
+                RuntimeThread.SpinWait(n);
             }
 
             // Finally, increment our spin counter.
-            _count = (_count == int.MaxValue ? YIELD_THRESHOLD : _count + 1);
+            _count = (_count == int.MaxValue ? YieldThreshold : _count + 1);
         }
 
         /// <summary>
@@ -299,9 +353,7 @@ internal static int ProcessorCount
         /// <summary>
         /// Gets whether the current machine has only a single processor.
         /// </summary>
-        internal static bool IsSingleProcessor
-        {
-            get { return ProcessorCount == 1; }
-        }
+        /// <remarks>This typically does not change on a machine, so it's checked only once.</remarks>
+        internal static readonly bool IsSingleProcessor = ProcessorCount == 1;
     }
 }
diff --git a/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs b/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs
index 605f974da0c5..4c67ea3fd625 100644
--- a/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs
+++ b/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs
@@ -15,6 +15,8 @@ namespace Internal.Runtime.Augments
 {
     public class RuntimeThread : CriticalFinalizerObject
     {
+        private static int s_optimalMaxSpinWaitsPerSpinIteration;
+
         internal RuntimeThread() { }
 
         public static RuntimeThread Create(ThreadStart start) => new Thread(start);
@@ -186,6 +188,33 @@ public void DisableComObjectEagerCleanup()
         private extern bool JoinInternal(int millisecondsTimeout);
 
         public static void Sleep(int millisecondsTimeout) => Thread.Sleep(millisecondsTimeout);
+
+        [DllImport(JitHelpers.QCall)]
+        [SuppressUnmanagedCodeSecurity]
+        private static extern int GetOptimalMaxSpinWaitsPerSpinIterationInternal();
+
+        /// <summary>
+        /// Max value to be passed into <see cref="SpinWait(int)"/> for optimal delaying. This value is normalized to be
+        /// appropriate for the processor.
+        /// </summary>
+        internal static int OptimalMaxSpinWaitsPerSpinIteration
+        {
+            get
+            {
+                if (s_optimalMaxSpinWaitsPerSpinIteration != 0)
+                {
+                    return s_optimalMaxSpinWaitsPerSpinIteration;
+                }
+
+                // This is done lazily because the first call to the function below in the process triggers a measurement that
+                // takes a nontrivial amount of time. See Thread::InitializeYieldProcessorNormalized(), which describes and
+                // calculates this value.
+                s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal();
+                Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0);
+                return s_optimalMaxSpinWaitsPerSpinIteration;
+            }
+        }
+
         public static void SpinWait(int iterations) => Thread.SpinWait(iterations);
         public static bool Yield() => Thread.Yield();
 
diff --git a/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs b/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs
index e39696849989..8a245f060207 100644
--- a/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs
+++ b/src/mscorlib/src/System/Threading/ManualResetEventSlim.cs
@@ -12,9 +12,6 @@
 //
 // =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
-using System;
-using System.Threading;
-using System.Runtime.InteropServices;
 using System.Diagnostics;
 using System.Diagnostics.Contracts;
 
@@ -48,7 +45,6 @@ public class ManualResetEventSlim : IDisposable
     {
         // These are the default spin counts we use on single-proc and MP machines.
         private const int DEFAULT_SPIN_SP = 1;
-        private const int DEFAULT_SPIN_MP = SpinWait.YIELD_THRESHOLD;
 
         private volatile object m_lock;
         // A lock used for waiting and pulsing. Lazily initialized via EnsureLockObjectCreated()
@@ -193,7 +189,7 @@ public ManualResetEventSlim(bool initialState)
         {
             // Specify the defualt spin count, and use default spin if we're
             // on a multi-processor machine. Otherwise, we won't.
-            Initialize(initialState, DEFAULT_SPIN_MP);
+            Initialize(initialState, SpinWait.SpinCountforSpinBeforeWait);
         }
 
         /// <summary>
@@ -563,44 +559,19 @@ public bool Wait(int millisecondsTimeout, CancellationToken cancellationToken)
                     bNeedTimeoutAdjustment = true;
                 }
 
-                //spin
-                int HOW_MANY_SPIN_BEFORE_YIELD = 10;
-                int HOW_MANY_YIELD_EVERY_SLEEP_0 = 5;
-                int HOW_MANY_YIELD_EVERY_SLEEP_1 = 20;
-
+                // Spin
                 int spinCount = SpinCount;
-                for (int i = 0; i < spinCount; i++)
+                var spinner = new SpinWait();
+                while (spinner.Count < spinCount)
                 {
+                    spinner.SpinOnce(SpinWait.Sleep1ThresholdForSpinBeforeWait);
+
                     if (IsSet)
                     {
                         return true;
                     }
 
-                    else if (i < HOW_MANY_SPIN_BEFORE_YIELD)
-                    {
-                        if (i == HOW_MANY_SPIN_BEFORE_YIELD / 2)
-                        {
-                            Thread.Yield();
-                        }
-                        else
-                        {
-                            Thread.SpinWait(4 << i);
-                        }
-                    }
-                    else if (i % HOW_MANY_YIELD_EVERY_SLEEP_1 == 0)
-                    {
-                        Thread.Sleep(1);
-                    }
-                    else if (i % HOW_MANY_YIELD_EVERY_SLEEP_0 == 0)
-                    {
-                        Thread.Sleep(0);
-                    }
-                    else
-                    {
-                        Thread.Yield();
-                    }
-
-                    if (i >= 100 && i % 10 == 0) // check the cancellation token if the user passed a very large spin count
+                    if (spinner.Count >= 100 && spinner.Count % 10 == 0) // check the cancellation token if the user passed a very large spin count
                         cancellationToken.ThrowIfCancellationRequested();
                 }
 
diff --git a/src/mscorlib/src/System/Threading/SemaphoreSlim.cs b/src/mscorlib/src/System/Threading/SemaphoreSlim.cs
index e00a106eb34a..972b21adae3b 100644
--- a/src/mscorlib/src/System/Threading/SemaphoreSlim.cs
+++ b/src/mscorlib/src/System/Threading/SemaphoreSlim.cs
@@ -342,15 +342,28 @@ public bool Wait(int millisecondsTimeout, CancellationToken cancellationToken)
             CancellationTokenRegistration cancellationTokenRegistration = cancellationToken.InternalRegisterWithoutEC(s_cancellationTokenCanceledEventHandler, this);
             try
             {
-                // Perf: first spin wait for the count to be positive, but only up to the first planned yield.
+                // Perf: first spin wait for the count to be positive.
                 //       This additional amount of spinwaiting in addition
                 //       to Monitor.Enter()’s spinwaiting has shown measurable perf gains in test scenarios.
                 //
+
+                // Monitor.Enter followed by Monitor.Wait is much more expensive than waiting on an event as it involves another
+                // spin, contention, etc. The usual number of spin iterations that would otherwise be used here is doubled to
+                // lessen that extra expense of doing a proper wait.
+                int spinCount = SpinWait.SpinCountforSpinBeforeWait * 2;
+                int sleep1Threshold = SpinWait.Sleep1ThresholdForSpinBeforeWait * 2;
+
                 SpinWait spin = new SpinWait();
-                while (m_currentCount == 0 && !spin.NextSpinWillYield)
+                while (spin.Count < spinCount)
                 {
-                    spin.SpinOnce();
+                    spin.SpinOnce(sleep1Threshold);
+                    // Stop early once a count is available; otherwise the loop ends after spinCount iterations.
+                    if (m_currentCount != 0)
+                    {
+                        break;
+                    }
                 }
+
                 // entering the lock and incrementing waiters must not suffer a thread-abort, else we cannot
                 // clean up m_waitCount correctly, which may lead to deadlock due to non-woken waiters.
                 try { }
diff --git a/src/mscorlib/src/System/Threading/SpinLock.cs b/src/mscorlib/src/System/Threading/SpinLock.cs
index eee73ce2bf48..dbf2024e5dc9 100644
--- a/src/mscorlib/src/System/Threading/SpinLock.cs
+++ b/src/mscorlib/src/System/Threading/SpinLock.cs
@@ -65,16 +65,9 @@ public struct SpinLock
 
         private volatile int m_owner;
 
-        // The multiplier factor for the each spinning iteration
-        // This number has been chosen after trying different numbers on different CPUs (4, 8 and 16 ) and this provided the best results
-        private const int SPINNING_FACTOR = 100;
-
         // After how many yields, call Sleep(1)
         private const int SLEEP_ONE_FREQUENCY = 40;
 
-        // After how many yields, call Sleep(0)
-        private const int SLEEP_ZERO_FREQUENCY = 10;
-
         // After how many yields, check the timeout
         private const int TIMEOUT_CHECK_FREQUENCY = 10;
 
@@ -347,48 +340,24 @@ private void ContinueTryEnter(int millisecondsTimeout, ref bool lockTaken)
             else //failed to acquire the lock,then try to update the waiters. If the waiters count reached the maximum, jsut break the loop to avoid overflow
             {
                 if ((observedOwner & WAITERS_MASK) != MAXIMUM_WAITERS)
+                {
+                    // This can still overflow, but maybe there will never be that many waiters
                     turn = (Interlocked.Add(ref m_owner, 2) & WAITERS_MASK) >> 1;
+                }
             }
 
-            //***Step 2. Spinning
             //lock acquired failed and waiters updated
-            int processorCount = PlatformHelper.ProcessorCount;
-            if (turn < processorCount)
-            {
-                int processFactor = 1;
-                for (int i = 1; i <= turn * SPINNING_FACTOR; i++)
-                {
-                    Thread.SpinWait((turn + i) * SPINNING_FACTOR * processFactor);
-                    if (processFactor < processorCount)
-                        processFactor++;
-                    observedOwner = m_owner;
-                    if ((observedOwner & LOCK_ANONYMOUS_OWNED) == LOCK_UNOWNED)
-                    {
-                        int newOwner = (observedOwner & WAITERS_MASK) == 0 ? // Gets the number of waiters, if zero
-                            observedOwner | 1 // don't decrement it. just set the lock bit, it is zzero because a previous call of Exit(false) ehich corrupted the waiters
-                            : (observedOwner - 2) | 1; // otherwise decrement the waiters and set the lock bit
-                        Debug.Assert((newOwner & WAITERS_MASK) >= 0);
-
-                        if (CompareExchange(ref m_owner, newOwner, observedOwner, ref lockTaken) == observedOwner)
-                        {
-                            return;
-                        }
-                    }
-                }
 
-                // Check the timeout.
-                if (millisecondsTimeout != Timeout.Infinite && TimeoutHelper.UpdateTimeOut(startTime, millisecondsTimeout) <= 0)
-                {
-                    DecrementWaiters();
-                    return;
-                }
+            //*** Step 2, Spinning and Yielding
+            var spinner = new SpinWait();
+            if (turn > PlatformHelper.ProcessorCount)
+            {
+                spinner.Count = SpinWait.YieldThreshold;
             }
-
-            //*** Step 3, Yielding
-            //Sleep(1) every 50 yields
-            int yieldsoFar = 0;
             while (true)
             {
+                spinner.SpinOnce(SLEEP_ONE_FREQUENCY);
+
                 observedOwner = m_owner;
                 if ((observedOwner & LOCK_ANONYMOUS_OWNED) == LOCK_UNOWNED)
                 {
@@ -403,20 +372,7 @@ private void ContinueTryEnter(int millisecondsTimeout, ref bool lockTaken)
                     }
                 }
 
-                if (yieldsoFar % SLEEP_ONE_FREQUENCY == 0)
-                {
-                    Thread.Sleep(1);
-                }
-                else if (yieldsoFar % SLEEP_ZERO_FREQUENCY == 0)
-                {
-                    Thread.Sleep(0);
-                }
-                else
-                {
-                    Thread.Yield();
-                }
-
-                if (yieldsoFar % TIMEOUT_CHECK_FREQUENCY == 0)
+                if (spinner.Count % TIMEOUT_CHECK_FREQUENCY == 0)
                 {
                     //Check the timeout.
                     if (millisecondsTimeout != Timeout.Infinite && TimeoutHelper.UpdateTimeOut(startTime, millisecondsTimeout) <= 0)
@@ -425,8 +381,6 @@ private void ContinueTryEnter(int millisecondsTimeout, ref bool lockTaken)
                         return;
                     }
                 }
-
-                yieldsoFar++;
             }
         }
 
diff --git a/src/mscorlib/src/System/Threading/Tasks/Task.cs b/src/mscorlib/src/System/Threading/Tasks/Task.cs
index 8e848842ee5e..4882ebf5a87f 100644
--- a/src/mscorlib/src/System/Threading/Tasks/Task.cs
+++ b/src/mscorlib/src/System/Threading/Tasks/Task.cs
@@ -10,19 +10,14 @@
 //
 // =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
-using System;
 using System.Collections.Generic;
 using System.Collections.ObjectModel;
-using System.Runtime;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-using System.Runtime.ExceptionServices;
-using System.Security;
-using System.Threading;
 using System.Diagnostics;
 using System.Diagnostics.Contracts;
-using Microsoft.Win32;
 using System.Diagnostics.Tracing;
+using System.Runtime.CompilerServices;
+using System.Runtime.ExceptionServices;
+using Internal.Runtime.Augments;
 
 // Disable the "reference to volatile field not treated as volatile" error.
 #pragma warning disable 0420
@@ -2971,26 +2966,19 @@ private bool SpinWait(int millisecondsTimeout)
                 return false;
             }
 
-            //This code is pretty similar to the custom spinning in MRES except there is no yieling after we exceed the spin count
-            int spinCount = PlatformHelper.IsSingleProcessor ? 1 : System.Threading.SpinWait.YIELD_THRESHOLD; //spin only once if we are running on a single CPU
-            for (int i = 0; i < spinCount; i++)
+            int spinCount = Threading.SpinWait.SpinCountforSpinBeforeWait;
+            var spinner = new SpinWait();
+            while (spinner.Count < spinCount)
             {
+                spinner.SpinOnce(Threading.SpinWait.Sleep1ThresholdForSpinBeforeWait);
+
                 if (IsCompleted)
                 {
                     return true;
                 }
-
-                if (i == spinCount / 2)
-                {
-                    Thread.Yield();
-                }
-                else
-                {
-                    Thread.SpinWait(4 << i);
-                }
             }
 
-            return IsCompleted;
+            return false;
         }
 
         /// <summary>
@@ -3227,7 +3215,7 @@ private void RunContinuations(object continuationObject) // separated out of Fin
 
             // Skip synchronous execution of continuations if this task's thread was aborted
             bool bCanInlineContinuations = !(((m_stateFlags & TASK_STATE_THREAD_WAS_ABORTED) != 0) ||
-                                              (Thread.CurrentThread.ThreadState == ThreadState.AbortRequested) ||
+                                              (RuntimeThread.CurrentThread.ThreadState == ThreadState.AbortRequested) ||
                                               ((m_stateFlags & (int)TaskCreationOptions.RunContinuationsAsynchronously) != 0));
 
             // Handle the single-Action case
diff --git a/src/vm/comsynchronizable.cpp b/src/vm/comsynchronizable.cpp
index 0554fe338593..8fce346142c4 100644
--- a/src/vm/comsynchronizable.cpp
+++ b/src/vm/comsynchronizable.cpp
@@ -1624,22 +1624,41 @@ FCIMPL1(FC_BOOL_RET, ThreadNative::IsThreadpoolThread, ThreadBaseObject* thread)
 }
 FCIMPLEND
 
+INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration()
+{
+    QCALL_CONTRACT;
+
+    INT32 optimalMaxNormalizedYieldsPerSpinIteration;
+
+    BEGIN_QCALL;
+
+    Thread::EnsureYieldProcessorNormalizedInitialized();
+    optimalMaxNormalizedYieldsPerSpinIteration = Thread::GetOptimalMaxNormalizedYieldsPerSpinIteration();
+
+    END_QCALL;
+
+    return optimalMaxNormalizedYieldsPerSpinIteration;
+}
 
 FCIMPL1(void, ThreadNative::SpinWait, int iterations)
 {
     FCALL_CONTRACT;
 
+    if (iterations <= 0)
+    {
+        return;
+    }
+
     //
     // If we're not going to spin for long, it's ok to remain in cooperative mode.
     // The threshold is determined by the cost of entering preemptive mode; if we're
     // spinning for less than that number of cycles, then switching to preemptive
-    // mode won't help a GC start any faster.  That number is right around 1000000 
-    // on my machine.
+    // mode won't help a GC start any faster.
     //
-    if (iterations <= 1000000)
+    if (iterations <= 100000 && Thread::IsYieldProcessorNormalizedInitialized())
     {
-        for(int i = 0; i < iterations; i++)
-            YieldProcessor();
+        for (int i = 0; i < iterations; i++)
+            Thread::YieldProcessorNormalized();
         return;
     }
 
@@ -1649,8 +1668,9 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations)
     HELPER_METHOD_FRAME_BEGIN_NOPOLL();
     GCX_PREEMP();
 
-    for(int i = 0; i < iterations; i++)
-        YieldProcessor();
+    Thread::EnsureYieldProcessorNormalizedInitialized();
+    for (int i = 0; i < iterations; i++)
+        Thread::YieldProcessorNormalized();
 
     HELPER_METHOD_FRAME_END();
 }
diff --git a/src/vm/comsynchronizable.h b/src/vm/comsynchronizable.h
index 00b055c96070..b280c605b89f 100644
--- a/src/vm/comsynchronizable.h
+++ b/src/vm/comsynchronizable.h
@@ -97,6 +97,7 @@ friend class ThreadBaseObject;
     UINT64 QCALLTYPE GetProcessDefaultStackSize();
 
     static FCDECL1(INT32,   GetManagedThreadId, ThreadBaseObject* th);
+    static INT32 QCALLTYPE GetOptimalMaxSpinWaitsPerSpinIteration();
     static FCDECL1(void,    SpinWait,                       int iterations);
     static BOOL QCALLTYPE YieldThread();
     static FCDECL0(Object*, GetCurrentThread);
diff --git a/src/vm/ecalllist.h b/src/vm/ecalllist.h
index 876bafc47b3e..f27e2205fe27 100644
--- a/src/vm/ecalllist.h
+++ b/src/vm/ecalllist.h
@@ -709,6 +709,7 @@ FCFuncStart(gRuntimeThreadFuncs)
 #endif // FEATURE_COMINTEROP
     FCFuncElement("InterruptInternal", ThreadNative::Interrupt)
     FCFuncElement("JoinInternal", ThreadNative::Join)
+    QCFuncElement("GetOptimalMaxSpinWaitsPerSpinIterationInternal", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
 FCFuncEnd()
 
 FCFuncStart(gThreadFuncs)
diff --git a/src/vm/threads.cpp b/src/vm/threads.cpp
index b827140dd49b..abc544338b7c 100644
--- a/src/vm/threads.cpp
+++ b/src/vm/threads.cpp
@@ -11744,3 +11744,87 @@ ULONGLONG Thread::QueryThreadProcessorUsage()
     return ullCurrentUsage - ullPreviousUsage;
 }
 #endif // FEATURE_APPDOMAIN_RESOURCE_MONITORING
+
+int Thread::s_yieldsPerNormalizedYield = 0;
+int Thread::s_optimalMaxNormalizedYieldsPerSpinIteration = 0;
+
+static Crst s_initializeYieldProcessorNormalizedCrst(CrstLeafLock);
+void Thread::InitializeYieldProcessorNormalized()
+{
+    LIMITED_METHOD_CONTRACT;
+
+    CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
+
+    if (IsYieldProcessorNormalizedInitialized())
+    {
+        return;
+    }
+
+    // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
+    // Intel post-Skylake processor: measured typically 125-150 cycles per yield
+    const int DefaultYieldsPerNormalizedYield = 1; // defaults are for when no measurement is done
+    const int DefaultOptimalMaxNormalizedYieldsPerSpinIteration = 64; // tuned for pre-Skylake processors, for post-Skylake it should be 7
+    const int MeasureDurationMs = 10;
+    const int MaxYieldsPerNormalizedYield = 10; // measured typically 8-9 on pre-Skylake
+    const int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
+    const int NsPerOptimialMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
+    const int NsPerSecond = 1000 * 1000 * 1000;
+
+    LARGE_INTEGER li;
+    if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs)
+    {
+        // High precision clock not available or clock resolution is too low, resort to defaults
+        s_yieldsPerNormalizedYield = DefaultYieldsPerNormalizedYield;
+        s_optimalMaxNormalizedYieldsPerSpinIteration = DefaultOptimalMaxNormalizedYieldsPerSpinIteration;
+        return;
+    }
+    ULONGLONG ticksPerSecond = li.QuadPart;
+
+    // Measure the nanosecond delay per yield
+    ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
+    unsigned int yieldCount = 0;
+    QueryPerformanceCounter(&li);
+    ULONGLONG startTicks = li.QuadPart;
+    ULONGLONG elapsedTicks;
+    do
+    {
+        for (int i = 0; i < 10; ++i)
+        {
+            YieldProcessor();
+        }
+        yieldCount += 10;
+
+        QueryPerformanceCounter(&li);
+        ULONGLONG nowTicks = li.QuadPart;
+        elapsedTicks = nowTicks - startTicks;
+    } while (elapsedTicks < measureDurationTicks);
+    double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
+    if (nsPerYield < 1)
+    {
+        nsPerYield = 1;
+    }
+
+    // Calculate the number of yields required to span the duration of a normalized yield
+    int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
+    if (yieldsPerNormalizedYield < 1)
+    {
+        yieldsPerNormalizedYield = 1;
+    }
+    else if (yieldsPerNormalizedYield > MaxYieldsPerNormalizedYield)
+    {
+        yieldsPerNormalizedYield = MaxYieldsPerNormalizedYield;
+    }
+
+    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
+    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
+    // better job of allowing other work to run.
+    int optimalMaxNormalizedYieldsPerSpinIteration =
+        (int)(NsPerOptimialMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
+    if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
+    {
+        optimalMaxNormalizedYieldsPerSpinIteration = 1;
+    }
+
+    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
+    s_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
+}
diff --git a/src/vm/threads.h b/src/vm/threads.h
index ad433e765b39..be36fe624e80 100644
--- a/src/vm/threads.h
+++ b/src/vm/threads.h
@@ -5362,6 +5362,70 @@ class Thread: public IUnknown
         m_HijackReturnKind = returnKind;
     }
 #endif // FEATURE_HIJACK
+
+private:
+    static int s_yieldsPerNormalizedYield;
+    static int s_optimalMaxNormalizedYieldsPerSpinIteration;
+
+private:
+    static void InitializeYieldProcessorNormalized();
+
+public:
+    static bool IsYieldProcessorNormalizedInitialized()
+    {
+        LIMITED_METHOD_CONTRACT;
+        return s_yieldsPerNormalizedYield != 0 && s_optimalMaxNormalizedYieldsPerSpinIteration != 0;
+    }
+
+public:
+    static void EnsureYieldProcessorNormalizedInitialized()
+    {
+        LIMITED_METHOD_CONTRACT;
+
+        if (!IsYieldProcessorNormalizedInitialized())
+        {
+            InitializeYieldProcessorNormalized();
+        }
+    }
+
+public:
+    static int GetOptimalMaxNormalizedYieldsPerSpinIteration()
+    {
+        WRAPPER_NO_CONTRACT;
+        _ASSERTE(IsYieldProcessorNormalizedInitialized());
+
+        return s_optimalMaxNormalizedYieldsPerSpinIteration;
+    }
+
+public:
+    static void YieldProcessorNormalized()
+    {
+        WRAPPER_NO_CONTRACT;
+        _ASSERTE(IsYieldProcessorNormalizedInitialized());
+
+        int n = s_yieldsPerNormalizedYield;
+        while (--n >= 0)
+        {
+            YieldProcessor();
+        }
+    }
+
+    static void YieldProcessorNormalizedWithBackOff(unsigned int spinIteration)
+    {
+        WRAPPER_NO_CONTRACT;
+        _ASSERTE(IsYieldProcessorNormalizedInitialized());
+
+        int n = s_optimalMaxNormalizedYieldsPerSpinIteration;
+        if (spinIteration <= 30 && (1 << spinIteration) < n)
+        {
+            n = 1 << spinIteration;
+        }
+        n *= s_yieldsPerNormalizedYield;
+        while (--n >= 0)
+        {
+            YieldProcessor();
+        }
+    }
 };
 
 // End of class Thread