From d7ee11fc7353437a218d369187c09fdfd243d51a Mon Sep 17 00:00:00 2001
From: vsadov <vsadov@microsoft.com>
Date: Mon, 2 Dec 2019 17:00:38 -0800
Subject: [PATCH 01/11] Adjusting `GetCurrentProcessorId` caching to different
 environments.

---
 .../src/System/Threading/Thread.CoreCLR.cs    | 176 ++++++++++++++++--
 1 file changed, 162 insertions(+), 14 deletions(-)

diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
index 32b6ac43e0ffab..ed36f2d2d4813c 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
@@ -486,19 +486,31 @@ private static int CalculateOptimalMaxSpinWaitsPerSpinIteration()
         [MethodImpl(MethodImplOptions.InternalCall)]
         private static extern int GetCurrentProcessorNumber();
 
-        // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
-        // the t_currentProcessorIdCache are counting down to get it periodically refreshed.
-        // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
-        // actions that are likely to result in changing the executing core
-        [ThreadStatic]
-        private static int t_currentProcessorIdCache;
+        // t_currentProcessorId lives in a separate class to make sure the class is fully initialized by the time we use the field
+        private class CoreIdCache
+        {
+            // The upper bits of t_currentProcessorId are the currentProcessorId. The lower bits of
+            // the t_currentProcessorIdCache are counting down to get it periodically refreshed.
+            // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
+            // actions that are likely to result in changing the executing core
+            [ThreadStatic]
+            internal static int t_currentProcessorId;
+        }
 
         private const int ProcessorIdCacheShift = 16;
         private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
-        private const int ProcessorIdRefreshRate = 5000;
+        // 50 is our best guess.
+        // Based on further calibration it is likley to be adjusted lower.
+        // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number.
+        private static int ProcessorIdRefreshRate = 50;
+        // We will not adjust higher than this though.
+        private const int MaxIdRefreshRate = 5000;
 
         private static int RefreshCurrentProcessorId()
         {
+            if (sCalibrationSamples != null)
+                CalibrateOnce();
+
             int currentProcessorId = GetCurrentProcessorNumber();
 
             // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
@@ -507,23 +519,23 @@ private static int RefreshCurrentProcessorId()
             // by default, we use the current managed thread ID as a proxy.
             if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;
 
-            // Add offset to make it clear that it is not guaranteed to be 0-based processor number
-            currentProcessorId += 100;
-
             Debug.Assert(ProcessorIdRefreshRate <= ProcessorIdCacheCountDownMask);
 
             // Mask with int.MaxValue to ensure the execution Id is not negative
-            t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate;
+            CoreIdCache.t_currentProcessorId = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate;
 
             return currentProcessorId;
         }
 
-        // Cached processor id used as a hint for which per-core stack to access. It is periodically
-        // refreshed to trail the actual thread core affinity.
+        // Cached processor id could be used as a hint for which per-core stripe of data to access to avoid sharing.
+        // It is periodically refreshed to trail the actual thread core affinity.
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static int GetCurrentProcessorId()
         {
-            int currentProcessorIdCache = t_currentProcessorIdCache--;
+            if (IsCoreIdReallyFast || ProcessorIdRefreshRate <= 2)
+                return GetCurrentProcessorNumber();
+
+            int currentProcessorIdCache = CoreIdCache.t_currentProcessorId--;
             if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
             {
                 return RefreshCurrentProcessorId();
@@ -532,6 +544,142 @@ public static int GetCurrentProcessorId()
             return currentProcessorIdCache >> ProcessorIdCacheShift;
         }
 
+        // do a fast check and record in a readonly static so that it could become a JIT constant
+        internal static readonly bool IsCoreIdReallyFast = SimpleCoreIdSpeedCheck();
+
+        // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false.
+        // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch.
+        private static bool SimpleCoreIdSpeedCheck()
+        {
+            // warm up the code paths.
+            int id = UninlinedThreadStatic() | GetCurrentProcessorNumber();
+            long _05usec = Stopwatch.Frequency / 1000000;
+
+            // limit quick test to 100 microseconds.
+            // If we are on slow hardware, we should calibrate anyways.
+            long limit = Stopwatch.Frequency / 10000 + Stopwatch.GetTimestamp();
+            for (int i = 0; i < 10; i++)
+            {
+                int iters = 1;
+                long t1 = 0;
+                // double the sample size until it is 0.5 usec.
+                while (t1 < _05usec)
+                {
+                    iters *= 2;
+                    t1 = Stopwatch.GetTimestamp();
+                    for (int j = 0; j < iters; j++)
+                    {
+                        id = GetCurrentProcessorNumber();
+                    }
+                    t1 = Stopwatch.GetTimestamp() - t1;
+                }
+
+                // assuming TLS cannot be a lot slower than getting ID, this should take 1-5 usec
+                long t2 = Stopwatch.GetTimestamp();
+                for (int j = 0; j < iters; j++)
+                {
+                    UninlinedThreadStatic();
+                }
+                long t3 = Stopwatch.GetTimestamp();
+
+                // if getting ID took longer than 2x TLS access, we should consider caching.
+                if (t3 > limit || (t3 - t2) * 2 < t1)
+                {
+                    return false;
+                }
+            }
+
+            // Make sure the result was not negative, which would indicate "Not Supported"
+            return id >= 0;
+        }
+
+        // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.
+        // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching.
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        private static int UninlinedThreadStatic()
+        {
+            return CoreIdCache.t_currentProcessorId;
+        }
+
+        // We must collect multiple samples to account for irregularities caused by GC and context switches.
+        // Why we keep an array of samples and do not adjust as we go:
+        //   We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all
+        //   the calibration work to happen much sooner. There is no urgency in being calibrated while the app
+        //   is in start-up mode. That would just add to the "rush hour" traffic.
+        private static int sCalibrationToDo;
+        private static int sCalibrationDone;
+        // 25 is chosen to budget the sampling under 100 msec total, assuming 4 msec per sample.
+        private const int CalibrationSamples = 25;
+        private static double[] sCalibrationSamples = new double[CalibrationSamples * 2];
+
+        private static void CalibrateOnce()
+        {
+            if (sCalibrationToDo >= CalibrationSamples)
+                return;
+
+            int sample = Interlocked.Increment(ref sCalibrationToDo) - 1;
+            if (sample >= CalibrationSamples)
+                return;
+
+            // Actual calibration step. Let's try to fit into ~4 msec.
+            double[] calibrationState = sCalibrationSamples;
+
+            int id = 0;
+            long t1 = 0;
+            long _1msec = Stopwatch.Frequency / 1000;
+            int iters = 1;
+
+            // double the sample size until it is 1 msec.
+            // we may spend up to 3 msec in this loop in a worst case.
+            while (t1 < _1msec)
+            {
+                iters *= 2;
+                t1 = Stopwatch.GetTimestamp();
+                for (int i = 0; i < iters; i++)
+                {
+                    id = GetCurrentProcessorNumber();
+                }
+                t1 = Stopwatch.GetTimestamp() - t1;
+            }
+
+            // assuming TLS cannot be a lot slower than ID, this should take 1 msec
+            long t2 = Stopwatch.GetTimestamp();
+            for (int i = 0; i < iters; i++)
+            {
+                UninlinedThreadStatic();
+            }
+            long t3 = Stopwatch.GetTimestamp();
+
+            // if we have useful measurements, record a sample
+            if (id >= 0 && t1 > 0 && t3 - t2 > 0)
+            {
+                calibrationState[sample * 2] = (double)t1 / iters;            // ID
+                calibrationState[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS
+            }
+            else
+            {
+                // API is not functional or clock did not go forward.
+                // just pretend it was a very expensive sample with default ratio.
+                calibrationState[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec;
+                calibrationState[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec
+            }
+
+            // If this was the last sample computed, get best times and update the ratio of ID to TLS.
+            if (Interlocked.Increment(ref sCalibrationDone) == CalibrationSamples)
+            {
+                double idMin = double.MaxValue;
+                double tlsMin = double.MaxValue;
+                for (int i = 0; i < CalibrationSamples; i++)
+                {
+                    idMin = Math.Min(idMin, calibrationState[i * 2]);       //ID
+                    tlsMin = Math.Min(tlsMin, calibrationState[i * 2 + 1]); //TLS
+                }
+
+                sCalibrationSamples = null!;
+                ProcessorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin));
+            }
+        }
+
         internal void ResetThreadPoolThread()
         {
             // Currently implemented in unmanaged method Thread::InternalReset and

From 9ffc3351a3ad2abe1cd5552ee447c8b4f7efee75 Mon Sep 17 00:00:00 2001
From: vsadov <vsadov@microsoft.com>
Date: Tue, 3 Dec 2019 19:22:40 -0800
Subject: [PATCH 02/11] Addressed some PR comments (style, code structure).

---
 .../src/System/Threading/Thread.CoreCLR.cs    | 265 +++++++++---------
 1 file changed, 138 insertions(+), 127 deletions(-)

diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
index ed36f2d2d4813c..60d337183ee402 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
@@ -486,45 +486,145 @@ private static int CalculateOptimalMaxSpinWaitsPerSpinIteration()
         [MethodImpl(MethodImplOptions.InternalCall)]
         private static extern int GetCurrentProcessorNumber();
 
-        // t_currentProcessorId lives in a separate class to make sure the class is fully initialized by the time we use the field
-        private class CoreIdCache
+        private static class CoreIdCache
         {
-            // The upper bits of t_currentProcessorId are the currentProcessorId. The lower bits of
+            // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
             // the t_currentProcessorIdCache are counting down to get it periodically refreshed.
             // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
             // actions that are likely to result in changing the executing core
             [ThreadStatic]
-            internal static int t_currentProcessorId;
-        }
+            private static int t_currentProcessorIdCache;
+
+            private const int ProcessorIdCacheShift = 16;
+            private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
+            // 50 is our best guess.
+            // Based on further calibration it is likley to be adjusted lower.
+            // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number.
+            private static int s_processorIdRefreshRate = 50;
+            // We will not adjust higher than this though.
+            private const int MaxIdRefreshRate = 5000;
+
+            private static int RefreshCurrentProcessorId()
+            {
+                double[]? calibrationSamples = s_CalibrationSamples;
+                if (calibrationSamples != null)
+                    CalibrateOnce(calibrationSamples);
 
-        private const int ProcessorIdCacheShift = 16;
-        private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
-        // 50 is our best guess.
-        // Based on further calibration it is likley to be adjusted lower.
-        // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number.
-        private static int ProcessorIdRefreshRate = 50;
-        // We will not adjust higher than this though.
-        private const int MaxIdRefreshRate = 5000;
+                int currentProcessorId = GetCurrentProcessorNumber();
 
-        private static int RefreshCurrentProcessorId()
-        {
-            if (sCalibrationSamples != null)
-                CalibrateOnce();
+                // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
+                // doesn't exist on all platforms.  On those it doesn't exist on, GetCurrentProcessorNumber()
+                // returns -1.  As a fallback in that case and to spread the threads across the buckets
+                // by default, we use the current managed thread ID as a proxy.
+                if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;
 
-            int currentProcessorId = GetCurrentProcessorNumber();
+                Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask);
 
-            // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
-            // doesn't exist on all platforms.  On those it doesn't exist on, GetCurrentProcessorNumber()
-            // returns -1.  As a fallback in that case and to spread the threads across the buckets
-            // by default, we use the current managed thread ID as a proxy.
-            if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;
+                // Mask with int.MaxValue to ensure the execution Id is not negative
+                t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | s_processorIdRefreshRate;
 
-            Debug.Assert(ProcessorIdRefreshRate <= ProcessorIdCacheCountDownMask);
+                return currentProcessorId;
+            }
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            internal static int GetCurrentProcessorId()
+            {
+                if (s_processorIdRefreshRate <= 2)
+                    return GetCurrentProcessorNumber();
+
+                int currentProcessorIdCache = t_currentProcessorIdCache--;
+                if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
+                {
+                    return RefreshCurrentProcessorId();
+                }
+
+                return currentProcessorIdCache >> ProcessorIdCacheShift;
+            }
+
+            // We must collect multiple samples to account for irregularities caused by GC and context switches.
+            // Why we keep an array of samples and do not adjust as we go:
+            //   We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all
+            //   the calibration work to happen much sooner. There is no urgency in being calibrated while the app
+            //   is in start-up mode. That would just add to the "rush hour" traffic.
+            private static int s_CalibrationToDo;
+            private static int s_CalibrationDone;
+            // 25 is chosen to budget the sampling under 100 msec total, assuming 4 msec per sample.
+            private const int CalibrationSampleCount = 25;
+            private static double[]? s_CalibrationSamples = new double[CalibrationSampleCount * 2];
+
+            private static void CalibrateOnce(double[] calibrationSamples)
+            {
+                if (s_CalibrationToDo >= CalibrationSampleCount)
+                    return;
+
+                int sample = Interlocked.Increment(ref s_CalibrationToDo) - 1;
+                if (sample >= CalibrationSampleCount)
+                    return;
+
+                // Actual calibration step. Let's try to fit into ~4 msec.
+                int id = 0;
+                long t1 = 0;
+                long oneMillisecond = Stopwatch.Frequency / 1000;
+                int iters = 1;
+
+                // double the sample size until it is 1 msec.
+                // we may spend up to 3 msec in this loop in a worst case.
+                while (t1 < oneMillisecond)
+                {
+                    iters *= 2;
+                    t1 = Stopwatch.GetTimestamp();
+                    for (int i = 0; i < iters; i++)
+                    {
+                        id = GetCurrentProcessorNumber();
+                    }
+                    t1 = Stopwatch.GetTimestamp() - t1;
+                }
+
+                // assuming TLS cannot be a lot slower than ID, this should take 1 msec
+                long t2 = Stopwatch.GetTimestamp();
+                for (int i = 0; i < iters; i++)
+                {
+                    UninlinedThreadStatic();
+                }
+                long t3 = Stopwatch.GetTimestamp();
+
+                // if we have useful measurements, record a sample
+                if (id >= 0 && t1 > 0 && t3 - t2 > 0)
+                {
+                    calibrationSamples[sample * 2] = (double)t1 / iters;            // ID
+                    calibrationSamples[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS
+                }
+                else
+                {
+                    // API is not functional or clock did not go forward.
+                    // just pretend it was a very expensive sample with default ratio.
+                    calibrationSamples[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec;
+                    calibrationSamples[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec
+                }
+
+                // If this was the last sample computed, get best times and update the ratio of ID to TLS.
+                if (Interlocked.Increment(ref s_CalibrationDone) == CalibrationSampleCount)
+                {
+                    double idMin = double.MaxValue;
+                    double tlsMin = double.MaxValue;
+                    for (int i = 0; i < CalibrationSampleCount; i++)
+                    {
+                        idMin = Math.Min(idMin, calibrationSamples[i * 2]);       //ID
+                        tlsMin = Math.Min(tlsMin, calibrationSamples[i * 2 + 1]); //TLS
+                    }
 
-            // Mask with int.MaxValue to ensure the execution Id is not negative
-            CoreIdCache.t_currentProcessorId = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate;
+                    s_CalibrationSamples = null;
+                    s_processorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin));
+                }
+            }
 
-            return currentProcessorId;
+            // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.
+            // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching.
+            [MethodImpl(MethodImplOptions.NoInlining)]
+            internal static int UninlinedThreadStatic()
+            {
+                return t_currentProcessorIdCache;
+            }
         }
 
         // Cached processor id could be used as a hint for which per-core stripe of data to access to avoid sharing.
@@ -532,28 +632,26 @@ private static int RefreshCurrentProcessorId()
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static int GetCurrentProcessorId()
         {
-            if (IsCoreIdReallyFast || ProcessorIdRefreshRate <= 2)
+            if (s_isCoreIdReallyFast)
                 return GetCurrentProcessorNumber();
 
-            int currentProcessorIdCache = CoreIdCache.t_currentProcessorId--;
-            if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
-            {
-                return RefreshCurrentProcessorId();
-            }
-
-            return currentProcessorIdCache >> ProcessorIdCacheShift;
+            return CoreIdCache.GetCurrentProcessorId();
         }
 
         // do a fast check and record in a readonly static so that it could become a JIT constant
-        internal static readonly bool IsCoreIdReallyFast = SimpleCoreIdSpeedCheck();
+        private static readonly bool s_isCoreIdReallyFast = SimpleCoreIdSpeedCheck();
 
         // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false.
         // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch.
         private static bool SimpleCoreIdSpeedCheck()
         {
+            // NOTE: We do not check the frequency of the Stopwatch.
+            //       If the resolution, precision or access time to the timer are inadequate for our measures here,
+            //       the test will fail anyways.
+
             // warm up the code paths.
-            int id = UninlinedThreadStatic() | GetCurrentProcessorNumber();
-            long _05usec = Stopwatch.Frequency / 1000000;
+            int id = CoreIdCache.UninlinedThreadStatic() | GetCurrentProcessorNumber();
+            long halfMicrosecond = Stopwatch.Frequency / 1000000;
 
             // limit quick test to 100 microseconds.
             // If we are on slow hardware, we should calibrate anyways.
@@ -563,7 +661,7 @@ private static bool SimpleCoreIdSpeedCheck()
                 int iters = 1;
                 long t1 = 0;
                 // double the sample size until it is 0.5 usec.
-                while (t1 < _05usec)
+                while (t1 < halfMicrosecond)
                 {
                     iters *= 2;
                     t1 = Stopwatch.GetTimestamp();
@@ -578,7 +676,7 @@ private static bool SimpleCoreIdSpeedCheck()
                 long t2 = Stopwatch.GetTimestamp();
                 for (int j = 0; j < iters; j++)
                 {
-                    UninlinedThreadStatic();
+                    CoreIdCache.UninlinedThreadStatic();
                 }
                 long t3 = Stopwatch.GetTimestamp();
 
@@ -593,93 +691,6 @@ private static bool SimpleCoreIdSpeedCheck()
             return id >= 0;
         }
 
-        // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.
-        // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching.
-        [MethodImpl(MethodImplOptions.NoInlining)]
-        private static int UninlinedThreadStatic()
-        {
-            return CoreIdCache.t_currentProcessorId;
-        }
-
-        // We must collect multiple samples to account for irregularities caused by GC and context switches.
-        // Why we keep an array of samples and do not adjust as we go:
-        //   We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all
-        //   the calibration work to happen much sooner. There is no urgency in being calibrated while the app
-        //   is in start-up mode. That would just add to the "rush hour" traffic.
-        private static int sCalibrationToDo;
-        private static int sCalibrationDone;
-        // 25 is chosen to budget the sampling under 100 msec total, assuming 4 msec per sample.
-        private const int CalibrationSamples = 25;
-        private static double[] sCalibrationSamples = new double[CalibrationSamples * 2];
-
-        private static void CalibrateOnce()
-        {
-            if (sCalibrationToDo >= CalibrationSamples)
-                return;
-
-            int sample = Interlocked.Increment(ref sCalibrationToDo) - 1;
-            if (sample >= CalibrationSamples)
-                return;
-
-            // Actual calibration step. Let's try to fit into ~4 msec.
-            double[] calibrationState = sCalibrationSamples;
-
-            int id = 0;
-            long t1 = 0;
-            long _1msec = Stopwatch.Frequency / 1000;
-            int iters = 1;
-
-            // double the sample size until it is 1 msec.
-            // we may spend up to 3 msec in this loop in a worst case.
-            while (t1 < _1msec)
-            {
-                iters *= 2;
-                t1 = Stopwatch.GetTimestamp();
-                for (int i = 0; i < iters; i++)
-                {
-                    id = GetCurrentProcessorNumber();
-                }
-                t1 = Stopwatch.GetTimestamp() - t1;
-            }
-
-            // assuming TLS cannot be a lot slower than ID, this should take 1 msec
-            long t2 = Stopwatch.GetTimestamp();
-            for (int i = 0; i < iters; i++)
-            {
-                UninlinedThreadStatic();
-            }
-            long t3 = Stopwatch.GetTimestamp();
-
-            // if we have useful measurements, record a sample
-            if (id >= 0 && t1 > 0 && t3 - t2 > 0)
-            {
-                calibrationState[sample * 2] = (double)t1 / iters;            // ID
-                calibrationState[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS
-            }
-            else
-            {
-                // API is not functional or clock did not go forward.
-                // just pretend it was a very expensive sample with default ratio.
-                calibrationState[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec;
-                calibrationState[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec
-            }
-
-            // If this was the last sample computed, get best times and update the ratio of ID to TLS.
-            if (Interlocked.Increment(ref sCalibrationDone) == CalibrationSamples)
-            {
-                double idMin = double.MaxValue;
-                double tlsMin = double.MaxValue;
-                for (int i = 0; i < CalibrationSamples; i++)
-                {
-                    idMin = Math.Min(idMin, calibrationState[i * 2]);       //ID
-                    tlsMin = Math.Min(tlsMin, calibrationState[i * 2 + 1]); //TLS
-                }
-
-                sCalibrationSamples = null!;
-                ProcessorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin));
-            }
-        }
-
         internal void ResetThreadPoolThread()
         {
             // Currently implemented in unmanaged method Thread::InternalReset and

From df36e9db936292cf0d78db6e70bc5b1bee0b9af2 Mon Sep 17 00:00:00 2001
From: vsadov <vsadov@microsoft.com>
Date: Tue, 3 Dec 2019 23:58:18 -0800
Subject: [PATCH 03/11] Fitting calibration under 5 msec total; fixed some
 comments.

---
 .../src/System/Threading/Thread.CoreCLR.cs    | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
index 60d337183ee402..b182b0930f05b4 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
@@ -548,8 +548,8 @@ internal static int GetCurrentProcessorId()
             //   is in start-up mode. That would just add to the "rush hour" traffic.
             private static int s_CalibrationToDo;
             private static int s_CalibrationDone;
-            // 25 is chosen to budget the sampling under 100 msec total, assuming 4 msec per sample.
-            private const int CalibrationSampleCount = 25;
+            // 10 is chosen to budget the sampling under 5 msec total, assuming 0.5 msec per sample.
+            private const int CalibrationSampleCount = 10;
             private static double[]? s_CalibrationSamples = new double[CalibrationSampleCount * 2];
 
             private static void CalibrateOnce(double[] calibrationSamples)
@@ -561,15 +561,15 @@ private static void CalibrateOnce(double[] calibrationSamples)
                 if (sample >= CalibrationSampleCount)
                     return;
 
-                // Actual calibration step. Let's try to fit into ~4 msec.
+                // Actual calibration step. Let's try to fit into ~50 usec.
                 int id = 0;
                 long t1 = 0;
-                long oneMillisecond = Stopwatch.Frequency / 1000;
+                long twentyMicrosecond = Stopwatch.Frequency / 50000;
                 int iters = 1;
 
                 // double the sample size until it is 1 msec.
-                // we may spend up to 3 msec in this loop in a worst case.
-                while (t1 < oneMillisecond)
+                // we may spend up to 40 usec in this loop in a worst case.
+                while (t1 < twentyMicrosecond)
                 {
                     iters *= 2;
                     t1 = Stopwatch.GetTimestamp();
@@ -580,7 +580,7 @@ private static void CalibrateOnce(double[] calibrationSamples)
                     t1 = Stopwatch.GetTimestamp() - t1;
                 }
 
-                // assuming TLS cannot be a lot slower than ID, this should take 1 msec
+                // assuming TLS takes 1/2 of CoreID time or less, this should take 10 usec or less
                 long t2 = Stopwatch.GetTimestamp();
                 for (int i = 0; i < iters; i++)
                 {
@@ -651,17 +651,17 @@ private static bool SimpleCoreIdSpeedCheck()
 
             // warm up the code paths.
             int id = CoreIdCache.UninlinedThreadStatic() | GetCurrentProcessorNumber();
-            long halfMicrosecond = Stopwatch.Frequency / 1000000;
+            long oneMicrosecond = Stopwatch.Frequency / 1000000;
 
-            // limit quick test to 100 microseconds.
+            // this loop should take < 50 usec. limit it to 100 usec just in case.
             // If we are on slow hardware, we should calibrate anyways.
             long limit = Stopwatch.Frequency / 10000 + Stopwatch.GetTimestamp();
             for (int i = 0; i < 10; i++)
             {
                 int iters = 1;
                 long t1 = 0;
-                // double the sample size until it is 0.5 usec.
-                while (t1 < halfMicrosecond)
+                // double the sample size until it is 1 usec.
+                while (t1 < oneMicrosecond)
                 {
                     iters *= 2;
                     t1 = Stopwatch.GetTimestamp();
@@ -672,7 +672,7 @@ private static bool SimpleCoreIdSpeedCheck()
                     t1 = Stopwatch.GetTimestamp() - t1;
                 }
 
-                // assuming TLS cannot be a lot slower than getting ID, this should take 1-5 usec
+                // assuming TLS cannot be a lot slower than getting ID, this should take 1-2 usec
                 long t2 = Stopwatch.GetTimestamp();
                 for (int j = 0; j < iters; j++)
                 {

From f911773ecbe56d5c88156e9090e15fe0dbe29359 Mon Sep 17 00:00:00 2001
From: vsadov <vsadov@microsoft.com>
Date: Wed, 4 Dec 2019 10:30:15 -0800
Subject: [PATCH 04/11] moved ProcessorIdCache to a separate file.

---
 .../System.Private.CoreLib.csproj             |   1 +
 .../src/System/Threading/ProcessorIdCache.cs  | 150 +++++++++++++++++
 .../src/System/Threading/Thread.CoreCLR.cs    | 155 +-----------------
 3 files changed, 158 insertions(+), 148 deletions(-)
 create mode 100644 src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs

diff --git a/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj b/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj
index 58b4d5fac6c028..38a1209abc0ba3 100644
--- a/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj
+++ b/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj
@@ -274,6 +274,7 @@
     <Compile Include="$(BclSourcesRoot)\System\Threading\StackCrawlMark.cs" />
     <Compile Include="$(BclSourcesRoot)\System\Threading\SynchronizationContext.CoreCLR.cs" />
     <Compile Include="$(BclSourcesRoot)\System\Threading\Thread.CoreCLR.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Threading\ProcessorIdCache.cs" />
     <Compile Include="$(BclSourcesRoot)\System\Threading\ThreadPool.CoreCLR.cs" />
     <Compile Include="$(BclSourcesRoot)\System\Threading\Timer.CoreCLR.cs" />
     <Compile Include="$(BclSourcesRoot)\System\Threading\WaitHandle.CoreCLR.cs" />
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
new file mode 100644
index 00000000000000..f09079b8cd1786
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
@@ -0,0 +1,150 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+
+namespace System.Threading
+{
+    internal static class ProcessorIdCache
+    {
+        // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
+        // the t_currentProcessorIdCache are counting down to get it periodically refreshed.
+        // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
+        // actions that are likely to result in changing the executing core
+        [ThreadStatic]
+        private static int t_currentProcessorIdCache;
+
+        private const int ProcessorIdCacheShift = 16;
+        private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
+        // 50 is our best guess.
+        // Based on further calibration it is likley to be adjusted lower.
+        // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number.
+        private static int s_processorIdRefreshRate = 50;
+        // We will not adjust higher than this though.
+        private const int MaxIdRefreshRate = 5000;
+
+        private static int RefreshCurrentProcessorId()
+        {
+            double[]? calibrationSamples = s_CalibrationSamples;
+            if (calibrationSamples != null)
+                CalibrateOnce(calibrationSamples);
+
+            int currentProcessorId = Thread.GetCurrentProcessorNumber();
+
+            // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
+            // doesn't exist on all platforms.  On those it doesn't exist on, GetCurrentProcessorNumber()
+            // returns -1.  As a fallback in that case and to spread the threads across the buckets
+            // by default, we use the current managed thread ID as a proxy.
+            if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;
+
+            Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask);
+
+            // Mask with int.MaxValue to ensure the execution Id is not negative
+            t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | s_processorIdRefreshRate;
+
+            return currentProcessorId;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static int GetCurrentProcessorId()
+        {
+            if (s_processorIdRefreshRate <= 2)
+                return Thread.GetCurrentProcessorNumber();
+
+            int currentProcessorIdCache = t_currentProcessorIdCache--;
+            if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
+            {
+                return RefreshCurrentProcessorId();
+            }
+
+            return currentProcessorIdCache >> ProcessorIdCacheShift;
+        }
+
+        // We must collect multiple samples to account for irregularities caused by GC and context switches.
+        // Why we keep an array of samples and do not adjust as we go:
+        //   We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all
+        //   the calibration work to happen much sooner. There is no urgency in being calibrated while the app
+        //   is in start-up mode. That would just add to the "rush hour" traffic.
+        private static int s_CalibrationToDo;
+        private static int s_CalibrationDone;
+        // 10 is chosen to budget the sampling under 5 msec total, assuming 0.5 msec per sample.
+        private const int CalibrationSampleCount = 10;
+        private static double[]? s_CalibrationSamples = new double[CalibrationSampleCount * 2];
+
+        private static void CalibrateOnce(double[] calibrationSamples)
+        {
+            if (s_CalibrationToDo >= CalibrationSampleCount)
+                return;
+
+            int sample = Interlocked.Increment(ref s_CalibrationToDo) - 1;
+            if (sample >= CalibrationSampleCount)
+                return;
+
+            // Actual calibration step. Let's try to fit into ~50 usec.
+            int id = 0;
+            long t1 = 0;
+            long twentyMicrosecond = Stopwatch.Frequency / 50000;
+            int iters = 1;
+
+            // double the sample size until it is 1 msec.
+            // we may spend up to 40 usec in this loop in a worst case.
+            while (t1 < twentyMicrosecond)
+            {
+                iters *= 2;
+                t1 = Stopwatch.GetTimestamp();
+                for (int i = 0; i < iters; i++)
+                {
+                    id = Thread.GetCurrentProcessorNumber();
+                }
+                t1 = Stopwatch.GetTimestamp() - t1;
+            }
+
+            // assuming TLS takes 1/2 of ProcessorNumber time or less, this should take 10 usec or less
+            long t2 = Stopwatch.GetTimestamp();
+            for (int i = 0; i < iters; i++)
+            {
+                UninlinedThreadStatic();
+            }
+            long t3 = Stopwatch.GetTimestamp();
+
+            // if we have useful measurements, record a sample
+            if (id >= 0 && t1 > 0 && t3 - t2 > 0)
+            {
+                calibrationSamples[sample * 2] = (double)t1 / iters;            // ID
+                calibrationSamples[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS
+            }
+            else
+            {
+                // API is not functional or clock did not go forward.
+                // just pretend it was a very expensive sample with default ratio.
+                calibrationSamples[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec;
+                calibrationSamples[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec
+            }
+
+            // If this was the last sample computed, get best times and update the ratio of ID to TLS.
+            if (Interlocked.Increment(ref s_CalibrationDone) == CalibrationSampleCount)
+            {
+                double idMin = double.MaxValue;
+                double tlsMin = double.MaxValue;
+                for (int i = 0; i < CalibrationSampleCount; i++)
+                {
+                    idMin = Math.Min(idMin, calibrationSamples[i * 2]);       //ID
+                    tlsMin = Math.Min(tlsMin, calibrationSamples[i * 2 + 1]); //TLS
+                }
+
+                s_CalibrationSamples = null;
+                s_processorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin));
+            }
+        }
+
+        // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.
+        // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching.
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        internal static int UninlinedThreadStatic()
+        {
+            return t_currentProcessorIdCache;
+        }
+    }
+}
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
index b182b0930f05b4..4fc4278e35bd33 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
@@ -484,173 +484,32 @@ private static int CalculateOptimalMaxSpinWaitsPerSpinIteration()
         }
 
         [MethodImpl(MethodImplOptions.InternalCall)]
-        private static extern int GetCurrentProcessorNumber();
-
-        private static class CoreIdCache
-        {
-            // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
-            // the t_currentProcessorIdCache are counting down to get it periodically refreshed.
-            // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
-            // actions that are likely to result in changing the executing core
-            [ThreadStatic]
-            private static int t_currentProcessorIdCache;
-
-            private const int ProcessorIdCacheShift = 16;
-            private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
-            // 50 is our best guess.
-            // Based on further calibration it is likley to be adjusted lower.
-            // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number.
-            private static int s_processorIdRefreshRate = 50;
-            // We will not adjust higher than this though.
-            private const int MaxIdRefreshRate = 5000;
-
-            private static int RefreshCurrentProcessorId()
-            {
-                double[]? calibrationSamples = s_CalibrationSamples;
-                if (calibrationSamples != null)
-                    CalibrateOnce(calibrationSamples);
-
-                int currentProcessorId = GetCurrentProcessorNumber();
-
-                // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
-                // doesn't exist on all platforms.  On those it doesn't exist on, GetCurrentProcessorNumber()
-                // returns -1.  As a fallback in that case and to spread the threads across the buckets
-                // by default, we use the current managed thread ID as a proxy.
-                if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;
-
-                Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask);
-
-                // Mask with int.MaxValue to ensure the execution Id is not negative
-                t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | s_processorIdRefreshRate;
-
-                return currentProcessorId;
-            }
-
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            internal static int GetCurrentProcessorId()
-            {
-                if (s_processorIdRefreshRate <= 2)
-                    return GetCurrentProcessorNumber();
-
-                int currentProcessorIdCache = t_currentProcessorIdCache--;
-                if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
-                {
-                    return RefreshCurrentProcessorId();
-                }
-
-                return currentProcessorIdCache >> ProcessorIdCacheShift;
-            }
-
-            // We must collect multiple samples to account for irregularities caused by GC and context switches.
-            // Why we keep an array of samples and do not adjust as we go:
-            //   We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all
-            //   the calibration work to happen much sooner. There is no urgency in being calibrated while the app
-            //   is in start-up mode. That would just add to the "rush hour" traffic.
-            private static int s_CalibrationToDo;
-            private static int s_CalibrationDone;
-            // 10 is chosen to budget the sampling under 5 msec total, assuming 0.5 msec per sample.
-            private const int CalibrationSampleCount = 10;
-            private static double[]? s_CalibrationSamples = new double[CalibrationSampleCount * 2];
-
-            private static void CalibrateOnce(double[] calibrationSamples)
-            {
-                if (s_CalibrationToDo >= CalibrationSampleCount)
-                    return;
-
-                int sample = Interlocked.Increment(ref s_CalibrationToDo) - 1;
-                if (sample >= CalibrationSampleCount)
-                    return;
-
-                // Actual calibration step. Let's try to fit into ~50 usec.
-                int id = 0;
-                long t1 = 0;
-                long twentyMicrosecond = Stopwatch.Frequency / 50000;
-                int iters = 1;
-
-                // double the sample size until it is 1 msec.
-                // we may spend up to 40 usec in this loop in a worst case.
-                while (t1 < twentyMicrosecond)
-                {
-                    iters *= 2;
-                    t1 = Stopwatch.GetTimestamp();
-                    for (int i = 0; i < iters; i++)
-                    {
-                        id = GetCurrentProcessorNumber();
-                    }
-                    t1 = Stopwatch.GetTimestamp() - t1;
-                }
-
-                // assuming TLS takes 1/2 of CoreID time or less, this should take 10 usec or less
-                long t2 = Stopwatch.GetTimestamp();
-                for (int i = 0; i < iters; i++)
-                {
-                    UninlinedThreadStatic();
-                }
-                long t3 = Stopwatch.GetTimestamp();
-
-                // if we have useful measurements, record a sample
-                if (id >= 0 && t1 > 0 && t3 - t2 > 0)
-                {
-                    calibrationSamples[sample * 2] = (double)t1 / iters;            // ID
-                    calibrationSamples[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS
-                }
-                else
-                {
-                    // API is not functional or clock did not go forward.
-                    // just pretend it was a very expensive sample with default ratio.
-                    calibrationSamples[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec;
-                    calibrationSamples[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec
-                }
-
-                // If this was the last sample computed, get best times and update the ratio of ID to TLS.
-                if (Interlocked.Increment(ref s_CalibrationDone) == CalibrationSampleCount)
-                {
-                    double idMin = double.MaxValue;
-                    double tlsMin = double.MaxValue;
-                    for (int i = 0; i < CalibrationSampleCount; i++)
-                    {
-                        idMin = Math.Min(idMin, calibrationSamples[i * 2]);       //ID
-                        tlsMin = Math.Min(tlsMin, calibrationSamples[i * 2 + 1]); //TLS
-                    }
-
-                    s_CalibrationSamples = null;
-                    s_processorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin));
-                }
-            }
-
-            // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.
-            // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching.
-            [MethodImpl(MethodImplOptions.NoInlining)]
-            internal static int UninlinedThreadStatic()
-            {
-                return t_currentProcessorIdCache;
-            }
-        }
+        internal static extern int GetCurrentProcessorNumber();
 
         // Cached processor id could be used as a hint for which per-core stripe of data to access to avoid sharing.
         // It is periodically refreshed to trail the actual thread core affinity.
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static int GetCurrentProcessorId()
         {
-            if (s_isCoreIdReallyFast)
+            if (s_isProcessorNumberReallyFast)
                 return GetCurrentProcessorNumber();
 
-            return CoreIdCache.GetCurrentProcessorId();
+            return ProcessorIdCache.GetCurrentProcessorId();
         }
 
         // do a fast check and record in a readonly static so that it could become a JIT constant
-        private static readonly bool s_isCoreIdReallyFast = SimpleCoreIdSpeedCheck();
+        private static readonly bool s_isProcessorNumberReallyFast = SimpleProcessorNumberSpeedCheck();
 
         // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false.
         // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch.
-        private static bool SimpleCoreIdSpeedCheck()
+        private static bool SimpleProcessorNumberSpeedCheck()
         {
             // NOTE: We do not check the frequency of the Stopwatch.
             //       If the resolution, precision or access time to the timer are inadequate for our measures here,
             //       the test will fail anyways.
 
             // warm up the code paths.
-            int id = CoreIdCache.UninlinedThreadStatic() | GetCurrentProcessorNumber();
+            int id = ProcessorIdCache.UninlinedThreadStatic() | GetCurrentProcessorNumber();
             long oneMicrosecond = Stopwatch.Frequency / 1000000;
 
             // this loop should take < 50 usec. limit it to 100 usec just in case.
@@ -676,7 +535,7 @@ private static bool SimpleCoreIdSpeedCheck()
                 long t2 = Stopwatch.GetTimestamp();
                 for (int j = 0; j < iters; j++)
                 {
-                    CoreIdCache.UninlinedThreadStatic();
+                    ProcessorIdCache.UninlinedThreadStatic();
                 }
                 long t3 = Stopwatch.GetTimestamp();
 

From bb4847f48fc4fa440306920aa5831a933c51d773 Mon Sep 17 00:00:00 2001
From: vsadov <vsadov@microsoft.com>
Date: Thu, 5 Dec 2019 18:32:57 -0800
Subject: [PATCH 05/11] increased "fast" threshold to 3x and moved
 SimpleProcessorNumberSpeedCheck to ProcessorIdCache.cs

---
 .../src/System/Threading/ProcessorIdCache.cs  | 53 +++++++++++++++++--
 .../src/System/Threading/Thread.CoreCLR.cs    | 52 +-----------------
 2 files changed, 51 insertions(+), 54 deletions(-)

diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
index f09079b8cd1786..ef0eb3d893ba55 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
@@ -50,9 +50,6 @@ private static int RefreshCurrentProcessorId()
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static int GetCurrentProcessorId()
         {
-            if (s_processorIdRefreshRate <= 2)
-                return Thread.GetCurrentProcessorNumber();
-
             int currentProcessorIdCache = t_currentProcessorIdCache--;
             if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
             {
@@ -139,6 +136,56 @@ private static void CalibrateOnce(double[] calibrationSamples)
             }
         }
 
+        // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false.
+        // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch.
+        internal static bool SimpleProcessorNumberSpeedCheck()
+        {
+            // NOTE: We do not check the frequency of the Stopwatch.
+            //       If the resolution, precision or access time to the timer are inadequate for our measures here,
+            //       the test will fail anyways.
+
+            // warm up the code paths.
+            int id = UninlinedThreadStatic() | Thread.GetCurrentProcessorNumber();
+            long oneMicrosecond = Stopwatch.Frequency / 1000000;
+
+            // this loop should take < 50 usec. limit it to 100 usec just in case.
+            // If we are on slow hardware, we should calibrate anyways.
+            long limit = Stopwatch.Frequency / 10000 + Stopwatch.GetTimestamp();
+            for (int i = 0; i < 10; i++)
+            {
+                int iters = 1;
+                long t1 = 0;
+                // double the sample size until it is 1 usec.
+                while (t1 < oneMicrosecond)
+                {
+                    iters *= 2;
+                    t1 = Stopwatch.GetTimestamp();
+                    for (int j = 0; j < iters; j++)
+                    {
+                        id = Thread.GetCurrentProcessorNumber();
+                    }
+                    t1 = Stopwatch.GetTimestamp() - t1;
+                }
+
+                // assuming TLS cannot be a lot slower than getting ID, this should take 1-2 usec
+                long t2 = Stopwatch.GetTimestamp();
+                for (int j = 0; j < iters; j++)
+                {
+                    UninlinedThreadStatic();
+                }
+                long t3 = Stopwatch.GetTimestamp();
+
+                // if getting ID took longer than 3x TLS access, we should consider caching.
+                if (t3 > limit || (t3 - t2) * 3 < t1)
+                {
+                    return false;
+                }
+            }
+
+            // Make sure the result was not negative, which would indicate "Not Supported"
+            return id >= 0;
+        }
+
         // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.
         // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching.
         [MethodImpl(MethodImplOptions.NoInlining)]
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
index 4fc4278e35bd33..3c17b39c2b558b 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
@@ -498,57 +498,7 @@ public static int GetCurrentProcessorId()
         }
 
         // do a fast check and record in a readonly static so that it could become a JIT constant
-        private static readonly bool s_isProcessorNumberReallyFast = SimpleProcessorNumberSpeedCheck();
-
-        // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false.
-        // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch.
-        private static bool SimpleProcessorNumberSpeedCheck()
-        {
-            // NOTE: We do not check the frequency of the Stopwatch.
-            //       If the resolution, precision or access time to the timer are inadequate for our measures here,
-            //       the test will fail anyways.
-
-            // warm up the code paths.
-            int id = ProcessorIdCache.UninlinedThreadStatic() | GetCurrentProcessorNumber();
-            long oneMicrosecond = Stopwatch.Frequency / 1000000;
-
-            // this loop should take < 50 usec. limit it to 100 usec just in case.
-            // If we are on slow hardware, we should calibrate anyways.
-            long limit = Stopwatch.Frequency / 10000 + Stopwatch.GetTimestamp();
-            for (int i = 0; i < 10; i++)
-            {
-                int iters = 1;
-                long t1 = 0;
-                // double the sample size until it is 1 usec.
-                while (t1 < oneMicrosecond)
-                {
-                    iters *= 2;
-                    t1 = Stopwatch.GetTimestamp();
-                    for (int j = 0; j < iters; j++)
-                    {
-                        id = GetCurrentProcessorNumber();
-                    }
-                    t1 = Stopwatch.GetTimestamp() - t1;
-                }
-
-                // assuming TLS cannot be a lot slower than getting ID, this should take 1-2 usec
-                long t2 = Stopwatch.GetTimestamp();
-                for (int j = 0; j < iters; j++)
-                {
-                    ProcessorIdCache.UninlinedThreadStatic();
-                }
-                long t3 = Stopwatch.GetTimestamp();
-
-                // if getting ID took longer than 2x TLS access, we should consider caching.
-                if (t3 > limit || (t3 - t2) * 2 < t1)
-                {
-                    return false;
-                }
-            }
-
-            // Make sure the result was not negative, which would indicate "Not Supported"
-            return id >= 0;
-        }
+        private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.SimpleProcessorNumberSpeedCheck();
 
         internal void ResetThreadPoolThread()
         {

From f36c13b5c9990e1f214ddd128464bd0446541672 Mon Sep 17 00:00:00 2001
From: vsadov <vsadov@microsoft.com>
Date: Fri, 6 Dec 2019 17:56:23 -0800
Subject: [PATCH 06/11] Drop the sample array when the quick check passes.

---
 .../src/System/Threading/ProcessorIdCache.cs             | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
index ef0eb3d893ba55..223b23eb3a933f 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
@@ -183,7 +183,14 @@ internal static bool SimpleProcessorNumberSpeedCheck()
             }
 
             // Make sure the result was not negative, which would indicate "Not Supported"
-            return id >= 0;
+            if (id < 0)
+            {
+                return false;
+            }
+
+            // GetCurrentProcessorNumber  is fast, no more checks needed.
+            s_CalibrationSamples = null;
+            return true;
         }
 
         // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.

From 6efaca4e98d157d59c1a8c5f1f69aec3bbe2d113 Mon Sep 17 00:00:00 2001
From: vsadov <vsadov@microsoft.com>
Date: Wed, 11 Dec 2019 21:56:04 -0800
Subject: [PATCH 07/11] One-pass ProcessorNumberSpeedCheck.

---
 .../src/System/Threading/ProcessorIdCache.cs  | 140 ++++--------------
 .../src/System/Threading/Thread.CoreCLR.cs    |   2 +-
 2 files changed, 31 insertions(+), 111 deletions(-)

diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
index 223b23eb3a933f..e1ec09a94ec1b5 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
@@ -19,18 +19,14 @@ internal static class ProcessorIdCache
         private const int ProcessorIdCacheShift = 16;
         private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
         // 50 is our best guess.
-        // Based on further calibration it is likley to be adjusted lower.
-        // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number.
+        // Based on speed check it will be adjusted, typically lower.
+        // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be adjusted to a higher number.
         private static int s_processorIdRefreshRate = 50;
         // We will not adjust higher than this though.
         private const int MaxIdRefreshRate = 5000;
 
         private static int RefreshCurrentProcessorId()
         {
-            double[]? calibrationSamples = s_CalibrationSamples;
-            if (calibrationSamples != null)
-                CalibrateOnce(calibrationSamples);
-
             int currentProcessorId = Thread.GetCurrentProcessorNumber();
 
             // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
@@ -59,138 +55,62 @@ internal static int GetCurrentProcessorId()
             return currentProcessorIdCache >> ProcessorIdCacheShift;
         }
 
-        // We must collect multiple samples to account for irregularities caused by GC and context switches.
-        // Why we keep an array of samples and do not adjust as we go:
-        //   We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all
-        //   the calibration work to happen much sooner. There is no urgency in being calibrated while the app
-        //   is in start-up mode. That would just add to the "rush hour" traffic.
-        private static int s_CalibrationToDo;
-        private static int s_CalibrationDone;
-        // 10 is chosen to budget the sampling under 5 msec total, assuming 0.5 msec per sample.
-        private const int CalibrationSampleCount = 10;
-        private static double[]? s_CalibrationSamples = new double[CalibrationSampleCount * 2];
-
-        private static void CalibrateOnce(double[] calibrationSamples)
-        {
-            if (s_CalibrationToDo >= CalibrationSampleCount)
-                return;
-
-            int sample = Interlocked.Increment(ref s_CalibrationToDo) - 1;
-            if (sample >= CalibrationSampleCount)
-                return;
-
-            // Actual calibration step. Let's try to fit into ~50 usec.
-            int id = 0;
-            long t1 = 0;
-            long twentyMicrosecond = Stopwatch.Frequency / 50000;
-            int iters = 1;
-
-            // double the sample size until it is 1 msec.
-            // we may spend up to 40 usec in this loop in a worst case.
-            while (t1 < twentyMicrosecond)
-            {
-                iters *= 2;
-                t1 = Stopwatch.GetTimestamp();
-                for (int i = 0; i < iters; i++)
-                {
-                    id = Thread.GetCurrentProcessorNumber();
-                }
-                t1 = Stopwatch.GetTimestamp() - t1;
-            }
-
-            // assuming TLS takes 1/2 of ProcessorNumber time or less, this should take 10 usec or less
-            long t2 = Stopwatch.GetTimestamp();
-            for (int i = 0; i < iters; i++)
-            {
-                UninlinedThreadStatic();
-            }
-            long t3 = Stopwatch.GetTimestamp();
-
-            // if we have useful measurements, record a sample
-            if (id >= 0 && t1 > 0 && t3 - t2 > 0)
-            {
-                calibrationSamples[sample * 2] = (double)t1 / iters;            // ID
-                calibrationSamples[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS
-            }
-            else
-            {
-                // API is not functional or clock did not go forward.
-                // just pretend it was a very expensive sample with default ratio.
-                calibrationSamples[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec;
-                calibrationSamples[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec
-            }
-
-            // If this was the last sample computed, get best times and update the ratio of ID to TLS.
-            if (Interlocked.Increment(ref s_CalibrationDone) == CalibrationSampleCount)
-            {
-                double idMin = double.MaxValue;
-                double tlsMin = double.MaxValue;
-                for (int i = 0; i < CalibrationSampleCount; i++)
-                {
-                    idMin = Math.Min(idMin, calibrationSamples[i * 2]);       //ID
-                    tlsMin = Math.Min(tlsMin, calibrationSamples[i * 2 + 1]); //TLS
-                }
-
-                s_CalibrationSamples = null;
-                s_processorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin));
-            }
-        }
-
         // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false.
         // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch.
-        internal static bool SimpleProcessorNumberSpeedCheck()
+        internal static unsafe int ProcessorNumberSpeedCheck()
         {
             // NOTE: We do not check the frequency of the Stopwatch.
+            //       The frequency often does not match the actual timer refresh rate anyways.
             //       If the resolution, precision or access time to the timer are inadequate for our measures here,
             //       the test will fail anyways.
 
+            // default values pretend to be very long samples with default ratio
+            double minID = Stopwatch.Frequency * 50;  // 50 sec
+            double minTLS = Stopwatch.Frequency;      // 1 sec
+
             // warm up the code paths.
-            int id = UninlinedThreadStatic() | Thread.GetCurrentProcessorNumber();
-            long oneMicrosecond = Stopwatch.Frequency / 1000000;
+            UninlinedThreadStatic();
+            if (Thread.GetCurrentProcessorNumber() < 0)
+                return MaxIdRefreshRate;
 
-            // this loop should take < 50 usec. limit it to 100 usec just in case.
-            // If we are on slow hardware, we should calibrate anyways.
-            long limit = Stopwatch.Frequency / 10000 + Stopwatch.GetTimestamp();
+            long oneMicrosecond = Stopwatch.Frequency / 1000000;
             for (int i = 0; i < 10; i++)
             {
-                int iters = 1;
+                // we will measure at least 16 iterations and at least 1 microsecond
+                int iters = 16;
                 long t1 = 0;
-                // double the sample size until it is 1 usec.
                 while (t1 < oneMicrosecond)
                 {
                     iters *= 2;
                     t1 = Stopwatch.GetTimestamp();
                     for (int j = 0; j < iters; j++)
                     {
-                        id = Thread.GetCurrentProcessorNumber();
+                        Thread.GetCurrentProcessorNumber();
                     }
                     t1 = Stopwatch.GetTimestamp() - t1;
                 }
 
-                // assuming TLS cannot be a lot slower than getting ID, this should take 1-2 usec
-                long t2 = Stopwatch.GetTimestamp();
-                for (int j = 0; j < iters; j++)
-                {
-                    UninlinedThreadStatic();
-                }
-                long t3 = Stopwatch.GetTimestamp();
+                minID = Math.Min(minID, (double)t1 / iters);
 
-                // if getting ID took longer than 3x TLS access, we should consider caching.
-                if (t3 > limit || (t3 - t2) * 3 < t1)
+                // we will measure at least 16 iterations and at least 1 microsecond
+                iters = 16;
+                t1 = 0;
+                while (t1 < oneMicrosecond)
                 {
-                    return false;
+                    iters *= 2;
+                    t1 = Stopwatch.GetTimestamp();
+                    for (int j = 0; j < iters; j++)
+                    {
+                        UninlinedThreadStatic();
+                    }
+                    t1 = Stopwatch.GetTimestamp() - t1;
                 }
-            }
 
-            // Make sure the result was not negative, which would indicate "Not Supported"
-            if (id < 0)
-            {
-                return false;
+                minTLS = Math.Min(minTLS, (double)t1 / iters);
             }
 
-            // GetCurrentProcessorNumber  is fast, no more checks needed.
-            s_CalibrationSamples = null;
-            return true;
+            s_processorIdRefreshRate = Math.Min((int)(minID / minTLS), MaxIdRefreshRate);
+            return s_processorIdRefreshRate;
         }
 
         // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
index 3c17b39c2b558b..c0a2e37de65f25 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
@@ -498,7 +498,7 @@ public static int GetCurrentProcessorId()
         }
 
         // do a fast check and record in a readonly static so that it could become a JIT constant
-        private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.SimpleProcessorNumberSpeedCheck();
+        private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.ProcessorNumberSpeedCheck() <= 3;
 
         internal void ResetThreadPoolThread()
         {

From 23c6de21c3ec1903b4296d671ea75a55a56bc256 Mon Sep 17 00:00:00 2001
From: vsadov <vsadov@microsoft.com>
Date: Thu, 12 Dec 2019 13:26:22 -0800
Subject: [PATCH 08/11] PR feedback (modulo refresh rate)

---
 .../src/System/Threading/ProcessorIdCache.cs  | 37 ++++++++++---------
 .../src/System/Threading/Thread.CoreCLR.cs    |  2 +-
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
index e1ec09a94ec1b5..824eb71036d288 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
@@ -18,10 +18,8 @@ internal static class ProcessorIdCache
 
         private const int ProcessorIdCacheShift = 16;
         private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
-        // 50 is our best guess.
-        // Based on speed check it will be adjusted, typically lower.
-        // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be adjusted to a higher number.
-        private static int s_processorIdRefreshRate = 50;
+        // Refresh rate of the cache. Will be derived from a speed check of GetCurrentProcessorNumber API.
+        private static int s_processorIdRefreshRate;
         // We will not adjust higher than this though.
         private const int MaxIdRefreshRate = 5000;
 
@@ -57,7 +55,7 @@ internal static int GetCurrentProcessorId()
 
         // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false.
         // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch.
-        internal static unsafe int ProcessorNumberSpeedCheck()
+        internal static bool ProcessorNumberSpeedCheck()
         {
             // NOTE: We do not check the frequency of the Stopwatch.
             //       The frequency often does not match the actual timer refresh rate anyways.
@@ -70,16 +68,20 @@ internal static unsafe int ProcessorNumberSpeedCheck()
 
             // warm up the code paths.
             UninlinedThreadStatic();
+            // also check if API is actually functional (-1 means not supported)
             if (Thread.GetCurrentProcessorNumber() < 0)
-                return MaxIdRefreshRate;
+            {
+                s_processorIdRefreshRate = int.MaxValue;
+                return false;
+            }
 
             long oneMicrosecond = Stopwatch.Frequency / 1000000;
             for (int i = 0; i < 10; i++)
             {
                 // we will measure at least 16 iterations and at least 1 microsecond
-                int iters = 16;
-                long t1 = 0;
-                while (t1 < oneMicrosecond)
+                long t1;
+                int iters = 8;
+                do
                 {
                     iters *= 2;
                     t1 = Stopwatch.GetTimestamp();
@@ -88,29 +90,30 @@ internal static unsafe int ProcessorNumberSpeedCheck()
                         Thread.GetCurrentProcessorNumber();
                     }
                     t1 = Stopwatch.GetTimestamp() - t1;
-                }
+                } while (t1 < oneMicrosecond);
 
                 minID = Math.Min(minID, (double)t1 / iters);
 
-                // we will measure at least 16 iterations and at least 1 microsecond
-                iters = 16;
-                t1 = 0;
-                while (t1 < oneMicrosecond)
+                // we will measure at least 1 microsecond,
+                // and use at least 1/2 of ProcID iterations
+                // we assume that TLS can't be more than 2x slower than ProcID
+                iters /= 2;
+                do
                 {
-                    iters *= 2;
                     t1 = Stopwatch.GetTimestamp();
                     for (int j = 0; j < iters; j++)
                     {
                         UninlinedThreadStatic();
                     }
                     t1 = Stopwatch.GetTimestamp() - t1;
-                }
+                    iters *= 2;
+                } while (t1 < oneMicrosecond) ;
 
                 minTLS = Math.Min(minTLS, (double)t1 / iters);
             }
 
             s_processorIdRefreshRate = Math.Min((int)(minID / minTLS), MaxIdRefreshRate);
-            return s_processorIdRefreshRate;
+            return s_processorIdRefreshRate >= 3;
         }
 
         // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
index c0a2e37de65f25..8626aaed38bf5e 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
@@ -498,7 +498,7 @@ public static int GetCurrentProcessorId()
         }
 
         // do a fast check and record in a readonly static so that it could become a JIT constant
-        private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.ProcessorNumberSpeedCheck() <= 3;
+        private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.ProcessorNumberSpeedCheck();
 
         internal void ResetThreadPoolThread()
         {

From 3154533273f3f5f6c587d1531a462757627cc2d8 Mon Sep 17 00:00:00 2001
From: vsadov <vsadov@microsoft.com>
Date: Sat, 14 Dec 2019 13:26:38 -0800
Subject: [PATCH 09/11] setting amortization ratio to 5 couple fixes

---
 .../src/System/Threading/ProcessorIdCache.cs   | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
index 824eb71036d288..27dba76d1eac10 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
@@ -31,7 +31,8 @@ private static int RefreshCurrentProcessorId()
             // doesn't exist on all platforms.  On those it doesn't exist on, GetCurrentProcessorNumber()
             // returns -1.  As a fallback in that case and to spread the threads across the buckets
             // by default, we use the current managed thread ID as a proxy.
-            if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;
+            if (currentProcessorId < 0)
+                currentProcessorId = Environment.CurrentManagedThreadId;
 
             Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask);
 
@@ -62,9 +63,8 @@ internal static bool ProcessorNumberSpeedCheck()
             //       If the resolution, precision or access time to the timer are inadequate for our measures here,
             //       the test will fail anyways.
 
-            // default values pretend to be very long samples with default ratio
-            double minID = Stopwatch.Frequency * 50;  // 50 sec
-            double minTLS = Stopwatch.Frequency;      // 1 sec
+            double minID = double.MaxValue;
+            double minTLS = double.MaxValue;
 
             // warm up the code paths.
             UninlinedThreadStatic();
@@ -75,7 +75,7 @@ internal static bool ProcessorNumberSpeedCheck()
                 return false;
             }
 
-            long oneMicrosecond = Stopwatch.Frequency / 1000000;
+            long oneMicrosecond = Stopwatch.Frequency / 1000000 + 1;
             for (int i = 0; i < 10; i++)
             {
                 // we will measure at least 16 iterations and at least 1 microsecond
@@ -97,23 +97,23 @@ internal static bool ProcessorNumberSpeedCheck()
                 // we will measure at least 1 microsecond,
                 // and use at least 1/2 of ProcID iterations
                 // we assume that TLS can't be more than 2x slower than ProcID
-                iters /= 2;
+                iters /= 4;
                 do
                 {
+                    iters *= 2;
                     t1 = Stopwatch.GetTimestamp();
                     for (int j = 0; j < iters; j++)
                     {
                         UninlinedThreadStatic();
                     }
                     t1 = Stopwatch.GetTimestamp() - t1;
-                    iters *= 2;
                 } while (t1 < oneMicrosecond) ;
 
                 minTLS = Math.Min(minTLS, (double)t1 / iters);
             }
 
-            s_processorIdRefreshRate = Math.Min((int)(minID / minTLS), MaxIdRefreshRate);
-            return s_processorIdRefreshRate >= 3;
+            s_processorIdRefreshRate = Math.Min((int)(minID * 5 / minTLS), MaxIdRefreshRate);
+            return s_processorIdRefreshRate <= 5;
         }
 
         // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.

From f483aa9492e98e45812a5511e0f43ab093054b60 Mon Sep 17 00:00:00 2001
From: vsadov <vsadov@microsoft.com>
Date: Sat, 14 Dec 2019 16:51:53 -0800
Subject: [PATCH 10/11] fix unimplemented `GetCurrentProcessorNumber` case

---
 .../src/System/Threading/ProcessorIdCache.cs                    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
index 27dba76d1eac10..feb231f66c5437 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
@@ -71,7 +71,7 @@ internal static bool ProcessorNumberSpeedCheck()
             // also check if API is actually functional (-1 means not supported)
             if (Thread.GetCurrentProcessorNumber() < 0)
             {
-                s_processorIdRefreshRate = int.MaxValue;
+                s_processorIdRefreshRate = ProcessorIdCacheCountDownMask;
                 return false;
             }
 

From 1618787fa4471c84bbddc86980ba3efdcae9f0d4 Mon Sep 17 00:00:00 2001
From: vsadov <vsadov@microsoft.com>
Date: Sun, 15 Dec 2019 00:15:21 -0800
Subject: [PATCH 11/11] PR feedback

---
 .../src/System/Threading/ProcessorIdCache.cs  | 45 +++++++++++++++----
 .../src/System/Threading/Thread.CoreCLR.cs    |  3 +-
 2 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
index feb231f66c5437..7f11ab5efa8c1a 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
@@ -79,20 +79,20 @@ internal static bool ProcessorNumberSpeedCheck()
             for (int i = 0; i < 10; i++)
             {
                 // we will measure at least 16 iterations and at least 1 microsecond
-                long t1;
+                long t;
                 int iters = 8;
                 do
                 {
                     iters *= 2;
-                    t1 = Stopwatch.GetTimestamp();
+                    t = Stopwatch.GetTimestamp();
                     for (int j = 0; j < iters; j++)
                     {
                         Thread.GetCurrentProcessorNumber();
                     }
-                    t1 = Stopwatch.GetTimestamp() - t1;
-                } while (t1 < oneMicrosecond);
+                    t = Stopwatch.GetTimestamp() - t;
+                } while (t < oneMicrosecond);
 
-                minID = Math.Min(minID, (double)t1 / iters);
+                minID = Math.Min(minID, (double)t / iters);
 
                 // we will measure at least 1 microsecond,
                 // and use at least 1/2 of ProcID iterations
@@ -101,18 +101,45 @@ internal static bool ProcessorNumberSpeedCheck()
                 do
                 {
                     iters *= 2;
-                    t1 = Stopwatch.GetTimestamp();
+                    t = Stopwatch.GetTimestamp();
                     for (int j = 0; j < iters; j++)
                     {
                         UninlinedThreadStatic();
                     }
-                    t1 = Stopwatch.GetTimestamp() - t1;
-                } while (t1 < oneMicrosecond) ;
+                    t = Stopwatch.GetTimestamp() - t;
+                } while (t < oneMicrosecond);
 
-                minTLS = Math.Min(minTLS, (double)t1 / iters);
+                minTLS = Math.Min(minTLS, (double)t / iters);
             }
 
+            // A few words about choosing cache refresh rate:
+            //
+            // There are two reasons why data structures use core affinity:
+            // 1) To improve locality - avoid running on one core and using data in other core's cache.
+            // 2) To reduce sharing - avoid multiple threads using the same piece of data.
+            //
+            // Scenarios with large footprint, like striped caches, are sensitive to both parts. It is desirable to access
+            // large data from the "right" core.
+            // In scenarios where the state is small, like a striped counter, it is mostly about sharing.
+            // Otherwise the state is small and occasionally moving counter to a different core via cache miss is not a big deal.
+            //
+            // In scenarios that care more about sharing, precise results of GetCurrentProcessorNumber may not justify
+            // the cost unless the underlying implementation is very cheap.
+            // In such cases it is desirable to amortize the cost over multiple accesses by caching in a ThreadStatic.
+            //
+            // In addition to the data structure, the benefits also depend on use pattern and on concurrency level.
+            // E.g. if an array pool user only rents an array "just in case" but does not actually use it, and concurrency level is low,
+            // a longer refresh would be beneficial since that could lower the API cost.
+            // If array is actually used, then there is benefit from higher precision of the API and shorter refresh is more attractive.
+            //
+            // Overall we do not know the ideal refresh rate and using some kind of dynamic feedback is unlikely to be feasible.
+            // Experiments have shown, however, that 5x amortization rate is a good enough balance between precision and cost of the API.
             s_processorIdRefreshRate = Math.Min((int)(minID * 5 / minTLS), MaxIdRefreshRate);
+
+            // In the case where GetCurrentProcessorNumber is particularly fast, as happens on platforms supporting the RDPID instruction,
+            // caching is not an improvement, thus it is desirable to bypass the cache entirely.
+            // Such systems consistently derive the refresh rate at or below 2-3, while the next tier, RDTSCP-based implementations, results in ~10,
+            // so we use "5" as a criterion to separate "fast" machines from the rest.
             return s_processorIdRefreshRate <= 5;
         }
 
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
index 8626aaed38bf5e..f2d9fe66f9a53e 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
@@ -497,7 +497,8 @@ public static int GetCurrentProcessorId()
             return ProcessorIdCache.GetCurrentProcessorId();
         }
 
-        // do a fast check and record in a readonly static so that it could become a JIT constant
+        // a speed check will determine refresh rate of the cache and will report if caching is not advisable.
+        // we will record that in a readonly static so that it could become a JIT constant and bypass caching entirely.
         private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.ProcessorNumberSpeedCheck();
 
         internal void ResetThreadPoolThread()