From d7ee11fc7353437a218d369187c09fdfd243d51a Mon Sep 17 00:00:00 2001 From: vsadov Date: Mon, 2 Dec 2019 17:00:38 -0800 Subject: [PATCH 01/11] Adjusting `GetCurrentProcessorId` caching to different environments. --- .../src/System/Threading/Thread.CoreCLR.cs | 176 ++++++++++++++++-- 1 file changed, 162 insertions(+), 14 deletions(-) diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs index 32b6ac43e0ffab..ed36f2d2d4813c 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs @@ -486,19 +486,31 @@ private static int CalculateOptimalMaxSpinWaitsPerSpinIteration() [MethodImpl(MethodImplOptions.InternalCall)] private static extern int GetCurrentProcessorNumber(); - // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of - // the t_currentProcessorIdCache are counting down to get it periodically refreshed. - // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar - // actions that are likely to result in changing the executing core - [ThreadStatic] - private static int t_currentProcessorIdCache; + // t_currentProcessorId lives in a separate class to make sure the class is fully initialized by the time we use the field + private class CoreIdCache + { + // The upper bits of t_currentProcessorId are the currentProcessorId. The lower bits of + // the t_currentProcessorIdCache are counting down to get it periodically refreshed. 
+ // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar + // actions that are likely to result in changing the executing core + [ThreadStatic] + internal static int t_currentProcessorId; + } private const int ProcessorIdCacheShift = 16; private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1; - private const int ProcessorIdRefreshRate = 5000; + // 50 is our best guess. + // Based on further calibration it is likley to be adjusted lower. + // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number. + private static int ProcessorIdRefreshRate = 50; + // We will not adjust higher than this though. + private const int MaxIdRefreshRate = 5000; private static int RefreshCurrentProcessorId() { + if (sCalibrationSamples != null) + CalibrateOnce(); + int currentProcessorId = GetCurrentProcessorNumber(); // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which @@ -507,23 +519,23 @@ private static int RefreshCurrentProcessorId() // by default, we use the current managed thread ID as a proxy. if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId; - // Add offset to make it clear that it is not guaranteed to be 0-based processor number - currentProcessorId += 100; - Debug.Assert(ProcessorIdRefreshRate <= ProcessorIdCacheCountDownMask); // Mask with int.MaxValue to ensure the execution Id is not negative - t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate; + CoreIdCache.t_currentProcessorId = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate; return currentProcessorId; } - // Cached processor id used as a hint for which per-core stack to access. It is periodically - // refreshed to trail the actual thread core affinity. 
+ // Cached processor id could be used as a hint for which per-core stripe of data to access to avoid sharing. + // It is periodically refreshed to trail the actual thread core affinity. [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int GetCurrentProcessorId() { - int currentProcessorIdCache = t_currentProcessorIdCache--; + if (IsCoreIdReallyFast || ProcessorIdRefreshRate <= 2) + return GetCurrentProcessorNumber(); + + int currentProcessorIdCache = CoreIdCache.t_currentProcessorId--; if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0) { return RefreshCurrentProcessorId(); @@ -532,6 +544,142 @@ public static int GetCurrentProcessorId() return currentProcessorIdCache >> ProcessorIdCacheShift; } + // do a fast check and record in a readonly static so that it could become a JIT constant + internal static readonly bool IsCoreIdReallyFast = SimpleCoreIdSpeedCheck(); + + // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false. + // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch. + private static bool SimpleCoreIdSpeedCheck() + { + // warm up the code paths. + int id = UninlinedThreadStatic() | GetCurrentProcessorNumber(); + long _05usec = Stopwatch.Frequency / 1000000; + + // limit quick test to 100 microseconds. + // If we are on slow hardware, we should calibrate anyways. + long limit = Stopwatch.Frequency / 10000 + Stopwatch.GetTimestamp(); + for (int i = 0; i < 10; i++) + { + int iters = 1; + long t1 = 0; + // double the sample size until it is 0.5 usec. 
+ while (t1 < _05usec) + { + iters *= 2; + t1 = Stopwatch.GetTimestamp(); + for (int j = 0; j < iters; j++) + { + id = GetCurrentProcessorNumber(); + } + t1 = Stopwatch.GetTimestamp() - t1; + } + + // assuming TLS cannot be a lot slower than getting ID, this should take 1-5 usec + long t2 = Stopwatch.GetTimestamp(); + for (int j = 0; j < iters; j++) + { + UninlinedThreadStatic(); + } + long t3 = Stopwatch.GetTimestamp(); + + // if getting ID took longer than 2x TLS access, we should consider caching. + if (t3 > limit || (t3 - t2) * 2 < t1) + { + return false; + } + } + + // Make sure the result was not negative, which would indicate "Not Supported" + return id >= 0; + } + + // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access. + // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching. + [MethodImpl(MethodImplOptions.NoInlining)] + private static int UninlinedThreadStatic() + { + return CoreIdCache.t_currentProcessorId; + } + + // We must collect multiple samples to account for irregularities caused by GC and context switches. + // Why we keep an array of samples and do not adjust as we go: + // We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all + // the calibration work to happen much sooner. There is no urgency in being calibrated while the app + // is in start-up mode. That would just add to the "rush hour" traffic. + private static int sCalibrationToDo; + private static int sCalibrationDone; + // 25 is chosen to budget the sampling under 100 msec total, assuming 4 msec per sample. 
+ private const int CalibrationSamples = 25; + private static double[] sCalibrationSamples = new double[CalibrationSamples * 2]; + + private static void CalibrateOnce() + { + if (sCalibrationToDo >= CalibrationSamples) + return; + + int sample = Interlocked.Increment(ref sCalibrationToDo) - 1; + if (sample >= CalibrationSamples) + return; + + // Actual calibration step. Let's try to fit into ~4 msec. + double[] calibrationState = sCalibrationSamples; + + int id = 0; + long t1 = 0; + long _1msec = Stopwatch.Frequency / 1000; + int iters = 1; + + // double the sample size until it is 1 msec. + // we may spend up to 3 msec in this loop in a worst case. + while (t1 < _1msec) + { + iters *= 2; + t1 = Stopwatch.GetTimestamp(); + for (int i = 0; i < iters; i++) + { + id = GetCurrentProcessorNumber(); + } + t1 = Stopwatch.GetTimestamp() - t1; + } + + // assuming TLS cannot be a lot slower than ID, this should take 1 msec + long t2 = Stopwatch.GetTimestamp(); + for (int i = 0; i < iters; i++) + { + UninlinedThreadStatic(); + } + long t3 = Stopwatch.GetTimestamp(); + + // if we have useful measurements, record a sample + if (id >= 0 && t1 > 0 && t3 - t2 > 0) + { + calibrationState[sample * 2] = (double)t1 / iters; // ID + calibrationState[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS + } + else + { + // API is not functional or clock did not go forward. + // just pretend it was a very expensive sample with default ratio. + calibrationState[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec; + calibrationState[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec + } + + // If this was the last sample computed, get best times and update the ratio of ID to TLS. 
+ if (Interlocked.Increment(ref sCalibrationDone) == CalibrationSamples) + { + double idMin = double.MaxValue; + double tlsMin = double.MaxValue; + for (int i = 0; i < CalibrationSamples; i++) + { + idMin = Math.Min(idMin, calibrationState[i * 2]); //ID + tlsMin = Math.Min(tlsMin, calibrationState[i * 2 + 1]); //TLS + } + + sCalibrationSamples = null!; + ProcessorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin)); + } + } + internal void ResetThreadPoolThread() { // Currently implemented in unmanaged method Thread::InternalReset and From 9ffc3351a3ad2abe1cd5552ee447c8b4f7efee75 Mon Sep 17 00:00:00 2001 From: vsadov Date: Tue, 3 Dec 2019 19:22:40 -0800 Subject: [PATCH 02/11] Addressed some PR comments (style, code structure). --- .../src/System/Threading/Thread.CoreCLR.cs | 265 +++++++++--------- 1 file changed, 138 insertions(+), 127 deletions(-) diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs index ed36f2d2d4813c..60d337183ee402 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs @@ -486,45 +486,145 @@ private static int CalculateOptimalMaxSpinWaitsPerSpinIteration() [MethodImpl(MethodImplOptions.InternalCall)] private static extern int GetCurrentProcessorNumber(); - // t_currentProcessorId lives in a separate class to make sure the class is fully initialized by the time we use the field - private class CoreIdCache + private static class CoreIdCache { - // The upper bits of t_currentProcessorId are the currentProcessorId. The lower bits of + // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of // the t_currentProcessorIdCache are counting down to get it periodically refreshed. 
// TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar // actions that are likely to result in changing the executing core [ThreadStatic] - internal static int t_currentProcessorId; - } + private static int t_currentProcessorIdCache; + + private const int ProcessorIdCacheShift = 16; + private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1; + // 50 is our best guess. + // Based on further calibration it is likley to be adjusted lower. + // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number. + private static int s_processorIdRefreshRate = 50; + // We will not adjust higher than this though. + private const int MaxIdRefreshRate = 5000; + + private static int RefreshCurrentProcessorId() + { + double[]? calibrationSamples = s_CalibrationSamples; + if (calibrationSamples != null) + CalibrateOnce(calibrationSamples); - private const int ProcessorIdCacheShift = 16; - private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1; - // 50 is our best guess. - // Based on further calibration it is likley to be adjusted lower. - // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number. - private static int ProcessorIdRefreshRate = 50; - // We will not adjust higher than this though. - private const int MaxIdRefreshRate = 5000; + int currentProcessorId = GetCurrentProcessorNumber(); - private static int RefreshCurrentProcessorId() - { - if (sCalibrationSamples != null) - CalibrateOnce(); + // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which + // doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber() + // returns -1. As a fallback in that case and to spread the threads across the buckets + // by default, we use the current managed thread ID as a proxy. 
+ if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId; - int currentProcessorId = GetCurrentProcessorNumber(); + Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask); - // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which - // doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber() - // returns -1. As a fallback in that case and to spread the threads across the buckets - // by default, we use the current managed thread ID as a proxy. - if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId; + // Mask with int.MaxValue to ensure the execution Id is not negative + t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | s_processorIdRefreshRate; - Debug.Assert(ProcessorIdRefreshRate <= ProcessorIdCacheCountDownMask); + return currentProcessorId; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int GetCurrentProcessorId() + { + if (s_processorIdRefreshRate <= 2) + return GetCurrentProcessorNumber(); + + int currentProcessorIdCache = t_currentProcessorIdCache--; + if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0) + { + return RefreshCurrentProcessorId(); + } + + return currentProcessorIdCache >> ProcessorIdCacheShift; + } + + // We must collect multiple samples to account for irregularities caused by GC and context switches. + // Why we keep an array of samples and do not adjust as we go: + // We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all + // the calibration work to happen much sooner. There is no urgency in being calibrated while the app + // is in start-up mode. That would just add to the "rush hour" traffic. + private static int s_CalibrationToDo; + private static int s_CalibrationDone; + // 25 is chosen to budget the sampling under 100 msec total, assuming 4 msec per sample. 
+ private const int CalibrationSampleCount = 25; + private static double[]? s_CalibrationSamples = new double[CalibrationSampleCount * 2]; + + private static void CalibrateOnce(double[] calibrationSamples) + { + if (s_CalibrationToDo >= CalibrationSampleCount) + return; + + int sample = Interlocked.Increment(ref s_CalibrationToDo) - 1; + if (sample >= CalibrationSampleCount) + return; + + // Actual calibration step. Let's try to fit into ~4 msec. + int id = 0; + long t1 = 0; + long oneMillisecond = Stopwatch.Frequency / 1000; + int iters = 1; + + // double the sample size until it is 1 msec. + // we may spend up to 3 msec in this loop in a worst case. + while (t1 < oneMillisecond) + { + iters *= 2; + t1 = Stopwatch.GetTimestamp(); + for (int i = 0; i < iters; i++) + { + id = GetCurrentProcessorNumber(); + } + t1 = Stopwatch.GetTimestamp() - t1; + } + + // assuming TLS cannot be a lot slower than ID, this should take 1 msec + long t2 = Stopwatch.GetTimestamp(); + for (int i = 0; i < iters; i++) + { + UninlinedThreadStatic(); + } + long t3 = Stopwatch.GetTimestamp(); + + // if we have useful measurements, record a sample + if (id >= 0 && t1 > 0 && t3 - t2 > 0) + { + calibrationSamples[sample * 2] = (double)t1 / iters; // ID + calibrationSamples[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS + } + else + { + // API is not functional or clock did not go forward. + // just pretend it was a very expensive sample with default ratio. + calibrationSamples[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec; + calibrationSamples[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec + } + + // If this was the last sample computed, get best times and update the ratio of ID to TLS. 
+ if (Interlocked.Increment(ref s_CalibrationDone) == CalibrationSampleCount) + { + double idMin = double.MaxValue; + double tlsMin = double.MaxValue; + for (int i = 0; i < CalibrationSampleCount; i++) + { + idMin = Math.Min(idMin, calibrationSamples[i * 2]); //ID + tlsMin = Math.Min(tlsMin, calibrationSamples[i * 2 + 1]); //TLS + } - // Mask with int.MaxValue to ensure the execution Id is not negative - CoreIdCache.t_currentProcessorId = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate; + s_CalibrationSamples = null; + s_processorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin)); + } + } - return currentProcessorId; + // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access. + // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching. + [MethodImpl(MethodImplOptions.NoInlining)] + internal static int UninlinedThreadStatic() + { + return t_currentProcessorIdCache; + } } // Cached processor id could be used as a hint for which per-core stripe of data to access to avoid sharing. 
@@ -532,28 +632,26 @@ private static int RefreshCurrentProcessorId() [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int GetCurrentProcessorId() { - if (IsCoreIdReallyFast || ProcessorIdRefreshRate <= 2) + if (s_isCoreIdReallyFast) return GetCurrentProcessorNumber(); - int currentProcessorIdCache = CoreIdCache.t_currentProcessorId--; - if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0) - { - return RefreshCurrentProcessorId(); - } - - return currentProcessorIdCache >> ProcessorIdCacheShift; + return CoreIdCache.GetCurrentProcessorId(); } // do a fast check and record in a readonly static so that it could become a JIT constant - internal static readonly bool IsCoreIdReallyFast = SimpleCoreIdSpeedCheck(); + private static readonly bool s_isCoreIdReallyFast = SimpleCoreIdSpeedCheck(); // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false. // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch. private static bool SimpleCoreIdSpeedCheck() { + // NOTE: We do not check the frequency of the Stopwatch. + // If the resolution, precision or access time to the timer are inadequate for our measures here, + // the test will fail anyways. + // warm up the code paths. - int id = UninlinedThreadStatic() | GetCurrentProcessorNumber(); - long _05usec = Stopwatch.Frequency / 1000000; + int id = CoreIdCache.UninlinedThreadStatic() | GetCurrentProcessorNumber(); + long halfMicrosecond = Stopwatch.Frequency / 1000000; // limit quick test to 100 microseconds. // If we are on slow hardware, we should calibrate anyways. @@ -563,7 +661,7 @@ private static bool SimpleCoreIdSpeedCheck() int iters = 1; long t1 = 0; // double the sample size until it is 0.5 usec. 
- while (t1 < _05usec) + while (t1 < halfMicrosecond) { iters *= 2; t1 = Stopwatch.GetTimestamp(); @@ -578,7 +676,7 @@ private static bool SimpleCoreIdSpeedCheck() long t2 = Stopwatch.GetTimestamp(); for (int j = 0; j < iters; j++) { - UninlinedThreadStatic(); + CoreIdCache.UninlinedThreadStatic(); } long t3 = Stopwatch.GetTimestamp(); @@ -593,93 +691,6 @@ private static bool SimpleCoreIdSpeedCheck() return id >= 0; } - // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access. - // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching. - [MethodImpl(MethodImplOptions.NoInlining)] - private static int UninlinedThreadStatic() - { - return CoreIdCache.t_currentProcessorId; - } - - // We must collect multiple samples to account for irregularities caused by GC and context switches. - // Why we keep an array of samples and do not adjust as we go: - // We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all - // the calibration work to happen much sooner. There is no urgency in being calibrated while the app - // is in start-up mode. That would just add to the "rush hour" traffic. - private static int sCalibrationToDo; - private static int sCalibrationDone; - // 25 is chosen to budget the sampling under 100 msec total, assuming 4 msec per sample. - private const int CalibrationSamples = 25; - private static double[] sCalibrationSamples = new double[CalibrationSamples * 2]; - - private static void CalibrateOnce() - { - if (sCalibrationToDo >= CalibrationSamples) - return; - - int sample = Interlocked.Increment(ref sCalibrationToDo) - 1; - if (sample >= CalibrationSamples) - return; - - // Actual calibration step. Let's try to fit into ~4 msec. - double[] calibrationState = sCalibrationSamples; - - int id = 0; - long t1 = 0; - long _1msec = Stopwatch.Frequency / 1000; - int iters = 1; - - // double the sample size until it is 1 msec. 
- // we may spend up to 3 msec in this loop in a worst case. - while (t1 < _1msec) - { - iters *= 2; - t1 = Stopwatch.GetTimestamp(); - for (int i = 0; i < iters; i++) - { - id = GetCurrentProcessorNumber(); - } - t1 = Stopwatch.GetTimestamp() - t1; - } - - // assuming TLS cannot be a lot slower than ID, this should take 1 msec - long t2 = Stopwatch.GetTimestamp(); - for (int i = 0; i < iters; i++) - { - UninlinedThreadStatic(); - } - long t3 = Stopwatch.GetTimestamp(); - - // if we have useful measurements, record a sample - if (id >= 0 && t1 > 0 && t3 - t2 > 0) - { - calibrationState[sample * 2] = (double)t1 / iters; // ID - calibrationState[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS - } - else - { - // API is not functional or clock did not go forward. - // just pretend it was a very expensive sample with default ratio. - calibrationState[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec; - calibrationState[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec - } - - // If this was the last sample computed, get best times and update the ratio of ID to TLS. 
- if (Interlocked.Increment(ref sCalibrationDone) == CalibrationSamples) - { - double idMin = double.MaxValue; - double tlsMin = double.MaxValue; - for (int i = 0; i < CalibrationSamples; i++) - { - idMin = Math.Min(idMin, calibrationState[i * 2]); //ID - tlsMin = Math.Min(tlsMin, calibrationState[i * 2 + 1]); //TLS - } - - sCalibrationSamples = null!; - ProcessorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin)); - } - } - internal void ResetThreadPoolThread() { // Currently implemented in unmanaged method Thread::InternalReset and From df36e9db936292cf0d78db6e70bc5b1bee0b9af2 Mon Sep 17 00:00:00 2001 From: vsadov Date: Tue, 3 Dec 2019 23:58:18 -0800 Subject: [PATCH 03/11] fitting calibration under 5 msec total fixed some comments --- .../src/System/Threading/Thread.CoreCLR.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs index 60d337183ee402..b182b0930f05b4 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs @@ -548,8 +548,8 @@ internal static int GetCurrentProcessorId() // is in start-up mode. That would just add to the "rush hour" traffic. private static int s_CalibrationToDo; private static int s_CalibrationDone; - // 25 is chosen to budget the sampling under 100 msec total, assuming 4 msec per sample. - private const int CalibrationSampleCount = 25; + // 10 is chosen to budget the sampling under 5 msec total, assuming 0.5 msec per sample. + private const int CalibrationSampleCount = 10; private static double[]? 
s_CalibrationSamples = new double[CalibrationSampleCount * 2]; private static void CalibrateOnce(double[] calibrationSamples) @@ -561,15 +561,15 @@ private static void CalibrateOnce(double[] calibrationSamples) if (sample >= CalibrationSampleCount) return; - // Actual calibration step. Let's try to fit into ~4 msec. + // Actual calibration step. Let's try to fit into ~50 usec. int id = 0; long t1 = 0; - long oneMillisecond = Stopwatch.Frequency / 1000; + long twentyMicrosecond = Stopwatch.Frequency / 50000; int iters = 1; // double the sample size until it is 1 msec. - // we may spend up to 3 msec in this loop in a worst case. - while (t1 < oneMillisecond) + // we may spend up to 40 usec in this loop in a worst case. + while (t1 < twentyMicrosecond) { iters *= 2; t1 = Stopwatch.GetTimestamp(); @@ -580,7 +580,7 @@ private static void CalibrateOnce(double[] calibrationSamples) t1 = Stopwatch.GetTimestamp() - t1; } - // assuming TLS cannot be a lot slower than ID, this should take 1 msec + // assuming TLS takes 1/2 of CoreID time or less, this should take 10 usec or less long t2 = Stopwatch.GetTimestamp(); for (int i = 0; i < iters; i++) { @@ -651,17 +651,17 @@ private static bool SimpleCoreIdSpeedCheck() // warm up the code paths. int id = CoreIdCache.UninlinedThreadStatic() | GetCurrentProcessorNumber(); - long halfMicrosecond = Stopwatch.Frequency / 1000000; + long oneMicrosecond = Stopwatch.Frequency / 1000000; - // limit quick test to 100 microseconds. + // this loop should take < 50 usec. limit it to 100 usec just in case. // If we are on slow hardware, we should calibrate anyways. long limit = Stopwatch.Frequency / 10000 + Stopwatch.GetTimestamp(); for (int i = 0; i < 10; i++) { int iters = 1; long t1 = 0; - // double the sample size until it is 0.5 usec. - while (t1 < halfMicrosecond) + // double the sample size until it is 1 usec. 
+ while (t1 < oneMicrosecond) { iters *= 2; t1 = Stopwatch.GetTimestamp(); @@ -672,7 +672,7 @@ private static bool SimpleCoreIdSpeedCheck() t1 = Stopwatch.GetTimestamp() - t1; } - // assuming TLS cannot be a lot slower than getting ID, this should take 1-5 usec + // assuming TLS cannot be a lot slower than getting ID, this should take 1-2 usec long t2 = Stopwatch.GetTimestamp(); for (int j = 0; j < iters; j++) { From f911773ecbe56d5c88156e9090e15fe0dbe29359 Mon Sep 17 00:00:00 2001 From: vsadov Date: Wed, 4 Dec 2019 10:30:15 -0800 Subject: [PATCH 04/11] moved ProcessorIdCache to a separate file. --- .../System.Private.CoreLib.csproj | 1 + .../src/System/Threading/ProcessorIdCache.cs | 150 +++++++++++++++++ .../src/System/Threading/Thread.CoreCLR.cs | 155 +----------------- 3 files changed, 158 insertions(+), 148 deletions(-) create mode 100644 src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs diff --git a/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj b/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj index 58b4d5fac6c028..38a1209abc0ba3 100644 --- a/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj +++ b/src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj @@ -274,6 +274,7 @@ + diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs new file mode 100644 index 00000000000000..f09079b8cd1786 --- /dev/null +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs @@ -0,0 +1,150 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Diagnostics; +using System.Runtime.CompilerServices; + +namespace System.Threading +{ + internal static class ProcessorIdCache + { + // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of + // the t_currentProcessorIdCache are counting down to get it periodically refreshed. + // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar + // actions that are likely to result in changing the executing core + [ThreadStatic] + private static int t_currentProcessorIdCache; + + private const int ProcessorIdCacheShift = 16; + private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1; + // 50 is our best guess. + // Based on further calibration it is likley to be adjusted lower. + // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number. + private static int s_processorIdRefreshRate = 50; + // We will not adjust higher than this though. + private const int MaxIdRefreshRate = 5000; + + private static int RefreshCurrentProcessorId() + { + double[]? calibrationSamples = s_CalibrationSamples; + if (calibrationSamples != null) + CalibrateOnce(calibrationSamples); + + int currentProcessorId = Thread.GetCurrentProcessorNumber(); + + // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which + // doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber() + // returns -1. As a fallback in that case and to spread the threads across the buckets + // by default, we use the current managed thread ID as a proxy. 
+ if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId; + + Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask); + + // Mask with int.MaxValue to ensure the execution Id is not negative + t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | s_processorIdRefreshRate; + + return currentProcessorId; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int GetCurrentProcessorId() + { + if (s_processorIdRefreshRate <= 2) + return Thread.GetCurrentProcessorNumber(); + + int currentProcessorIdCache = t_currentProcessorIdCache--; + if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0) + { + return RefreshCurrentProcessorId(); + } + + return currentProcessorIdCache >> ProcessorIdCacheShift; + } + + // We must collect multiple samples to account for irregularities caused by GC and context switches. + // Why we keep an array of samples and do not adjust as we go: + // We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all + // the calibration work to happen much sooner. There is no urgency in being calibrated while the app + // is in start-up mode. That would just add to the "rush hour" traffic. + private static int s_CalibrationToDo; + private static int s_CalibrationDone; + // 10 is chosen to budget the sampling under 5 msec total, assuming 0.5 msec per sample. + private const int CalibrationSampleCount = 10; + private static double[]? s_CalibrationSamples = new double[CalibrationSampleCount * 2]; + + private static void CalibrateOnce(double[] calibrationSamples) + { + if (s_CalibrationToDo >= CalibrationSampleCount) + return; + + int sample = Interlocked.Increment(ref s_CalibrationToDo) - 1; + if (sample >= CalibrationSampleCount) + return; + + // Actual calibration step. Let's try to fit into ~50 usec. 
+ int id = 0; + long t1 = 0; + long twentyMicrosecond = Stopwatch.Frequency / 50000; + int iters = 1; + + // double the sample size until it is 1 msec. + // we may spend up to 40 usec in this loop in a worst case. + while (t1 < twentyMicrosecond) + { + iters *= 2; + t1 = Stopwatch.GetTimestamp(); + for (int i = 0; i < iters; i++) + { + id = Thread.GetCurrentProcessorNumber(); + } + t1 = Stopwatch.GetTimestamp() - t1; + } + + // assuming TLS takes 1/2 of ProcessorNumber time or less, this should take 10 usec or less + long t2 = Stopwatch.GetTimestamp(); + for (int i = 0; i < iters; i++) + { + UninlinedThreadStatic(); + } + long t3 = Stopwatch.GetTimestamp(); + + // if we have useful measurements, record a sample + if (id >= 0 && t1 > 0 && t3 - t2 > 0) + { + calibrationSamples[sample * 2] = (double)t1 / iters; // ID + calibrationSamples[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS + } + else + { + // API is not functional or clock did not go forward. + // just pretend it was a very expensive sample with default ratio. + calibrationSamples[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec; + calibrationSamples[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec + } + + // If this was the last sample computed, get best times and update the ratio of ID to TLS. + if (Interlocked.Increment(ref s_CalibrationDone) == CalibrationSampleCount) + { + double idMin = double.MaxValue; + double tlsMin = double.MaxValue; + for (int i = 0; i < CalibrationSampleCount; i++) + { + idMin = Math.Min(idMin, calibrationSamples[i * 2]); //ID + tlsMin = Math.Min(tlsMin, calibrationSamples[i * 2 + 1]); //TLS + } + + s_CalibrationSamples = null; + s_processorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin)); + } + } + + // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access. + // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching. 
+ [MethodImpl(MethodImplOptions.NoInlining)] + internal static int UninlinedThreadStatic() + { + return t_currentProcessorIdCache; + } + } +} diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs index b182b0930f05b4..4fc4278e35bd33 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs @@ -484,173 +484,32 @@ private static int CalculateOptimalMaxSpinWaitsPerSpinIteration() } [MethodImpl(MethodImplOptions.InternalCall)] - private static extern int GetCurrentProcessorNumber(); - - private static class CoreIdCache - { - // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of - // the t_currentProcessorIdCache are counting down to get it periodically refreshed. - // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar - // actions that are likely to result in changing the executing core - [ThreadStatic] - private static int t_currentProcessorIdCache; - - private const int ProcessorIdCacheShift = 16; - private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1; - // 50 is our best guess. - // Based on further calibration it is likley to be adjusted lower. - // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number. - private static int s_processorIdRefreshRate = 50; - // We will not adjust higher than this though. - private const int MaxIdRefreshRate = 5000; - - private static int RefreshCurrentProcessorId() - { - double[]? 
calibrationSamples = s_CalibrationSamples; - if (calibrationSamples != null) - CalibrateOnce(calibrationSamples); - - int currentProcessorId = GetCurrentProcessorNumber(); - - // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which - // doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber() - // returns -1. As a fallback in that case and to spread the threads across the buckets - // by default, we use the current managed thread ID as a proxy. - if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId; - - Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask); - - // Mask with int.MaxValue to ensure the execution Id is not negative - t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | s_processorIdRefreshRate; - - return currentProcessorId; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int GetCurrentProcessorId() - { - if (s_processorIdRefreshRate <= 2) - return GetCurrentProcessorNumber(); - - int currentProcessorIdCache = t_currentProcessorIdCache--; - if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0) - { - return RefreshCurrentProcessorId(); - } - - return currentProcessorIdCache >> ProcessorIdCacheShift; - } - - // We must collect multiple samples to account for irregularities caused by GC and context switches. - // Why we keep an array of samples and do not adjust as we go: - // We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all - // the calibration work to happen much sooner. There is no urgency in being calibrated while the app - // is in start-up mode. That would just add to the "rush hour" traffic. - private static int s_CalibrationToDo; - private static int s_CalibrationDone; - // 10 is chosen to budget the sampling under 5 msec total, assuming 0.5 msec per sample. 
- private const int CalibrationSampleCount = 10; - private static double[]? s_CalibrationSamples = new double[CalibrationSampleCount * 2]; - - private static void CalibrateOnce(double[] calibrationSamples) - { - if (s_CalibrationToDo >= CalibrationSampleCount) - return; - - int sample = Interlocked.Increment(ref s_CalibrationToDo) - 1; - if (sample >= CalibrationSampleCount) - return; - - // Actual calibration step. Let's try to fit into ~50 usec. - int id = 0; - long t1 = 0; - long twentyMicrosecond = Stopwatch.Frequency / 50000; - int iters = 1; - - // double the sample size until it is 1 msec. - // we may spend up to 40 usec in this loop in a worst case. - while (t1 < twentyMicrosecond) - { - iters *= 2; - t1 = Stopwatch.GetTimestamp(); - for (int i = 0; i < iters; i++) - { - id = GetCurrentProcessorNumber(); - } - t1 = Stopwatch.GetTimestamp() - t1; - } - - // assuming TLS takes 1/2 of CoreID time or less, this should take 10 usec or less - long t2 = Stopwatch.GetTimestamp(); - for (int i = 0; i < iters; i++) - { - UninlinedThreadStatic(); - } - long t3 = Stopwatch.GetTimestamp(); - - // if we have useful measurements, record a sample - if (id >= 0 && t1 > 0 && t3 - t2 > 0) - { - calibrationSamples[sample * 2] = (double)t1 / iters; // ID - calibrationSamples[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS - } - else - { - // API is not functional or clock did not go forward. - // just pretend it was a very expensive sample with default ratio. - calibrationSamples[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec; - calibrationSamples[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec - } - - // If this was the last sample computed, get best times and update the ratio of ID to TLS. 
- if (Interlocked.Increment(ref s_CalibrationDone) == CalibrationSampleCount) - { - double idMin = double.MaxValue; - double tlsMin = double.MaxValue; - for (int i = 0; i < CalibrationSampleCount; i++) - { - idMin = Math.Min(idMin, calibrationSamples[i * 2]); //ID - tlsMin = Math.Min(tlsMin, calibrationSamples[i * 2 + 1]); //TLS - } - - s_CalibrationSamples = null; - s_processorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin)); - } - } - - // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access. - // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching. - [MethodImpl(MethodImplOptions.NoInlining)] - internal static int UninlinedThreadStatic() - { - return t_currentProcessorIdCache; - } - } + internal static extern int GetCurrentProcessorNumber(); // Cached processor id could be used as a hint for which per-core stripe of data to access to avoid sharing. // It is periodically refreshed to trail the actual thread core affinity. [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int GetCurrentProcessorId() { - if (s_isCoreIdReallyFast) + if (s_isProcessorNumberReallyFast) return GetCurrentProcessorNumber(); - return CoreIdCache.GetCurrentProcessorId(); + return ProcessorIdCache.GetCurrentProcessorId(); } // do a fast check and record in a readonly static so that it could become a JIT constant - private static readonly bool s_isCoreIdReallyFast = SimpleCoreIdSpeedCheck(); + private static readonly bool s_isProcessorNumberReallyFast = SimpleProcessorNumberSpeedCheck(); // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false. // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch. - private static bool SimpleCoreIdSpeedCheck() + private static bool SimpleProcessorNumberSpeedCheck() { // NOTE: We do not check the frequency of the Stopwatch. 
// If the resolution, precision or access time to the timer are inadequate for our measures here, // the test will fail anyways. // warm up the code paths. - int id = CoreIdCache.UninlinedThreadStatic() | GetCurrentProcessorNumber(); + int id = ProcessorIdCache.UninlinedThreadStatic() | GetCurrentProcessorNumber(); long oneMicrosecond = Stopwatch.Frequency / 1000000; // this loop should take < 50 usec. limit it to 100 usec just in case. @@ -676,7 +535,7 @@ private static bool SimpleCoreIdSpeedCheck() long t2 = Stopwatch.GetTimestamp(); for (int j = 0; j < iters; j++) { - CoreIdCache.UninlinedThreadStatic(); + ProcessorIdCache.UninlinedThreadStatic(); } long t3 = Stopwatch.GetTimestamp(); From bb4847f48fc4fa440306920aa5831a933c51d773 Mon Sep 17 00:00:00 2001 From: vsadov Date: Thu, 5 Dec 2019 18:32:57 -0800 Subject: [PATCH 05/11] increased "fast" threshold to 3x and moved SimpleProcessorNumberSpeedCheck to ProcessorIDCache.cs --- .../src/System/Threading/ProcessorIdCache.cs | 53 +++++++++++++++++-- .../src/System/Threading/Thread.CoreCLR.cs | 52 +----------------- 2 files changed, 51 insertions(+), 54 deletions(-) diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs index f09079b8cd1786..ef0eb3d893ba55 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs @@ -50,9 +50,6 @@ private static int RefreshCurrentProcessorId() [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static int GetCurrentProcessorId() { - if (s_processorIdRefreshRate <= 2) - return Thread.GetCurrentProcessorNumber(); - int currentProcessorIdCache = t_currentProcessorIdCache--; if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0) { @@ -139,6 +136,56 @@ private static void CalibrateOnce(double[] calibrationSamples) } } + // If 
GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false. + // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch. + internal static bool SimpleProcessorNumberSpeedCheck() + { + // NOTE: We do not check the frequency of the Stopwatch. + // If the resolution, precision or access time to the timer are inadequate for our measures here, + // the test will fail anyways. + + // warm up the code paths. + int id = UninlinedThreadStatic() | Thread.GetCurrentProcessorNumber(); + long oneMicrosecond = Stopwatch.Frequency / 1000000; + + // this loop should take < 50 usec. limit it to 100 usec just in case. + // If we are on slow hardware, we should calibrate anyways. + long limit = Stopwatch.Frequency / 10000 + Stopwatch.GetTimestamp(); + for (int i = 0; i < 10; i++) + { + int iters = 1; + long t1 = 0; + // double the sample size until it is 1 usec. + while (t1 < oneMicrosecond) + { + iters *= 2; + t1 = Stopwatch.GetTimestamp(); + for (int j = 0; j < iters; j++) + { + id = Thread.GetCurrentProcessorNumber(); + } + t1 = Stopwatch.GetTimestamp() - t1; + } + + // assuming TLS cannot be a lot slower than getting ID, this should take 1-2 usec + long t2 = Stopwatch.GetTimestamp(); + for (int j = 0; j < iters; j++) + { + UninlinedThreadStatic(); + } + long t3 = Stopwatch.GetTimestamp(); + + // if getting ID took longer than 3x TLS access, we should consider caching. + if (t3 > limit || (t3 - t2) * 3 < t1) + { + return false; + } + } + + // Make sure the result was not negative, which would indicate "Not Supported" + return id >= 0; + } + // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access. // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching. 
[MethodImpl(MethodImplOptions.NoInlining)] diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs index 4fc4278e35bd33..3c17b39c2b558b 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs @@ -498,57 +498,7 @@ public static int GetCurrentProcessorId() } // do a fast check and record in a readonly static so that it could become a JIT constant - private static readonly bool s_isProcessorNumberReallyFast = SimpleProcessorNumberSpeedCheck(); - - // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false. - // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch. - private static bool SimpleProcessorNumberSpeedCheck() - { - // NOTE: We do not check the frequency of the Stopwatch. - // If the resolution, precision or access time to the timer are inadequate for our measures here, - // the test will fail anyways. - - // warm up the code paths. - int id = ProcessorIdCache.UninlinedThreadStatic() | GetCurrentProcessorNumber(); - long oneMicrosecond = Stopwatch.Frequency / 1000000; - - // this loop should take < 50 usec. limit it to 100 usec just in case. - // If we are on slow hardware, we should calibrate anyways. - long limit = Stopwatch.Frequency / 10000 + Stopwatch.GetTimestamp(); - for (int i = 0; i < 10; i++) - { - int iters = 1; - long t1 = 0; - // double the sample size until it is 1 usec. 
- while (t1 < oneMicrosecond) - { - iters *= 2; - t1 = Stopwatch.GetTimestamp(); - for (int j = 0; j < iters; j++) - { - id = GetCurrentProcessorNumber(); - } - t1 = Stopwatch.GetTimestamp() - t1; - } - - // assuming TLS cannot be a lot slower than getting ID, this should take 1-2 usec - long t2 = Stopwatch.GetTimestamp(); - for (int j = 0; j < iters; j++) - { - ProcessorIdCache.UninlinedThreadStatic(); - } - long t3 = Stopwatch.GetTimestamp(); - - // if getting ID took longer than 2x TLS access, we should consider caching. - if (t3 > limit || (t3 - t2) * 2 < t1) - { - return false; - } - } - - // Make sure the result was not negative, which would indicate "Not Supported" - return id >= 0; - } + private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.SimpleProcessorNumberSpeedCheck(); internal void ResetThreadPoolThread() { From f36c13b5c9990e1f214ddd128464bd0446541672 Mon Sep 17 00:00:00 2001 From: vsadov Date: Fri, 6 Dec 2019 17:56:23 -0800 Subject: [PATCH 06/11] Drop the sample array when quick check is a pass. --- .../src/System/Threading/ProcessorIdCache.cs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs index ef0eb3d893ba55..223b23eb3a933f 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs @@ -183,7 +183,14 @@ internal static bool SimpleProcessorNumberSpeedCheck() } // Make sure the result was not negative, which would indicate "Not Supported" - return id >= 0; + if (id < 0) + { + return false; + } + + // GetCurrentProcessorNumber is fast, no more checks needed. + s_CalibrationSamples = null; + return true; } // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access. 
From 6efaca4e98d157d59c1a8c5f1f69aec3bbe2d113 Mon Sep 17 00:00:00 2001 From: vsadov Date: Wed, 11 Dec 2019 21:56:04 -0800 Subject: [PATCH 07/11] One-pass ProcessorNumberSpeedCheck. --- .../src/System/Threading/ProcessorIdCache.cs | 140 ++++-------------- .../src/System/Threading/Thread.CoreCLR.cs | 2 +- 2 files changed, 31 insertions(+), 111 deletions(-) diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs index 223b23eb3a933f..e1ec09a94ec1b5 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs @@ -19,18 +19,14 @@ internal static class ProcessorIdCache private const int ProcessorIdCacheShift = 16; private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1; // 50 is our best guess. - // Based on further calibration it is likley to be adjusted lower. - // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be recalibrated to a higher number. + // Based on speed check it will be adjusted, typically lower. + // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be adjusted to a higher number. private static int s_processorIdRefreshRate = 50; // We will not adjust higher than this though. private const int MaxIdRefreshRate = 5000; private static int RefreshCurrentProcessorId() { - double[]? 
calibrationSamples = s_CalibrationSamples; - if (calibrationSamples != null) - CalibrateOnce(calibrationSamples); - int currentProcessorId = Thread.GetCurrentProcessorNumber(); // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which @@ -59,138 +55,62 @@ internal static int GetCurrentProcessorId() return currentProcessorIdCache >> ProcessorIdCacheShift; } - // We must collect multiple samples to account for irregularities caused by GC and context switches. - // Why we keep an array of samples and do not adjust as we go: - // We expect that we will adjust the refresh rate down. If we do that early, we may end up forcing all - // the calibration work to happen much sooner. There is no urgency in being calibrated while the app - // is in start-up mode. That would just add to the "rush hour" traffic. - private static int s_CalibrationToDo; - private static int s_CalibrationDone; - // 10 is chosen to budget the sampling under 5 msec total, assuming 0.5 msec per sample. - private const int CalibrationSampleCount = 10; - private static double[]? s_CalibrationSamples = new double[CalibrationSampleCount * 2]; - - private static void CalibrateOnce(double[] calibrationSamples) - { - if (s_CalibrationToDo >= CalibrationSampleCount) - return; - - int sample = Interlocked.Increment(ref s_CalibrationToDo) - 1; - if (sample >= CalibrationSampleCount) - return; - - // Actual calibration step. Let's try to fit into ~50 usec. - int id = 0; - long t1 = 0; - long twentyMicrosecond = Stopwatch.Frequency / 50000; - int iters = 1; - - // double the sample size until it is 1 msec. - // we may spend up to 40 usec in this loop in a worst case. 
- while (t1 < twentyMicrosecond) - { - iters *= 2; - t1 = Stopwatch.GetTimestamp(); - for (int i = 0; i < iters; i++) - { - id = Thread.GetCurrentProcessorNumber(); - } - t1 = Stopwatch.GetTimestamp() - t1; - } - - // assuming TLS takes 1/2 of ProcessorNumber time or less, this should take 10 usec or less - long t2 = Stopwatch.GetTimestamp(); - for (int i = 0; i < iters; i++) - { - UninlinedThreadStatic(); - } - long t3 = Stopwatch.GetTimestamp(); - - // if we have useful measurements, record a sample - if (id >= 0 && t1 > 0 && t3 - t2 > 0) - { - calibrationSamples[sample * 2] = (double)t1 / iters; // ID - calibrationSamples[sample * 2 + 1] = (double)(t3 - t2) / iters; // TLS - } - else - { - // API is not functional or clock did not go forward. - // just pretend it was a very expensive sample with default ratio. - calibrationSamples[sample * 2] = (double)Stopwatch.Frequency * 50; // 50 sec; - calibrationSamples[sample * 2 + 1] = Stopwatch.Frequency; // 1 sec - } - - // If this was the last sample computed, get best times and update the ratio of ID to TLS. - if (Interlocked.Increment(ref s_CalibrationDone) == CalibrationSampleCount) - { - double idMin = double.MaxValue; - double tlsMin = double.MaxValue; - for (int i = 0; i < CalibrationSampleCount; i++) - { - idMin = Math.Min(idMin, calibrationSamples[i * 2]); //ID - tlsMin = Math.Min(tlsMin, calibrationSamples[i * 2 + 1]); //TLS - } - - s_CalibrationSamples = null; - s_processorIdRefreshRate = Math.Min(MaxIdRefreshRate, (int)(idMin / tlsMin)); - } - } - // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false. // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch. - internal static bool SimpleProcessorNumberSpeedCheck() + internal static unsafe int ProcessorNumberSpeedCheck() { // NOTE: We do not check the frequency of the Stopwatch. + // The frequency often does not match the actual timer refresh rate anyways. 
// If the resolution, precision or access time to the timer are inadequate for our measures here, // the test will fail anyways. + // default values pretend to be very long samples with default ratio + double minID = Stopwatch.Frequency * 50; // 50 sec + double minTLS = Stopwatch.Frequency; // 1 sec + // warm up the code paths. - int id = UninlinedThreadStatic() | Thread.GetCurrentProcessorNumber(); - long oneMicrosecond = Stopwatch.Frequency / 1000000; + UninlinedThreadStatic(); + if (Thread.GetCurrentProcessorNumber() < 0) + return MaxIdRefreshRate; - // this loop should take < 50 usec. limit it to 100 usec just in case. - // If we are on slow hardware, we should calibrate anyways. - long limit = Stopwatch.Frequency / 10000 + Stopwatch.GetTimestamp(); + long oneMicrosecond = Stopwatch.Frequency / 1000000; for (int i = 0; i < 10; i++) { - int iters = 1; + // we will measure at least 16 iterations and at least 1 microsecond + int iters = 16; long t1 = 0; - // double the sample size until it is 1 usec. while (t1 < oneMicrosecond) { iters *= 2; t1 = Stopwatch.GetTimestamp(); for (int j = 0; j < iters; j++) { - id = Thread.GetCurrentProcessorNumber(); + Thread.GetCurrentProcessorNumber(); } t1 = Stopwatch.GetTimestamp() - t1; } - // assuming TLS cannot be a lot slower than getting ID, this should take 1-2 usec - long t2 = Stopwatch.GetTimestamp(); - for (int j = 0; j < iters; j++) - { - UninlinedThreadStatic(); - } - long t3 = Stopwatch.GetTimestamp(); + minID = Math.Min(minID, (double)t1 / iters); - // if getting ID took longer than 3x TLS access, we should consider caching. 
- if (t3 > limit || (t3 - t2) * 3 < t1) + // we will measure at least 16 iterations and at least 1 microsecond + iters = 16; + t1 = 0; + while (t1 < oneMicrosecond) { - return false; + iters *= 2; + t1 = Stopwatch.GetTimestamp(); + for (int j = 0; j < iters; j++) + { + UninlinedThreadStatic(); + } + t1 = Stopwatch.GetTimestamp() - t1; } - } - // Make sure the result was not negative, which would indicate "Not Supported" - if (id < 0) - { - return false; + minTLS = Math.Min(minTLS, (double)t1 / iters); } - // GetCurrentProcessorNumber is fast, no more checks needed. - s_CalibrationSamples = null; - return true; + s_processorIdRefreshRate = Math.Min((int)(minID / minTLS), MaxIdRefreshRate); + return s_processorIdRefreshRate; } // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access. diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs index 3c17b39c2b558b..c0a2e37de65f25 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs @@ -498,7 +498,7 @@ public static int GetCurrentProcessorId() } // do a fast check and record in a readonly static so that it could become a JIT constant - private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.SimpleProcessorNumberSpeedCheck(); + private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.ProcessorNumberSpeedCheck() <= 3; internal void ResetThreadPoolThread() { From 23c6de21c3ec1903b4296d671ea75a55a56bc256 Mon Sep 17 00:00:00 2001 From: vsadov Date: Thu, 12 Dec 2019 13:26:22 -0800 Subject: [PATCH 08/11] PR feedback (modulo refresh rate) --- .../src/System/Threading/ProcessorIdCache.cs | 37 ++++++++++--------- .../src/System/Threading/Thread.CoreCLR.cs | 2 +- 2 files changed, 21 insertions(+), 18 deletions(-) diff 
--git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs index e1ec09a94ec1b5..824eb71036d288 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs @@ -18,10 +18,8 @@ internal static class ProcessorIdCache private const int ProcessorIdCacheShift = 16; private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1; - // 50 is our best guess. - // Based on speed check it will be adjusted, typically lower. - // In relatively rare cases of a slow GetCurrentProcessorNumber, it may be adjusted to a higher number. - private static int s_processorIdRefreshRate = 50; + // Refresh rate of the cache. Will be derived from a speed check of GetCurrentProcessorNumber API. + private static int s_processorIdRefreshRate; // We will not adjust higher than this though. private const int MaxIdRefreshRate = 5000; @@ -57,7 +55,7 @@ internal static int GetCurrentProcessorId() // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false. // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch. - internal static unsafe int ProcessorNumberSpeedCheck() + internal static bool ProcessorNumberSpeedCheck() { // NOTE: We do not check the frequency of the Stopwatch. // The frequency often does not match the actual timer refresh rate anyways. @@ -70,16 +68,20 @@ internal static unsafe int ProcessorNumberSpeedCheck() // warm up the code paths. 
UninlinedThreadStatic(); + // also check if API is actually functional (-1 means not supported) if (Thread.GetCurrentProcessorNumber() < 0) - return MaxIdRefreshRate; + { + s_processorIdRefreshRate = int.MaxValue; + return false; + } long oneMicrosecond = Stopwatch.Frequency / 1000000; for (int i = 0; i < 10; i++) { // we will measure at least 16 iterations and at least 1 microsecond - int iters = 16; - long t1 = 0; - while (t1 < oneMicrosecond) + long t1; + int iters = 8; + do { iters *= 2; t1 = Stopwatch.GetTimestamp(); @@ -88,29 +90,30 @@ internal static unsafe int ProcessorNumberSpeedCheck() Thread.GetCurrentProcessorNumber(); } t1 = Stopwatch.GetTimestamp() - t1; - } + } while (t1 < oneMicrosecond); minID = Math.Min(minID, (double)t1 / iters); - // we will measure at least 16 iterations and at least 1 microsecond - iters = 16; - t1 = 0; - while (t1 < oneMicrosecond) + // we will measure at least 1 microsecond, + // and use at least 1/2 of ProcID iterations + // we assume that TLS can't be more than 2x slower than ProcID + iters /= 2; + do { - iters *= 2; t1 = Stopwatch.GetTimestamp(); for (int j = 0; j < iters; j++) { UninlinedThreadStatic(); } t1 = Stopwatch.GetTimestamp() - t1; - } + iters *= 2; + } while (t1 < oneMicrosecond) ; minTLS = Math.Min(minTLS, (double)t1 / iters); } s_processorIdRefreshRate = Math.Min((int)(minID / minTLS), MaxIdRefreshRate); - return s_processorIdRefreshRate; + return s_processorIdRefreshRate >= 3; } // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access. 
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs index c0a2e37de65f25..8626aaed38bf5e 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs @@ -498,7 +498,7 @@ public static int GetCurrentProcessorId() } // do a fast check and record in a readonly static so that it could become a JIT constant - private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.ProcessorNumberSpeedCheck() <= 3; + private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.ProcessorNumberSpeedCheck(); internal void ResetThreadPoolThread() { From 3154533273f3f5f6c587d1531a462757627cc2d8 Mon Sep 17 00:00:00 2001 From: vsadov Date: Sat, 14 Dec 2019 13:26:38 -0800 Subject: [PATCH 09/11] setting amortization ratio to 5 couple fixes --- .../src/System/Threading/ProcessorIdCache.cs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs index 824eb71036d288..27dba76d1eac10 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs @@ -31,7 +31,8 @@ private static int RefreshCurrentProcessorId() // doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber() // returns -1. As a fallback in that case and to spread the threads across the buckets // by default, we use the current managed thread ID as a proxy. 
- if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId; + if (currentProcessorId < 0) + currentProcessorId = Environment.CurrentManagedThreadId; Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask); @@ -62,9 +63,8 @@ internal static bool ProcessorNumberSpeedCheck() // If the resolution, precision or access time to the timer are inadequate for our measures here, // the test will fail anyways. - // default values pretend to be very long samples with default ratio - double minID = Stopwatch.Frequency * 50; // 50 sec - double minTLS = Stopwatch.Frequency; // 1 sec + double minID = double.MaxValue; + double minTLS = double.MaxValue; // warm up the code paths. UninlinedThreadStatic(); @@ -75,7 +75,7 @@ internal static bool ProcessorNumberSpeedCheck() return false; } - long oneMicrosecond = Stopwatch.Frequency / 1000000; + long oneMicrosecond = Stopwatch.Frequency / 1000000 + 1; for (int i = 0; i < 10; i++) { // we will measure at least 16 iterations and at least 1 microsecond @@ -97,23 +97,23 @@ internal static bool ProcessorNumberSpeedCheck() // we will measure at least 1 microsecond, // and use at least 1/2 of ProcID iterations // we assume that TLS can't be more than 2x slower than ProcID - iters /= 2; + iters /= 4; do { + iters *= 2; t1 = Stopwatch.GetTimestamp(); for (int j = 0; j < iters; j++) { UninlinedThreadStatic(); } t1 = Stopwatch.GetTimestamp() - t1; - iters *= 2; } while (t1 < oneMicrosecond) ; minTLS = Math.Min(minTLS, (double)t1 / iters); } - s_processorIdRefreshRate = Math.Min((int)(minID / minTLS), MaxIdRefreshRate); - return s_processorIdRefreshRate >= 3; + s_processorIdRefreshRate = Math.Min((int)(minID * 5 / minTLS), MaxIdRefreshRate); + return s_processorIdRefreshRate <= 5; } // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access. 
From f483aa9492e98e45812a5511e0f43ab093054b60 Mon Sep 17 00:00:00 2001 From: vsadov Date: Sat, 14 Dec 2019 16:51:53 -0800 Subject: [PATCH 10/11] fix unimplemented `GetCurrentProcessorNumber` case --- .../src/System/Threading/ProcessorIdCache.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs index 27dba76d1eac10..feb231f66c5437 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs @@ -71,7 +71,7 @@ internal static bool ProcessorNumberSpeedCheck() // also check if API is actually functional (-1 means not supported) if (Thread.GetCurrentProcessorNumber() < 0) { - s_processorIdRefreshRate = int.MaxValue; + s_processorIdRefreshRate = ProcessorIdCacheCountDownMask; return false; } From 1618787fa4471c84bbddc86980ba3efdcae9f0d4 Mon Sep 17 00:00:00 2001 From: vsadov Date: Sun, 15 Dec 2019 00:15:21 -0800 Subject: [PATCH 11/11] PR feedback --- .../src/System/Threading/ProcessorIdCache.cs | 45 +++++++++++++++---- .../src/System/Threading/Thread.CoreCLR.cs | 3 +- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs index feb231f66c5437..7f11ab5efa8c1a 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs @@ -79,20 +79,20 @@ internal static bool ProcessorNumberSpeedCheck() for (int i = 0; i < 10; i++) { // we will measure at least 16 iterations and at least 1 microsecond - long t1; + long t; int iters = 8; do { iters *= 2; - t1 = Stopwatch.GetTimestamp(); + t = Stopwatch.GetTimestamp(); 
for (int j = 0; j < iters; j++) { Thread.GetCurrentProcessorNumber(); } - t1 = Stopwatch.GetTimestamp() - t1; - } while (t1 < oneMicrosecond); + t = Stopwatch.GetTimestamp() - t; + } while (t < oneMicrosecond); - minID = Math.Min(minID, (double)t1 / iters); + minID = Math.Min(minID, (double)t / iters); // we will measure at least 1 microsecond, // and use at least 1/2 of ProcID iterations @@ -101,18 +101,45 @@ internal static bool ProcessorNumberSpeedCheck() do { iters *= 2; - t1 = Stopwatch.GetTimestamp(); + t = Stopwatch.GetTimestamp(); for (int j = 0; j < iters; j++) { UninlinedThreadStatic(); } - t1 = Stopwatch.GetTimestamp() - t1; - } while (t1 < oneMicrosecond) ; + t = Stopwatch.GetTimestamp() - t; + } while (t < oneMicrosecond); - minTLS = Math.Min(minTLS, (double)t1 / iters); + minTLS = Math.Min(minTLS, (double)t / iters); } + // A few words about choosing cache refresh rate: + // + // There are two reasons why data structures use core affinity: + // 1) To improve locality - avoid running on one core and using data in another core's cache. + // 2) To reduce sharing - avoid multiple threads using the same piece of data. + // + // Scenarios with large footprint, like striped caches, are sensitive to both parts. It is desirable to access + // large data from the "right" core. + // In scenarios where the state is small, like a striped counter, it is mostly about sharing. + // Otherwise the state is small and occasionally moving counter to a different core via cache miss is not a big deal. + // + // In scenarios that care more about sharing, precise results of GetCurrentProcessorNumber may not justify + // the cost unless the underlying implementation is very cheap. + // In such cases it is desirable to amortize the cost over multiple accesses by caching in a ThreadStatic. + // + // In addition to the data structure, the benefits also depend on use pattern and on concurrency level. + // E.g.
if an array pool user only rents an array "just in case" but does not actually use it, and concurrency level is low, + // a longer refresh would be beneficial since that could lower the API cost. + // If the array is actually used, then there is benefit from higher precision of the API and shorter refresh is more attractive. + // + // Overall we do not know the ideal refresh rate and using some kind of dynamic feedback is unlikely to be feasible. + // Experiments have shown, however, that 5x amortization rate is a good enough balance between precision and cost of the API. s_processorIdRefreshRate = Math.Min((int)(minID * 5 / minTLS), MaxIdRefreshRate); + + // In case GetCurrentProcessorNumber is particularly fast, as happens on platforms supporting the RDPID instruction, + // caching is not an improvement, thus it is desirable to bypass the cache entirely. + // Such systems consistently derive the refresh rate at or below 2-3, while the next tier, RDTSCP based implementations result in ~10, + // so we use "5" as a criterion to separate "fast" machines from the rest. return s_processorIdRefreshRate <= 5; } diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs index 8626aaed38bf5e..f2d9fe66f9a53e 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs @@ -497,7 +497,8 @@ public static int GetCurrentProcessorId() return ProcessorIdCache.GetCurrentProcessorId(); } - // do a fast check and record in a readonly static so that it could become a JIT constant + // a speed check will determine refresh rate of the cache and will report if caching is not advisable. + // we will record that in a readonly static so that it could become a JIT constant and bypass caching entirely.
private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.ProcessorNumberSpeedCheck(); internal void ResetThreadPoolThread()