Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@
<Compile Include="$(BclSourcesRoot)\System\Threading\StackCrawlMark.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\SynchronizationContext.CoreCLR.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\Thread.CoreCLR.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\ProcessorIdCache.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\ThreadPool.CoreCLR.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\Timer.CoreCLR.cs" />
<Compile Include="$(BclSourcesRoot)\System\Threading\WaitHandle.CoreCLR.cs" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Diagnostics;
using System.Runtime.CompilerServices;

namespace System.Threading
{
internal static class ProcessorIdCache
{
    // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
    // the t_currentProcessorIdCache are counting down to get it periodically refreshed.
    // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
    // actions that are likely to result in changing the executing core
    [ThreadStatic]
    private static int t_currentProcessorIdCache;

    // Number of low bits reserved for the refresh countdown; the processor id lives above them.
    private const int ProcessorIdCacheShift = 16;
    private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;

    // Refresh rate of the cache. Will be derived from a speed check of the GetCurrentProcessorNumber API
    // in ProcessorNumberSpeedCheck.
    private static int s_processorIdRefreshRate;

    // We will not adjust the refresh rate higher than this though.
    private const int MaxIdRefreshRate = 5000;

    // Re-reads the processor number from the OS and re-arms this thread's cache with a fresh
    // countdown. Returns the (possibly substituted) processor id. The value is a hint only:
    // it is NOT guaranteed to be 0-based or within [0..CpuCount).
    private static int RefreshCurrentProcessorId()
    {
        int currentProcessorId = Thread.GetCurrentProcessorNumber();

        // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
        // doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber()
        // returns -1. As a fallback in that case and to spread the threads across the buckets
        // by default, we use the current managed thread ID as a proxy.
        if (currentProcessorId < 0)
            currentProcessorId = Environment.CurrentManagedThreadId;

        // The countdown must fit in the low bits or it would corrupt the cached id.
        Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask);

        // Mask with int.MaxValue to ensure the execution Id is not negative
        // (the shift can push a large id, e.g. a managed thread id, into the sign bit).
        t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | s_processorIdRefreshRate;

        return currentProcessorId;
    }

    // Returns the cached processor id for the current thread, refreshing from the OS once the
    // per-thread countdown reaches zero. A single post-decrement both reads the cache and
    // advances the countdown, keeping the hot path to one ThreadStatic access.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int GetCurrentProcessorId()
    {
        int currentProcessorIdCache = t_currentProcessorIdCache--;
        if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
        {
            return RefreshCurrentProcessorId();
        }

        return currentProcessorIdCache >> ProcessorIdCacheShift;
    }

    // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false.
    // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch.
    // Side effect: sets s_processorIdRefreshRate based on the measured cost ratio.
    internal static bool ProcessorNumberSpeedCheck()
    {
        // NOTE: We do not check the frequency of the Stopwatch.
        // The frequency often does not match the actual timer refresh rate anyway.
        // If the resolution, precision or access time to the timer are inadequate for our
        // measurements here, the test will fail anyway.

        double minID = double.MaxValue;
        double minTLS = double.MaxValue;

        // warm up the code paths.
        UninlinedThreadStatic();
        // also check if API is actually functional (-1 means not supported)
        if (Thread.GetCurrentProcessorNumber() < 0)
        {
            // API unsupported: set the longest possible refresh so the managed-thread-id
            // fallback is recomputed as rarely as possible, and report "not fast".
            s_processorIdRefreshRate = ProcessorIdCacheCountDownMask;
            return false;
        }

        long oneMicrosecond = Stopwatch.Frequency / 1000000 + 1;
        for (int i = 0; i < 10; i++)
        {
            // we will measure at least 16 iterations and at least 1 microsecond
            long t;
            int iters = 8;
            do
            {
                iters *= 2;
                t = Stopwatch.GetTimestamp();
                for (int j = 0; j < iters; j++)
                {
                    Thread.GetCurrentProcessorNumber();
                }
                t = Stopwatch.GetTimestamp() - t;
            } while (t < oneMicrosecond);

            // Keep the minimum per-call cost seen so far; minimum filters out runs
            // that were inflated by GC pauses or context switches.
            minID = Math.Min(minID, (double)t / iters);

            // we will measure at least 1 microsecond,
            // and use at least 1/2 of ProcID iterations
            // we assume that TLS can't be more than 2x slower than ProcID
            iters /= 4;
            do
            {
                iters *= 2;
                t = Stopwatch.GetTimestamp();
                for (int j = 0; j < iters; j++)
                {
                    UninlinedThreadStatic();
                }
                t = Stopwatch.GetTimestamp() - t;
            } while (t < oneMicrosecond);

            minTLS = Math.Min(minTLS, (double)t / iters);
        }

        // A few words about choosing cache refresh rate:
        //
        // There are two reasons why data structures use core affinity:
        // 1) To improve locality - avoid running on one core and using data in other core's cache.
        // 2) To reduce sharing - avoid multiple threads using the same piece of data.
        //
        // Scenarios with large footprint, like striped caches, are sensitive to both parts. It is desirable to access
        // large data from the "right" core.
        // In scenarios where the state is small, like a striped counter, it is mostly about sharing;
        // occasionally moving the counter to a different core via a cache miss is not a big deal.
        //
        // In scenarios that care more about sharing, precise results of GetCurrentProcessorNumber may not justify
        // the cost unless the underlying implementation is very cheap.
        // In such cases it is desirable to amortize the cost over multiple accesses by caching in a ThreadStatic.
        //
        // In addition to the data structure, the benefits also depend on use pattern and on concurrency level.
        // E.g. if an array pool user only rents an array "just in case" but does not actually use it, and concurrency level is low,
        // a longer refresh would be beneficial since that could lower the API cost.
        // If the array is actually used, then there is benefit from higher precision of the API and shorter refresh is more attractive.
        //
        // Overall we do not know the ideal refresh rate and using some kind of dynamic feedback is unlikely to be feasible.
        // Experiments have shown, however, that 5x amortization rate is a good enough balance between precision and cost of the API.
        s_processorIdRefreshRate = Math.Min((int)(minID * 5 / minTLS), MaxIdRefreshRate);

        // In a case if GetCurrentProcessorNumber is particularly fast, like it happens on platforms supporting RDPID instruction,
        // caching is not an improvement, thus it is desirable to bypass the cache entirely.
        // Such systems consistently derive the refresh rate at or below 2-3, while the next tier, RDTSCP based implementations result in ~10,
        // so we use "5" as a criterion to separate "fast" machines from the rest.
        return s_processorIdRefreshRate <= 5;
    }

    // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.
    // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching.
    [MethodImpl(MethodImplOptions.NoInlining)]
    internal static int UninlinedThreadStatic()
    {
        return t_currentProcessorIdCache;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -484,54 +484,23 @@ private static int CalculateOptimalMaxSpinWaitsPerSpinIteration()
}

[MethodImpl(MethodImplOptions.InternalCall)]
private static extern int GetCurrentProcessorNumber();
internal static extern int GetCurrentProcessorNumber();

// The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
// the t_currentProcessorIdCache are counting down to get it periodically refreshed.
// TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
// actions that are likely to result in changing the executing core
[ThreadStatic]
private static int t_currentProcessorIdCache;

private const int ProcessorIdCacheShift = 16;
private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
private const int ProcessorIdRefreshRate = 5000;

private static int RefreshCurrentProcessorId()
{
int currentProcessorId = GetCurrentProcessorNumber();

// On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
// doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber()
// returns -1. As a fallback in that case and to spread the threads across the buckets
// by default, we use the current managed thread ID as a proxy.
if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;

// Add offset to make it clear that it is not guaranteed to be 0-based processor number
currentProcessorId += 100;
Copy link
Contributor

@CoffeeFlux CoffeeFlux Jan 13, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like the offset was removed in this cache refactoring PR. Are we going to guarantee a 0-based processor number or not? Whichever we go with, the docs ought to be updated to reflect that and I'll happily PR the change. I think if we're not going to guarantee it, the offset is a good idea and should be reintroduced. @VSadov @jkotas

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do not guarantee 0-based processor number.

Whether or not to pay the extra cycle to add the offset is an interesting question. I do not have a strong opinion either way.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@CoffeeFlux - We cannot guarantee the [0..CpuCount) range. The underlying API may be nonfunctional and managed threadID used instead. On VMs the core ID could be outside of this range too.

Why do you think adding the offset is a good idea, though?
It adds some cost to the API while anyone doing something like GetCurrentProcessorId & mask will be tempted to subtract the offset.

Copy link
Contributor

@CoffeeFlux CoffeeFlux Jan 13, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My fear is customers relying on the behavior anyway and then ending up with Mono bug reports because we can't guarantee it on mobile, but maybe we should just update the docs to be unambiguous about this. My opinion isn't that strong, I just want it to be clarified somewhere other than comments in the source code that we can't guarantee a range of [0..CpuCount).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After sleeping on it, I think I'm okay just leaving the value as-is and updating the docs. I'll PR the change later today.


Debug.Assert(ProcessorIdRefreshRate <= ProcessorIdCacheCountDownMask);

// Mask with int.MaxValue to ensure the execution Id is not negative
t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate;

return currentProcessorId;
}

// Cached processor id used as a hint for which per-core stack to access. It is periodically
// refreshed to trail the actual thread core affinity.
// Cached processor id could be used as a hint for which per-core stripe of data to access to avoid sharing.
// It is periodically refreshed to trail the actual thread core affinity.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int GetCurrentProcessorId()
{
int currentProcessorIdCache = t_currentProcessorIdCache--;
if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
{
return RefreshCurrentProcessorId();
}
if (s_isProcessorNumberReallyFast)
return GetCurrentProcessorNumber();

return currentProcessorIdCache >> ProcessorIdCacheShift;
return ProcessorIdCache.GetCurrentProcessorId();
}

// a speed check will determine refresh rate of the cache and will report if caching is not advisable.
// we will record that in a readonly static so that it could become a JIT constant and bypass caching entirely.
private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.ProcessorNumberSpeedCheck();

internal void ResetThreadPoolThread()
{
// Currently implemented in unmanaged method Thread::InternalReset and
Expand Down