Description
I noticed that some of my stress-test unit tests take ~6 minutes on .NET 8 but only ~5 seconds on .NET Framework 4.8. I ran some benchmarks and was able to reduce the problem to the minimal repro below.
Benchmark Code
/// <summary>
/// Benchmark that posts a batch of no-op callbacks to a
/// <see cref="MyThreadPoolSynchronizationContext"/> and then spins until all of them have run.
/// </summary>
[MaxIterationCount(30)]
public class MultiThreadBenchmark
{
    // One context instance is shared by every invocation of the benchmark method.
    private readonly MyThreadPoolSynchronizationContext _context = new MyThreadPoolSynchronizationContext();

    /// <summary>
    /// Posts <paramref name="postCount"/> empty callbacks, then waits for the context's
    /// running-callback count to drop back to zero.
    /// </summary>
    /// <param name="postCount">How many callbacks to post per invocation.</param>
    [Benchmark]
    [Arguments(2)]
    [Arguments(7)]
    public void MultiThreadTest(int postCount)
    {
        int remaining = postCount;
        while (remaining > 0)
        {
            _context.Post(_ => { }, null);
            --remaining;
        }

        _context.WaitForAllThreadsToComplete();
    }
}
/// <summary>
/// A <see cref="SynchronizationContext"/> that runs every posted callback on a dedicated
/// background thread. Threads that finish a callback park themselves in a static pool and are
/// woken with <see cref="Monitor.Pulse(object)"/> when another callback is posted.
/// </summary>
public sealed class MyThreadPoolSynchronizationContext : SynchronizationContext
{
    // Number of posted callbacks that have not finished yet; polled by WaitForAllThreadsToComplete.
    private volatile int _runningActionCount;

    /// <summary>
    /// Owns one background thread and the hand-off slot (owner/callback/state) used to pass it work.
    /// </summary>
    private sealed class ThreadRunner
    {
        // Idle runners. Static, so the pool is shared by ALL context instances; guarded by lock (s_pool).
        private static readonly Stack<ThreadRunner> s_pool = new Stack<ThreadRunner>();
        // Only used to give each created thread a unique name.
        private static int s_threadCounter;

        // Hand-off slot: written by Run under _locker, consumed and cleared by ThreadAction.
        private MyThreadPoolSynchronizationContext _owner;
        private readonly object _locker = new object();
        private SendOrPostCallback _callback;
        private object _state;

        /// <summary>
        /// Dispatches <paramref name="callback"/> to an idle pooled thread, or to a newly created
        /// background thread when the pool is empty.
        /// </summary>
        /// <param name="owner">Context whose running-callback count tracks this dispatch.</param>
        /// <param name="callback">Delegate to invoke on the runner thread.</param>
        /// <param name="state">Argument passed to <paramref name="callback"/>.</param>
        public static void Run(MyThreadPoolSynchronizationContext owner, SendOrPostCallback callback, object state)
        {
            // Count the callback before it can possibly run so a waiter never observes zero too early.
            Interlocked.Increment(ref owner._runningActionCount);

            ThreadRunner threadRunner = null;
            lock (s_pool)
            {
                if (s_pool.Count > 0)
                {
                    threadRunner = s_pool.Pop();
                }
            }

            bool reused = threadRunner != null;
            if (!reused)
            {
                threadRunner = new ThreadRunner();
            }

            // A pooled runner holds _locker from the moment it pushes itself until it enters
            // Monitor.Wait, so acquiring the lock here guarantees the Pulse below is not lost.
            lock (threadRunner._locker)
            {
                threadRunner._owner = owner;
                threadRunner._callback = callback;
                threadRunner._state = state;
                if (reused)
                {
                    Monitor.Pulse(threadRunner._locker);
                }
                else
                {
                    new Thread(threadRunner.ThreadAction)
                    {
                        IsBackground = true,
                        Name = $"MyThreadPoolSynchronizationContext_{Interlocked.Increment(ref s_threadCounter)}"
                    }.Start();
                }
            }
        }

        /// <summary>
        /// Thread body: runs the handed-off callback, returns this runner to the pool, then
        /// blocks until <see cref="Run"/> pulses it with new work. Never returns.
        /// </summary>
        private void ThreadAction()
        {
            while (true)
            {
                MyThreadPoolSynchronizationContext owner = _owner;
                SendOrPostCallback callback = _callback;
                object state = _state;
                // Allow GC to reclaim memory.
                _owner = null;
                _callback = null;
                _state = null;

                // The pool is shared across context instances, so this thread may be serving a
                // different owner than on its previous dispatch. Install the context for every
                // callback rather than once at thread start.
                SetSynchronizationContext(owner);
                callback.Invoke(state);
                // Drop the reference so an idle pooled thread does not keep the owner alive.
                SetSynchronizationContext(null);

                Interlocked.Decrement(ref owner._runningActionCount);
                lock (_locker)
                {
                    lock (s_pool)
                    {
                        s_pool.Push(this);
                    }
                    // Releases _locker while blocked; Run re-populates the slot and pulses.
                    Monitor.Wait(_locker);
                }
            }
        }
    }

    /// <summary>
    /// Spins until the number of running callbacks posted to this context reaches zero.
    /// </summary>
    public void WaitForAllThreadsToComplete()
    {
        var spinner = new SpinWait();
        while (_runningActionCount > 0)
        {
            spinner.SpinOnce();
        }
    }

    /// <summary>Returns this same instance rather than a new copy.</summary>
    public override SynchronizationContext CreateCopy()
    {
        return this;
    }

    /// <summary>
    /// Runs <paramref name="d"/> on a pooled or newly created background thread.
    /// </summary>
    /// <param name="d">Callback to invoke; must not be null.</param>
    /// <param name="state">Argument passed to <paramref name="d"/>.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="d"/> is null.</exception>
    public override void Post(SendOrPostCallback d, object state)
    {
        if (d == null)
        {
            throw new System.ArgumentNullException(nameof(d), "SendOrPostCallback may not be null.");
        }
        ThreadRunner.Run(this, d, state);
    }

    /// <summary>Always throws; this context only implements <see cref="Post"/>.</summary>
    /// <exception cref="System.InvalidOperationException">Always.</exception>
    public override void Send(SendOrPostCallback d, object state)
    {
        throw new System.InvalidOperationException();
    }
}
Regression?
Yes
Data
BenchmarkDotNet v0.15.0, Windows 10 (10.0.19045.5854/22H2/2022Update)
AMD Ryzen 7 9800X3D 4.70GHz, 1 CPU, 16 logical and 8 physical cores
.NET SDK 10.0.100-preview.4.25258.110
[Host] : .NET 8.0.16 (8.0.1625.21506), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Job-CXDRYT : .NET 10.0.0 (10.0.25.25910), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Job-TIUQAM : .NET 8.0.16 (8.0.1625.21506), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Job-AGYXWA : .NET Framework 4.8.1 (4.8.9310.0), X64 RyuJIT VectorSize=256
| Method | Runtime | postCount | Mean | Error | StdDev | Ratio | RatioSD |
|---|---|---|---|---|---|---|---|
| MultiThreadTest | .NET 10.0 | 2 | 98.871 us | 19.2124 us | 28.1613 us | 1.09 | 0.48 |
| MultiThreadTest | .NET 8.0 | 2 | 86.410 us | 16.9564 us | 25.3796 us | 0.96 | 0.42 |
| MultiThreadTest | .NET Framework 4.8 | 2 | 6.837 us | 0.0946 us | 0.0838 us | 0.08 | 0.02 |
| | | | | | | | |
| MultiThreadTest | .NET 10.0 | 7 | 342.402 us | 147.1440 us | 220.2382 us | 1.90 | 2.56 |
| MultiThreadTest | .NET 8.0 | 7 | 1,983.185 us | 370.6014 us | 554.6988 us | 11.01 | 11.85 |
| MultiThreadTest | .NET Framework 4.8 | 7 | 11.221 us | 0.5488 us | 0.8044 us | 0.06 | 0.06 |
Analysis
It looks like the performance issue is caused by Monitor (I suspect Wait and Pulse are the culprits, but it could be Enter and Exit), but I haven't dug any deeper to confirm.
These benchmarks have a very high variance, but the results are repeatable. Here are the results of another run:
| Method | Runtime | postCount | Mean | Error | StdDev | Ratio | RatioSD |
|---|---|---|---|---|---|---|---|
| MultiThreadTest | .NET 10.0 | 2 | 248.262 us | 94.0082 us | 140.7070 us | 1.84 | 3.43 |
| MultiThreadTest | .NET 8.0 | 2 | 198.853 us | 71.0895 us | 104.2021 us | 1.47 | 2.68 |
| MultiThreadTest | .NET Framework 4.8 | 2 | 6.795 us | 0.0658 us | 0.0583 us | 0.05 | 0.08 |
| | | | | | | | |
| MultiThreadTest | .NET 10.0 | 7 | 629.493 us | 83.4623 us | 124.9225 us | 1.05 | 0.36 |
| MultiThreadTest | .NET 8.0 | 7 | 442.401 us | 173.0234 us | 258.9733 us | 0.74 | 0.49 |
| MultiThreadTest | .NET Framework 4.8 | 7 | 16.042 us | 0.5617 us | 0.8233 us | 0.03 | 0.01 |
Description
I noticed that some of my stress-test unit tests take ~6 minutes on .NET 8 but only ~5 seconds on .NET Framework 4.8. I ran some benchmarks and was able to reduce the problem to a minimal repro.
Benchmark Code
Regression?
Yes
Data
BenchmarkDotNet v0.15.0, Windows 10 (10.0.19045.5854/22H2/2022Update)
AMD Ryzen 7 9800X3D 4.70GHz, 1 CPU, 16 logical and 8 physical cores
.NET SDK 10.0.100-preview.4.25258.110
[Host] : .NET 8.0.16 (8.0.1625.21506), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Job-CXDRYT : .NET 10.0.0 (10.0.25.25910), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Job-TIUQAM : .NET 8.0.16 (8.0.1625.21506), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Job-AGYXWA : .NET Framework 4.8.1 (4.8.9310.0), X64 RyuJIT VectorSize=256
Analysis
It looks like the performance issue is caused by `Monitor` (I suspect `Wait` and `Pulse` are the culprits, but it could be `Enter` and `Exit`), but I haven't dug any deeper to confirm.
These benchmarks have a very high variance, but the results are repeatable. Here are the results of another run: