-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Closed
Labels
area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI), tenet-performance (Performance related issue)
Milestone
Description
The code was copied from: https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-csharpcore-1.html
Actual Code:
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
namespace Test;
public static class Program
{
/// <summary>
/// Program entry point: hands the <see cref="Benchy"/> benchmark class to BenchmarkDotNet.
/// </summary>
public static void Main() => BenchmarkRunner.Run<Benchy>();
/// <summary>
/// Benchmark host configured with two jobs (.NET 7.0 and .NET 6.0, the latter marked as the
/// baseline) and the memory diagnoser enabled.
/// </summary>
[MemoryDiagnoser(true)]
[SimpleJob(RuntimeMoniker.Net70)]
[SimpleJob(RuntimeMoniker.Net60, baseline: true)]
public class Benchy
{
    /// <summary>
    /// Measures a single call to <see cref="MandelBrot.Default"/>.
    /// </summary>
    [Benchmark]
    public void Benchmark1() => MandelBrot.Default();
}
public class MandelBrot
{
/// <summary>
/// Runs the Mandelbrot iteration for eight horizontally adjacent points at once and packs
/// the per-point result into a single byte (x86 path built on Avx/Avx2 intrinsics).
/// </summary>
/// <param name="x">Real coordinate of the first point.</param>
/// <param name="y">Imaginary coordinate shared by all eight points.</param>
/// <param name="dx">Spacing between the real coordinates of consecutive points.</param>
/// <returns>
/// A bit mask where bit 7 belongs to the first point and bit 0 to the eighth. A bit is set
/// when, at the last escape check, that point's x*x + y*y still compared &lt;= 4.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static byte Process8(double x, double y, double dx)
{
    // The escape test is only performed after every batch of this many iterations.
    const int IterationsPerCheck = 7;
    const int MaxIterations = 49;

    // The byte-wise MoveMask yields 8 bits per 64-bit lane. Exactly one bit is kept per
    // lane, chosen so that OR-folding the four bytes together lands points 0..3 on
    // bits 7..4 and points 4..7 on bits 3..0.
    const uint FirstFourBits =
        (1u << (0 + 7)) | (1u << (8 + 6)) | (1u << (16 + 5)) | (1u << (24 + 4));
    const uint LastFourBits =
        (1u << (0 + 3)) | (1u << (8 + 2)) | (1u << (16 + 1)) | (1u << (24 + 0));

    // Starting coordinates: points 0-3 live in the "A" vectors, points 4-7 in the "B" vectors.
    Vector256<double> startReA = Vector256.Create(x + 0 * dx, x + 1 * dx, x + 2 * dx, x + 3 * dx);
    Vector256<double> startReB = Vector256.Create(x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx);
    Vector256<double> startIm = Vector256.Create(y);
    Vector256<double> escapeLimit = Vector256.Create(4.0);

    // Current iterate for each half.
    Vector256<double> reA = startReA, imA = startIm;
    Vector256<double> reB = startReB, imB = startIm;

    // Squares from the most recent iteration; read after the inner loop, so they need an
    // initial value to satisfy definite assignment.
    Vector256<double> reSqA = Vector256<double>.Zero;
    Vector256<double> imSqA = Vector256<double>.Zero;
    Vector256<double> reSqB = Vector256<double>.Zero;
    Vector256<double> imSqB = Vector256<double>.Zero;

    // Non-zero while at least one lane of the corresponding half has not escaped.
    uint maskA = 1, maskB = 1;
    int completed = 0;

    while (completed < MaxIterations && (maskA | maskB) != 0)
    {
        // Both halves are advanced side by side (the loop body is unrolled 2x).
        for (int step = 0; step < IterationsPerCheck; step++)
        {
            // Squares x*x and y*y.
            reSqA = Avx.Multiply(reA, reA);
            imSqA = Avx.Multiply(imA, imA);
            reSqB = Avx.Multiply(reB, reB);
            imSqB = Avx.Multiply(imB, imB);

            // Mixed products x*y, computed before either coordinate is overwritten.
            Vector256<double> crossA = Avx.Multiply(reA, imA);
            Vector256<double> crossB = Avx.Multiply(reB, imB);

            // Next y = 2*x*y + y0.
            imA = Avx.Add(Avx.Add(crossA, crossA), startIm);
            imB = Avx.Add(Avx.Add(crossB, crossB), startIm);

            // Next x = (x*x - y*y) + x0.
            reA = Avx.Add(Avx.Subtract(reSqA, imSqA), startReA);
            reB = Avx.Add(Avx.Subtract(reSqB, imSqB), startReB);
        }

        completed += IterationsPerCheck;

        // Lanes that overflowed hold Infinity or NaN; the ordered comparison below is false
        // for those, which marks them as escaped.
        Vector256<double> insideA = Avx.Compare(
            Avx.Add(reSqA, imSqA),
            escapeLimit,
            FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);
        Vector256<double> insideB = Avx.Compare(
            Avx.Add(reSqB, imSqB),
            escapeLimit,
            FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);

        // Top bit of every byte: all zero once every lane of that half has escaped.
        maskA = (uint)Avx2.MoveMask(insideA.AsByte());
        maskB = (uint)Avx2.MoveMask(insideB.AsByte());
    }

    // Keep one bit per point, then fold the four bytes of the mask down into the low byte.
    uint packed = (maskA & FirstFourBits) | (maskB & LastFourBits);
    packed |= packed >> 16;
    packed |= packed >> 8;
    return (byte)packed;
}
/// <summary>
/// Fills a locally allocated 200 x 200 one-bit-per-point bitmap by calling
/// <c>Process8</c> for every group of eight points, with rows distributed via
/// <c>Parallel.For</c>. The buffer is not returned to the caller.
/// </summary>
public static void MainNew()
{
    int size = 200;
    int bytesPerRow = size >> 3; // eight points are packed into each byte
    byte[] bitmap = new byte[size * bytesPerRow];

    // Step between neighbouring points: (0.5 - (-1.5)) / size.
    double step = 2.0 / size;

    Parallel.For(0, size, row =>
    {
        double im = row * step - 1;
        int rowStart = row * bytesPerRow;

        for (int col = 0; col < bytesPerRow; col++)
        {
            double re = (col * 8) * step - 1.5;
            bitmap[rowStart + col] = Process8(re, im, step);
        }
    });
}
/// <summary>
/// Entry point used by the benchmark; forwards to <see cref="MainNew"/>.
/// </summary>
public static void Default() => MainNew();
}
}
Configuration
BenchmarkDotNet=v0.13.4, OS=Windows 11 (10.0.22621.1105)
Intel Core i5-10400 CPU 2.90GHz, 1 CPU, 12 logical and 6 physical cores
.NET SDK=7.0.102
[Host] : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2
.NET 6.0 : .NET 6.0.13 (6.0.1322.58009), X64 RyuJIT AVX2
.NET 7.0 : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2
<TargetFrameworks>net6.0;net7.0;</TargetFrameworks>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<PublishAot>false</PublishAot>
<ServerGarbageCollection>true</ServerGarbageCollection>
<TieredPGO>true</TieredPGO>
<TieredCompilationQuickJitForLoops>true</TieredCompilationQuickJitForLoops>
<Platforms>AnyCPU;x64</Platforms>
Regression?
The regression was observed when moving from .NET 6 to .NET 7.
Data
| Method | Job | Runtime | Mean | Error | StdDev | Ratio | RatioSD | Gen0 | Allocated | Alloc Ratio |
|---|---|---|---|---|---|---|---|---|---|---|
| Benchmark1 | .NET 6.0 | .NET 6.0 | 81.72 us | 1.496 us | 1.400 us | 1.00 | 0.00 | 0.1221 | 8.69 KB | 1.00 |
| Benchmark1 | .NET 7.0 | .NET 7.0 | 101.81 us | 0.699 us | 0.654 us | 1.25 | 0.02 | - | 8.53 KB | 0.98 |
Metadata
Metadata
Assignees
Labels
area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI), tenet-performance (Performance related issue)