Skip to content

I accidentally found a performance regression using the "mandelbrot algorithm" #80757

@milen-denev

Description

@milen-denev

The code was copied from: https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-csharpcore-1.html

Actual Code:

using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;

namespace Test;

public static class Program
{
    /// <summary>
    /// Program entry point: hands the <see cref="Benchy"/> benchmark class to
    /// <c>BenchmarkRunner</c>, which executes every configured job.
    /// </summary>
    public static void Main() => BenchmarkRunner.Run<Benchy>();

    /// <summary>
    /// Benchmark container. It is run once per configured job: .NET 7.0, and
    /// .NET 6.0 which is marked as the baseline, with the memory diagnoser attached.
    /// </summary>
    [MemoryDiagnoser(true)]
    [SimpleJob(RuntimeMoniker.Net70)]
    [SimpleJob(RuntimeMoniker.Net60, baseline: true)]
    public class Benchy
    {
        /// <summary>Measures a single call to <see cref="MandelBrot.Default"/>.</summary>
        [Benchmark]
        public void Benchmark1() => MandelBrot.Default();
    }

    public class MandelBrot
    {
        /// <summary>
        /// Evaluates the Mandelbrot iteration (x, y) -> (x*x - y*y + x0, 2*x*y + y0)
        /// for eight points on one row and packs the outcome into a single byte.
        /// x86 version using the Avx / Avx2 hardware intrinsics.
        /// </summary>
        /// <param name="x">x coordinate of the first (left-most) point.</param>
        /// <param name="y">y coordinate shared by all eight points.</param>
        /// <param name="dx">Spacing between consecutive points along x.</param>
        /// <returns>
        /// One bit per point: bit 7 belongs to <c>x + 0 * dx</c> and bit 0 to
        /// <c>x + 7 * dx</c>. A bit is 1 when, at the last check performed, that point's
        /// x*x + y*y compared less than or equal to 4; it is 0 otherwise (including
        /// when the sum has become Infinity or NaN).
        /// </returns>
        // NOTE(review): no IsSupported guard is visible in this method; callers
        // presumably only run it on AVX2-capable hardware - confirm.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static byte Process8(double x, double y, double dx)
        {
            // initial x coords: points 0..3 in x01, points 4..7 in x02
            var x01 = Vector256.Create(x + 0 * dx, x + 1 * dx, x + 2 * dx, x + 3 * dx);
            var x02 = Vector256.Create(x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx);

            // initial y coords: the same y broadcast to every lane
            var y0 = Vector256.Create(y);

            Vector256<double> x1 = x01, y1 = y0; // current iterate of points 0..3
            Vector256<double> x2 = x02, y2 = y0; // current iterate of points 4..7

            Vector256<double> four = Vector256.Create(4.0); // 4 in each slot        

            // number of iterations performed so far
            var pass = 0;

            // Squares from the most recent inner iteration. They are declared here
            // because they are read after the inner loop (escape test below), and
            // C# requires them to be definitely assigned at that point.
            Vector256<double>
                x12 = Vector256<double>.Zero,
                y12 = Vector256<double>.Zero,
                x22 = Vector256<double>.Zero,
                y22 = Vector256<double>.Zero;

            // bit masks for results; start non-zero so the loop below is entered
            uint res1 = 1, res2 = 1;

            // Run batches of 7 iterations until 49 iterations have been done or
            // both masks are zero (no lane passed the last escape test).
            while (pass < 49 && (res1 != 0 || res2 != 0))
            {

                // do several iterations between escape checks, like other code
                for (var p = 0; p < 7; ++p)
                {
                    // Two independent 4-lane chains (suffix 1 and 2) are advanced
                    // side by side (original author's note: unrolled 2x to
                    // decrease register stalls).

                    // squares x*x and y*y
                    x12 = Avx2.Multiply(x1, x1);
                    y12 = Avx2.Multiply(y1, y1);
                    x22 = Avx2.Multiply(x2, x2);
                    y22 = Avx2.Multiply(y2, y2);

                    // mixed products x*y
                    var xy1 = Avx2.Multiply(x1, y1);
                    var xy2 = Avx2.Multiply(x2, y2);

                    // diff of squares x*x - y*y
                    var ds1 = Avx2.Subtract(x12, y12);
                    var ds2 = Avx2.Subtract(x22, y22);

                    // 2*x*y
                    xy1 = Avx2.Add(xy1, xy1);
                    xy2 = Avx2.Add(xy2, xy2);

                    // next iterates: y = 2xy + y0, x = (x*x - y*y) + x0
                    y1 = Avx2.Add(xy1, y0);
                    y2 = Avx2.Add(xy2, y0);
                    x1 = Avx2.Add(ds1, x01);
                    x2 = Avx2.Add(ds2, x02);
                }
                pass += 7;

                // numbers overflow, which gives an Infinity or NaN, which,
                // when compared N <= 4 (ordered), results in false, which is what we want

                // sum of squares x*x + y*y, compare to 4 (escape mandelbrot).
                // x12/y12/x22/y22 hold the squares taken at the top of the last
                // inner iteration, i.e. of the values before that iteration's update.
                var ss1 = Avx2.Add(x12, y12);
                var ss2 = Avx2.Add(x22, y22);

                // compare - puts all 0 in a lane if false, else all 1 (=NaN bitwise)
                // when each register is 0, then all points escaped, so exit
                var cmp1 = Avx.Compare(ss1, four,
                        FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);
                var cmp2 = Avx.Compare(ss2, four,
                        FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);

                // take top bit from each byte: 32 mask bits, 8 per double lane
                res1 = (uint)Avx2.MoveMask(Vector256.AsByte(cmp1));
                res2 = (uint)Avx2.MoveMask(Vector256.AsByte(cmp2));
            }

            // Each lane contributes 8 identical mask bits, so any one of them may be
            // kept. Keep bit (8 * lane + k) where k is the output bit for that point:
            // k = 7, 6, 5, 4 for points 0..3 and k = 3, 2, 1, 0 for points 4..7.
            res1 &=
                (1 << (0 + 7)) |
                (1 << (8 + 6)) |
                (1 << (16 + 5)) |
                (1 << (24 + 4));
            res2 &=
                (1 << (0 + 3)) |
                (1 << (8 + 2)) |
                (1 << (16 + 1)) |
                (1 << (24 + 0));

            // Fold the four mask bytes into the low byte: after the two shifts every
            // kept bit has been OR-ed into position k; the cast discards the rest.
            var res = res1 | res2;
            res |= res >> 16;
            res |= res >> 8;
            return (byte)(res);
        }

        /// <summary>
        /// Fills a 200 x 200 one-bit-per-point bitmap (25 bytes per row) by calling
        /// <see cref="Process8"/> for every group of eight points, with the rows
        /// distributed over <c>Parallel.For</c>. The buffer is local and is not returned.
        /// </summary>
        public static void MainNew()
        {
            var size = 200;
            var bytesPerRow = size / 8;
            var bitmap = new byte[size * bytesPerRow];

            // distance between neighbouring points: (0.5 - (-1.5)) / size
            var step = 2.0 / size;

            Parallel.For(0, size, row =>
            {
                var rowY = row * step - 1;
                var rowStart = row * bytesPerRow;

                for (var col = 0; col < bytesPerRow; col++)
                {
                    // x coordinate of the first of the eight points in this byte
                    var startX = (col * 8) * step - 1.5;
                    bitmap[rowStart + col] = Process8(startX, rowY, step);
                }
            });
        }

        /// <summary>Runs the workload by forwarding to <see cref="MainNew"/>.</summary>
        public static void Default() => MainNew();
    }
}

Configuration

BenchmarkDotNet=v0.13.4, OS=Windows 11 (10.0.22621.1105)
Intel Core i5-10400 CPU 2.90GHz, 1 CPU, 12 logical and 6 physical cores
.NET SDK=7.0.102
[Host] : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2
.NET 6.0 : .NET 6.0.13 (6.0.1322.58009), X64 RyuJIT AVX2
.NET 7.0 : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2

   <TargetFrameworks>net6.0;net7.0;</TargetFrameworks>
   <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   <PublishAot>false</PublishAot>
   <ServerGarbageCollection>true</ServerGarbageCollection>
   <TieredPGO>true</TieredPGO>
   <TieredCompilationQuickJitForLoops>true</TieredCompilationQuickJitForLoops>
   <Platforms>AnyCPU;x64</Platforms>

Regression?

The regression was observed when moving from .NET 6 to .NET 7.

Data

| Method | Job | Runtime | Mean | Error | StdDev | Ratio | RatioSD | Gen0 | Allocated | Alloc Ratio |
|---|---|---|---|---|---|---|---|---|---|---|
| Benchmark1 | .NET 6.0 | .NET 6.0 | 81.72 us | 1.496 us | 1.400 us | 1.00 | 0.00 | 0.1221 | 8.69 KB | 1.00 |
| Benchmark1 | .NET 7.0 | .NET 7.0 | 101.81 us | 0.699 us | 0.654 us | 1.25 | 0.02 | - | 8.53 KB | 0.98 |

Metadata

Metadata

Assignees

Labels

area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
tenet-performance: Performance related issue

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions