From 5b3889971ca13cc2f03e91a5a10c6137d4fa8b7b Mon Sep 17 00:00:00 2001
From: Yat Long Poon
Date: Mon, 22 Sep 2025 17:04:34 +0100
Subject: [PATCH] Add MultiplyAdd and AddReduction to SVE microbenchmark

---
 src/benchmarks/micro/sve/AddReduction.cs | 160 +++++++++++++++++++
 src/benchmarks/micro/sve/MultiplyAdd.cs  | 191 +++++++++++++++++++++++
 2 files changed, 351 insertions(+)
 create mode 100644 src/benchmarks/micro/sve/AddReduction.cs
 create mode 100644 src/benchmarks/micro/sve/MultiplyAdd.cs

diff --git a/src/benchmarks/micro/sve/AddReduction.cs b/src/benchmarks/micro/sve/AddReduction.cs
new file mode 100644
index 00000000000..a9e41d23452
--- /dev/null
+++ b/src/benchmarks/micro/sve/AddReduction.cs
@@ -0,0 +1,160 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    /// <summary>
+    /// Benchmarks summing a <c>double</c> array with scalar, AdvSimd and SVE code.
+    /// </summary>
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class AddReduction
+    {
+        // The filter predicate ignores the benchmark case and returns Sve.IsSupported.
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private double[] _source;
+        private double _result;
+
+        /// <summary>Creates the input array via <c>ValuesGenerator.Array</c> with <see cref="Size"/> elements.</summary>
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _source = ValuesGenerator.Array<double>(Size);
+        }
+
+        /// <summary>
+        /// Compares the last benchmark result against a fresh <see cref="Scalar"/> run.
+        /// The check uses <c>Debug.Assert</c>, so it is only active when DEBUG is defined.
+        /// </summary>
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            double current = _result;
+            Setup();
+            Scalar();
+            double scalar = _result;
+            // Check that the result is the same as scalar (within 10ULP).
+            // Error is due to rounding floating-point additions in different
+            // orderings (for Vector128AddReduction and SveAddReduction).
+            // SveAddSequential has the same ordering as Scalar.
+            // Extract the 11-bit biased exponent; a zero exponent is treated as 1.
+            int e = (int)(BitConverter.DoubleToUInt64Bits(scalar) >> 52 & 0x7ff);
+            if (e == 0) e++;
+            // Size of one ULP at the magnitude of the scalar result.
+            double ulpScale = Math.ScaleB(1.0f, e - 1023 - 52);
+            double ulpError = Math.Abs(current - scalar) / ulpScale;
+            Debug.Assert(ulpError <= 10);
+        }
+
+        /// <summary>Sums the array one element at a time, in index order.</summary>
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (double* a = _source)
+            {
+                double res = 0.0;
+                for (int i = 0; i < Size; i++)
+                {
+                    res += a[i];
+                }
+                _result = res;
+            }
+        }
+
+        /// <summary>Sums the array two elements at a time using 128-bit AdvSimd loads.</summary>
+        [Benchmark]
+        public unsafe void Vector128AddReduction()
+        {
+            fixed (double* a = _source)
+            {
+                int i = 0;
+                double res = 0.0;
+                for (; i <= Size - 2; i += 2)
+                {
+                    Vector128<double> data = AdvSimd.LoadVector128(a + i);
+                    // Sum up all lanes and reduce to scalar.
+                    res += AdvSimd.Arm64.AddPairwiseScalar(data).ToScalar();
+                }
+                // Handle Tail.
+                for (; i < Size; i++)
+                {
+                    res += a[i];
+                }
+                _result = res;
+            }
+        }
+
+        /// <summary>Sums the array with SVE using <c>Sve.AddSequentialAcross</c> per loaded vector.</summary>
+        [Benchmark]
+        public unsafe void SveAddSequential()
+        {
+            fixed (double* a = _source)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+
+                Vector<double> resVec = Vector<double>.Zero;
+                Vector<double> pTrue = Sve.CreateTrueMaskDouble();
+                for (; i <= Size - cntd; i += cntd)
+                {
+                    Vector<double> data = Sve.LoadVector(pTrue, a + i);
+                    // Sum up all lanes sequentially and add to scalar.
+                    resVec = Sve.AddSequentialAcross(resVec, data);
+                }
+                // Get the scalar result from the first lane.
+                double res = resVec.ToScalar();
+                // Handle Tail.
+                for (; i < Size; i++)
+                {
+                    res += a[i];
+                }
+                _result = res;
+            }
+        }
+
+        /// <summary>Sums the array with SVE using <c>Sve.AddAcross</c> per loaded vector.</summary>
+        [Benchmark]
+        public unsafe void SveAddReduction()
+        {
+            fixed (double* a = _source)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+                double res = 0.0;
+
+                Vector<double> pTrue = Sve.CreateTrueMaskDouble();
+                for (; i <= Size - cntd; i += cntd)
+                {
+                    Vector<double> data = Sve.LoadVector(pTrue, a + i);
+                    // Sum up all lanes and reduce to scalar.
+                    res += Sve.AddAcross(data).ToScalar();
+                }
+                // Handle Tail.
+                for (; i < Size; i++)
+                {
+                    res += a[i];
+                }
+                _result = res;
+            }
+        }
+    }
+}
diff --git a/src/benchmarks/micro/sve/MultiplyAdd.cs b/src/benchmarks/micro/sve/MultiplyAdd.cs
new file mode 100644
index 00000000000..ceb8ffcc7d3
--- /dev/null
+++ b/src/benchmarks/micro/sve/MultiplyAdd.cs
@@ -0,0 +1,191 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    /// <summary>
+    /// Benchmarks the sum of element-wise products of two <c>int</c> arrays with scalar, AdvSimd and SVE code.
+    /// </summary>
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class MultiplyAdd
+    {
+        // The filter predicate ignores the benchmark case and returns Sve.IsSupported.
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private int[] _source1;
+        private int[] _source2;
+        private int _result;
+
+        /// <summary>Creates both input arrays via <c>ValuesGenerator.Array</c> with <see cref="Size"/> elements each.</summary>
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _source1 = ValuesGenerator.Array<int>(Size);
+            _source2 = ValuesGenerator.Array<int>(Size);
+        }
+
+        /// <summary>
+        /// Compares the last benchmark result against a fresh <see cref="Scalar"/> run.
+        /// The check uses <c>Debug.Assert</c>, so it is only active when DEBUG is defined.
+        /// </summary>
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            int current = _result;
+            Setup();
+            Scalar();
+            int scalar = _result;
+            // Check that the result is the same as the scalar result.
+            Debug.Assert(current == scalar);
+        }
+
+        /// <summary>Accumulates <c>a[i] * b[i]</c> one element at a time.</summary>
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (int* a = _source1, b = _source2)
+            {
+                int res = 0;
+                for (int i = 0; i < Size; i++)
+                {
+                    res += (int)(a[i] * b[i]);
+                }
+                _result = res;
+            }
+        }
+
+        /// <summary>Accumulates the products with 128-bit AdvSimd vectors, unrolled by 4, plus a scalar tail.</summary>
+        [Benchmark]
+        public unsafe void Vector128MultiplyAdd()
+        {
+            fixed (int* a = _source1, b = _source2)
+            {
+                int incr = sizeof(Vector128<int>) / sizeof(int);
+                int i = 0;
+
+                // The length of the tail is Size modulo 4 * element count.
+                int lmt = Size - (Size % (incr << 2));
+
+                Vector128<int> res1 = Vector128<int>.Zero;
+                Vector128<int> res2 = Vector128<int>.Zero;
+                Vector128<int> res3 = Vector128<int>.Zero;
+                Vector128<int> res4 = Vector128<int>.Zero;
+
+                for (; i < lmt; i += incr << 2)
+                {
+                    // Unroll loop by 4.
+                    Vector128<int> aVec1 = AdvSimd.LoadVector128(a + i);
+                    Vector128<int> bVec1 = AdvSimd.LoadVector128(b + i);
+                    Vector128<int> aVec2 = AdvSimd.LoadVector128(a + i + incr);
+                    Vector128<int> bVec2 = AdvSimd.LoadVector128(b + i + incr);
+                    Vector128<int> aVec3 = AdvSimd.LoadVector128(a + i + incr * 2);
+                    Vector128<int> bVec3 = AdvSimd.LoadVector128(b + i + incr * 2);
+                    Vector128<int> aVec4 = AdvSimd.LoadVector128(a + i + incr * 3);
+                    Vector128<int> bVec4 = AdvSimd.LoadVector128(b + i + incr * 3);
+
+                    // Calculate 4 vectors at a time.
+                    res1 = AdvSimd.MultiplyAdd(res1, aVec1, bVec1);
+                    res2 = AdvSimd.MultiplyAdd(res2, aVec2, bVec2);
+                    res3 = AdvSimd.MultiplyAdd(res3, aVec3, bVec3);
+                    res4 = AdvSimd.MultiplyAdd(res4, aVec4, bVec4);
+                }
+
+                // Sum all the results between the 4 vectors.
+                res1 = Vector128.Add(res1, res2);
+                res3 = Vector128.Add(res3, res4);
+                res1 = Vector128.Add(res1, res3);
+                int res = AdvSimd.Arm64.AddAcross(res1).ToScalar();
+
+                // Process any remaining elements.
+                for (; i < Size; i++)
+                {
+                    res += (int)(a[i] * b[i]);
+                }
+                _result = res;
+            }
+        }
+
+        /// <summary>Accumulates the products with SVE vectors, unrolled by 4, plus a predicated tail loop.</summary>
+        [Benchmark]
+        public unsafe void SveMultiplyAdd()
+        {
+            fixed (int* a = _source1, b = _source2)
+            {
+                int i = 0;
+                int cntw = (int)Sve.Count32BitElements();
+
+                // The length of the tail is Size modulo 4 * element count.
+                int lmt = (int)Size - (Size % (cntw << 2));
+
+                Vector<int> res1 = Vector<int>.Zero;
+                Vector<int> res2 = Vector<int>.Zero;
+                Vector<int> res3 = Vector<int>.Zero;
+                Vector<int> res4 = Vector<int>.Zero;
+                Vector<int> pTrue = Sve.CreateTrueMaskInt32();
+
+                while (i < lmt)
+                {
+                    // Unroll loop by 4.
+                    Vector<int> aVec1 = Sve.LoadVector(pTrue, a + i);
+                    Vector<int> bVec1 = Sve.LoadVector(pTrue, b + i);
+                    Vector<int> aVec2 = Sve.LoadVector(pTrue, a + i + cntw);
+                    Vector<int> bVec2 = Sve.LoadVector(pTrue, b + i + cntw);
+                    Vector<int> aVec3 = Sve.LoadVector(pTrue, a + i + cntw * 2);
+                    Vector<int> bVec3 = Sve.LoadVector(pTrue, b + i + cntw * 2);
+                    Vector<int> aVec4 = Sve.LoadVector(pTrue, a + i + cntw * 3);
+                    Vector<int> bVec4 = Sve.LoadVector(pTrue, b + i + cntw * 3);
+
+                    // Calculate 4 vectors at a time.
+                    res1 = Sve.MultiplyAdd(res1, aVec1, bVec1);
+                    res2 = Sve.MultiplyAdd(res2, aVec2, bVec2);
+                    res3 = Sve.MultiplyAdd(res3, aVec3, bVec3);
+                    res4 = Sve.MultiplyAdd(res4, aVec4, bVec4);
+
+                    // Increment counter by 4 times the element count.
+                    i = Sve.SaturatingIncrementBy32BitElementCount(i, 4);
+                }
+
+                // Handle remaining elements using predicates.
+                lmt = Size;
+                Vector<int> pLoop = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, lmt);
+                while (Sve.TestAnyTrue(pTrue, pLoop))
+                {
+                    Vector<int> aVec = Sve.LoadVector(pLoop, a + i);
+                    Vector<int> bVec = Sve.LoadVector(pLoop, b + i);
+
+                    // Apply pLoop mask on the result.
+                    res1 = Sve.ConditionalSelect(pLoop, Sve.MultiplyAdd(res1, aVec, bVec), res1);
+
+                    // Increment by a vector length.
+                    i += cntw;
+                    pLoop = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, lmt);
+                }
+
+                // Sum up all elements in the 4 result vectors.
+                res1 = Sve.Add(res1, res2);
+                res3 = Sve.Add(res3, res4);
+                _result = (int)Sve.AddAcross(Sve.Add(res1, res3)).ToScalar();
+            }
+        }
+
+    }
+}