Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 145 additions & 0 deletions src/benchmarks/micro/sve/AddReduction.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
using System;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Extensions;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Filters;
using MicroBenchmarks;

namespace SveBenchmarks
{
[BenchmarkCategory(Categories.Runtime)]
[OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
[Config(typeof(Config))]
public class AddReduction
{
private class Config : ManualConfig
{
public Config()
{
AddFilter(new SimpleFilter(_ => Sve.IsSupported));
}
}

[Params(15, 127, 527, 10015)]
public int Size;

private double[] _source;
private double _result;

[GlobalSetup]
public virtual void Setup()
{
_source = ValuesGenerator.Array<double>(Size);
}

[GlobalCleanup]
public virtual void Verify()
{
double current = _result;
Setup();
Scalar();
double scalar = _result;
// Check that the result is the same as scalar (within 10ULP).
// Error is due to rounding floating-point additions in different
// orderings (for Vector128AddReduction and SveAddReduction).
Copy link

Copilot AI Oct 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Corrected spelling of 'orderings' to 'orders' for consistency with standard terminology in numerical computing contexts.

Suggested change
// orderings (for Vector128AddReduction and SveAddReduction).
// orders (for Vector128AddReduction and SveAddReduction).

Copilot uses AI. Check for mistakes.
// SveAddSequential has the same ordering as Scalar.
int e = (int)(BitConverter.DoubleToUInt64Bits(scalar) >> 52 & 0x7ff);
if (e == 0) e++;
double ulpScale = Math.ScaleB(1.0f, e - 1023 - 52);
Copy link

Copilot AI Oct 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first argument to Math.ScaleB should be 1.0 (double) instead of 1.0f (float). Using a float literal is inconsistent with the double type of ulpScale and may cause implicit conversion issues or loss of precision.

Suggested change
double ulpScale = Math.ScaleB(1.0f, e - 1023 - 52);
double ulpScale = Math.ScaleB(1.0, e - 1023 - 52);

Copilot uses AI. Check for mistakes.
double ulpError = Math.Abs(current - scalar) / ulpScale;
Debug.Assert(ulpError <= 10);
}

[Benchmark]
public unsafe void Scalar()
{
fixed (double* a = _source)
{
double res = 0.0;
for (int i = 0; i < Size; i++)
{
res += a[i];
}
_result = res;
}
}

[Benchmark]
public unsafe void Vector128AddReduction()
{
fixed (double* a = _source)
{
int i = 0;
double res = 0.0;
for (; i <= Size - 2; i += 2)
{
Vector128<double> data = AdvSimd.LoadVector128(a + i);
// Sum up all lanes and reduce to scalar.
res += AdvSimd.Arm64.AddPairwiseScalar(data).ToScalar();
}
// Handle Tail.
for (; i < Size; i++)
{
res += a[i];
}
_result = res;
}
}

[Benchmark]
public unsafe void SveAddSequential()
{
fixed (double* a = _source)
{
int i = 0;
int cntd = (int)Sve.Count64BitElements();

Vector<double> resVec = Vector<double>.Zero;
Vector<double> pTrue = Sve.CreateTrueMaskDouble();
for (; i <= Size - cntd; i += cntd)
{
Vector<double> data = Sve.LoadVector(pTrue, a + i);
// Sum up all lanes sequentially and add to scalar.
resVec = Sve.AddSequentialAcross(resVec, data);
}
// Get the scalar result from the first lane.
double res = resVec.ToScalar();
// Handle Tail.
for (; i < Size; i++)
{
res += a[i];
}
_result = res;
}
}

[Benchmark]
public unsafe void SveAddReduction()
{
fixed (double* a = _source)
{
int i = 0;
int cntd = (int)Sve.Count64BitElements();
double res = 0.0;

Vector<double> pTrue = Sve.CreateTrueMaskDouble();
for (; i <= Size - cntd; i += cntd)
{
Vector<double> data = Sve.LoadVector(pTrue, a + i);
// Sum up all lanes and reduce to scalar.
res += Sve.AddAcross(data).ToScalar();
}
// Handle Tail.
for (; i < Size; i++)
{
res += a[i];
}
_result = res;
}
}
}
}
179 changes: 179 additions & 0 deletions src/benchmarks/micro/sve/MultiplyAdd.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
using System;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Extensions;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Filters;
using MicroBenchmarks;

namespace SveBenchmarks
{
[BenchmarkCategory(Categories.Runtime)]
[OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
[Config(typeof(Config))]
public class MultiplyAdd
{
private class Config : ManualConfig
{
public Config()
{
AddFilter(new SimpleFilter(_ => Sve.IsSupported));
}
}

[Params(15, 127, 527, 10015)]
public int Size;

private int[] _source1;
private int[] _source2;
private int _result;

[GlobalSetup]
public virtual void Setup()
{
_source1 = ValuesGenerator.Array<int>(Size);
_source2 = ValuesGenerator.Array<int>(Size);
}

[GlobalCleanup]
public virtual void Verify()
{
int current = _result;
Setup();
Scalar();
int scalar = _result;
// Check that the result is the same as the scalar result.
Debug.Assert(current == scalar);
}

[Benchmark]
public unsafe void Scalar()
{
fixed (int* a = _source1, b = _source2)
{
int res = 0;
for (int i = 0; i < Size; i++)
{
res += (int)(a[i] * b[i]);
}
_result = res;
}
}

[Benchmark]
public unsafe void Vector128MultiplyAdd()
{
fixed (int* a = _source1, b = _source2)
{
int incr = sizeof(Vector128<int>) / sizeof(int);
int i = 0;

// The length of the tail is Size modulo 4 * element count.
int lmt = Size - (Size % (incr << 2));

Vector128<int> res1 = Vector128<int>.Zero;
Vector128<int> res2 = Vector128<int>.Zero;
Vector128<int> res3 = Vector128<int>.Zero;
Vector128<int> res4 = Vector128<int>.Zero;

for (; i < lmt; i += incr << 2)
{
// Unroll loop by 4.
Vector128<int> aVec1 = AdvSimd.LoadVector128(a + i);
Vector128<int> bVec1 = AdvSimd.LoadVector128(b + i);
Vector128<int> aVec2 = AdvSimd.LoadVector128(a + i + incr);
Vector128<int> bVec2 = AdvSimd.LoadVector128(b + i + incr);
Vector128<int> aVec3 = AdvSimd.LoadVector128(a + i + incr * 2);
Vector128<int> bVec3 = AdvSimd.LoadVector128(b + i + incr * 2);
Vector128<int> aVec4 = AdvSimd.LoadVector128(a + i + incr * 3);
Vector128<int> bVec4 = AdvSimd.LoadVector128(b + i + incr * 3);

// Calculate 4 vectors at a time.
res1 = AdvSimd.MultiplyAdd(res1, aVec1, bVec1);
res2 = AdvSimd.MultiplyAdd(res2, aVec2, bVec2);
res3 = AdvSimd.MultiplyAdd(res3, aVec3, bVec3);
res4 = AdvSimd.MultiplyAdd(res4, aVec4, bVec4);
}

// Sum all the results between the 4 vectors.
res1 = Vector128.Add(res1, res2);
res3 = Vector128.Add(res3, res4);
res1 = Vector128.Add(res1, res3);
int res = AdvSimd.Arm64.AddAcross(res1).ToScalar();

// Process any remaining elements.
for (; i < Size; i++)
{
res += (int)(a[i] * b[i]);
}
_result = res;
}
}

[Benchmark]
public unsafe void SveMultiplyAdd()
{
fixed (int* a = _source1, b = _source2)
{
int i = 0;
int cntw = (int)Sve.Count32BitElements();

// The length of the tail is Size modulo 4 * element count.
int lmt = (int)Size - (Size % (cntw << 2));

Vector<int> res1 = Vector<int>.Zero;
Vector<int> res2 = Vector<int>.Zero;
Vector<int> res3 = Vector<int>.Zero;
Vector<int> res4 = Vector<int>.Zero;
Vector<int> pTrue = Sve.CreateTrueMaskInt32();

while (i < lmt)
{
// Unroll loop by 4.
Vector<int> aVec1 = Sve.LoadVector(pTrue, a + i);
Vector<int> bVec1 = Sve.LoadVector(pTrue, b + i);
Vector<int> aVec2 = Sve.LoadVector(pTrue, a + i + cntw);
Vector<int> bVec2 = Sve.LoadVector(pTrue, b + i + cntw);
Vector<int> aVec3 = Sve.LoadVector(pTrue, a + i + cntw * 2);
Vector<int> bVec3 = Sve.LoadVector(pTrue, b + i + cntw * 2);
Vector<int> aVec4 = Sve.LoadVector(pTrue, a + i + cntw * 3);
Vector<int> bVec4 = Sve.LoadVector(pTrue, b + i + cntw * 3);

// Calculate 4 vectors at a time.
res1 = Sve.MultiplyAdd(res1, aVec1, bVec1);
res2 = Sve.MultiplyAdd(res2, aVec2, bVec2);
res3 = Sve.MultiplyAdd(res3, aVec3, bVec3);
res4 = Sve.MultiplyAdd(res4, aVec4, bVec4);

// Increment counter by 4 times the element count.
i = Sve.SaturatingIncrementBy32BitElementCount(i, 4);
}

// Handle remaining elements using predicates.
lmt = Size;
Vector<int> pLoop = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, lmt);
while (Sve.TestAnyTrue(pTrue, pLoop))
{
Vector<int> aVec = Sve.LoadVector(pLoop, a + i);
Vector<int> bVec = Sve.LoadVector(pLoop, b + i);

// Apply pLoop mask on the result.
res1 = Sve.ConditionalSelect(pLoop, Sve.MultiplyAdd(res1, aVec, bVec), res1);

// Increment by a vector length.
i += cntw;
pLoop = (Vector<int>)Sve.CreateWhileLessThanMask32Bit(i, lmt);
}

// Sum up all elements in the 4 result vectors.
res1 = Sve.Add(res1, res2);
res3 = Sve.Add(res3, res4);
_result = (int)Sve.AddAcross(Sve.Add(res1, res3)).ToScalar();
}
}

}
}
Loading