From 82bc79744b3c052a431722ffed6c4827db616a70 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 30 May 2025 18:22:36 +1000 Subject: [PATCH 01/20] Port ColorSpaceTransformUtils --- .../Common/Helpers/Vector128Utilities.cs | 103 +++++++++++++ .../Common/Helpers/Vector256Utilities.cs | 93 ++++++++++++ .../Webp/Lossless/ColorSpaceTransformUtils.cs | 142 +++++++++--------- .../WebP/ColorSpaceTransformUtilsTests.cs | 8 +- 4 files changed, 271 insertions(+), 75 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index dbe0a1fcec..f89900d7e5 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -304,6 +304,37 @@ public static Vector128 PackUnsignedSaturate(Vector128 left, Vector return Vector128.Narrow(lefClamped, rightClamped); } + /// + /// Packs signed 32-bit integers to unsigned 16-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 PackUnsignedSaturate(Vector128 left, Vector128 right) + { + if (Sse41.IsSupported) + { + return Sse41.PackUnsignedSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right); + } + + Vector128 min = Vector128.Create((int)ushort.MinValue); + Vector128 max = Vector128.Create((int)ushort.MaxValue); + Vector128 lefClamped = Clamp(left, min, max).AsUInt32(); + Vector128 rightClamped = Clamp(right, min, max).AsUInt32(); + return Vector128.Narrow(lefClamped, rightClamped); + } + /// /// Packs signed 32-bit integers to signed 16-bit integers and saturates. 
/// @@ -347,6 +378,78 @@ public static Vector128 PackSignedSaturate(Vector128 left, Vector128 public static Vector128 Clamp(Vector128 value, Vector128 min, Vector128 max) => Vector128.Min(Vector128.Max(value, min), max); + /// + /// Multiply the packed 16-bit integers in and , producing + /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. + /// + /// + /// A vector containing the low 16 bits of the products of the packed 16-bit integers + /// from and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyLow(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); + (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLower * rightLower; + Vector128 prodHi = leftUpper * rightUpper; + + // Narrow the two int vectors back into one short vector + return Vector128.Narrow(prodLo, prodHi); + } + + /// + /// Multiply the packed 16-bit integers in and , producing + /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. + /// + /// + /// A vector containing the high 16 bits of the products of the packed 16-bit integers + /// from and . 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyHigh(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); + (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLower * rightLower; + Vector128 prodHi = leftUpper * rightUpper; + + // Arithmetic shift right by 16 bits to extract the high word + prodLo >>= 16; + prodHi >>= 16; + + // Narrow the two int vectors back into one short vector + return Vector128.Narrow(prodLo, prodHi); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 817d6e6070..dfefd2d346 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -162,6 +162,27 @@ public static Vector256 MultiplySubtract( return (vm0 * vm1) - vs; } + /// + /// Packs signed 32-bit integers to signed 16-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 PackUnsignedSaturate(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.PackUnsignedSaturate(left, right); + } + + Vector256 min = Vector256.Create((int)ushort.MinValue); + Vector256 max = Vector256.Create((int)ushort.MaxValue); + Vector256 lefClamped = Clamp(left, min, max).AsUInt32(); + Vector256 rightClamped = Clamp(right, min, max).AsUInt32(); + return Vector256.Narrow(lefClamped, rightClamped); + } + /// /// Packs signed 32-bit integers to signed 16-bit integers and saturates. /// @@ -210,6 +231,78 @@ public static Vector256 Widen(Vector128 value) return Vector256.WidenLower(value.ToVector256()); } + /// + /// Multiply the packed 16-bit integers in and , producing + /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. + /// + /// + /// A vector containing the low 16 bits of the products of the packed 16-bit integers + /// from and . 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplyLow(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.MultiplyLow(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector256 leftLower, Vector256 leftUpper) = Vector256.Widen(left); + (Vector256 rightLower, Vector256 rightUpper) = Vector256.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector256 prodLo = leftLower * rightLower; + Vector256 prodHi = leftUpper * rightUpper; + + // Narrow the two int vectors back into one short vector + return Vector256.Narrow(prodLo, prodHi); + } + + /// + /// Multiply the packed 16-bit integers in and , producing + /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. + /// + /// + /// A vector containing the high 16 bits of the products of the packed 16-bit integers + /// from and . 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplyHigh(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.MultiplyHigh(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector256 leftLower, Vector256 leftUpper) = Vector256.Widen(left); + (Vector256 rightLower, Vector256 rightUpper) = Vector256.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector256 prodLo = leftLower * rightLower; + Vector256 prodHi = leftUpper * rightUpper; + + // Arithmetic shift right by 16 bits to extract the high word + prodLo >>= 16; + prodHi >>= 16; + + // Narrow the two int vectors back into one short vector + return Vector256.Narrow(prodLo, prodHi); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs index 9a6dfb66e8..5c6fb56043 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs @@ -4,7 +4,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Webp.Lossless; @@ -12,17 +12,17 @@ internal static class ColorSpaceTransformUtils { public static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) { - if (Avx2.IsSupported && tileWidth >= 16) + if (Vector256_.SupportsShuffleNativeByte && tileWidth >= 16) { const int span = 16; Span values = stackalloc ushort[span]; - var collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255); - var collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30); - var collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - var collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - var collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); - var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); - var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); + Vector256 collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255); + Vector256 collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30); + Vector256 collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + Vector256 collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + Vector256 collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 
0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + Vector256 multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); + Vector256 multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); for (int y = 0; y < tileHeight; y++) { Span srcSpan = bgra[(y * stride)..]; @@ -33,18 +33,18 @@ public static void CollectColorBlueTransforms(Span bgra, int stride, int t nuint input1Idx = x + (span / 2); Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector256 r0 = Avx2.Shuffle(input0, collectColorBlueTransformsShuffleLowMask256); - Vector256 r1 = Avx2.Shuffle(input1, collectColorBlueTransformsShuffleHighMask256); - Vector256 r = Avx2.Or(r0, r1); - Vector256 gb0 = Avx2.And(input0, collectColorBlueTransformsGreenBlueMask256); - Vector256 gb1 = Avx2.And(input1, collectColorBlueTransformsGreenBlueMask256); - Vector256 gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); - Vector256 g = Avx2.And(gb.AsByte(), collectColorBlueTransformsGreenMask256); - Vector256 a = Avx2.MultiplyHigh(r.AsInt16(), multsr); - Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); - Vector256 c = Avx2.Subtract(gb.AsByte(), b.AsByte()); - Vector256 d = Avx2.Subtract(c, a.AsByte()); - Vector256 e = Avx2.And(d, collectColorBlueTransformsBlueMask256); + Vector256 r0 = Vector256_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask256); + Vector256 r1 = Vector256_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask256); + Vector256 r = r0 | r1; + Vector256 gb0 = input0 & collectColorBlueTransformsGreenBlueMask256; + Vector256 gb1 = input1 & collectColorBlueTransformsGreenBlueMask256; + Vector256 gb = Vector256_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector256 g = gb.AsByte() & collectColorBlueTransformsGreenMask256; + Vector256 a = Vector256_.MultiplyHigh(r.AsInt16(), multsr); + Vector256 b = 
Vector256_.MultiplyHigh(g.AsInt16(), multsg); + Vector256 c = gb.AsByte() - b.AsByte(); + Vector256 d = c - a.AsByte(); + Vector256 e = d & collectColorBlueTransformsBlueMask256; ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = e.AsUInt16(); @@ -59,20 +59,20 @@ public static void CollectColorBlueTransforms(Span bgra, int stride, int t int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); + CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); } } - else if (Sse41.IsSupported) + else if (Vector128.IsHardwareAccelerated) { const int span = 8; Span values = stackalloc ushort[span]; - var collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255); - var collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); - var collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - var collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - var collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); - var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue)); - var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue)); + Vector128 collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255); + Vector128 collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); + Vector128 collectColorBlueTransformsGreenBlueMask = 
Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + Vector128 collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + Vector128 collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + Vector128 multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue)); + Vector128 multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue)); for (int y = 0; y < tileHeight; y++) { Span srcSpan = bgra[(y * stride)..]; @@ -83,18 +83,18 @@ public static void CollectColorBlueTransforms(Span bgra, int stride, int t nuint input1Idx = x + (span / 2); Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector128 r0 = Ssse3.Shuffle(input0, collectColorBlueTransformsShuffleLowMask); - Vector128 r1 = Ssse3.Shuffle(input1, collectColorBlueTransformsShuffleHighMask); - Vector128 r = Sse2.Or(r0, r1); - Vector128 gb0 = Sse2.And(input0, collectColorBlueTransformsGreenBlueMask); - Vector128 gb1 = Sse2.And(input1, collectColorBlueTransformsGreenBlueMask); - Vector128 gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); - Vector128 g = Sse2.And(gb.AsByte(), collectColorBlueTransformsGreenMask); - Vector128 a = Sse2.MultiplyHigh(r.AsInt16(), multsr); - Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); - Vector128 c = Sse2.Subtract(gb.AsByte(), b.AsByte()); - Vector128 d = Sse2.Subtract(c, a.AsByte()); - Vector128 e = Sse2.And(d, collectColorBlueTransformsBlueMask); + Vector128 r0 = Vector128_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask); + Vector128 r1 = Vector128_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask); + Vector128 r = r0 | r1; + Vector128 gb0 = input0 & collectColorBlueTransformsGreenBlueMask; + Vector128 gb1 = input1 & collectColorBlueTransformsGreenBlueMask; + Vector128 
gb = Vector128_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector128 g = gb.AsByte() & collectColorBlueTransformsGreenMask; + Vector128 a = Vector128_.MultiplyHigh(r.AsInt16(), multsr); + Vector128 b = Vector128_.MultiplyHigh(g.AsInt16(), multsg); + Vector128 c = gb.AsByte() - b.AsByte(); + Vector128 d = c - a.AsByte(); + Vector128 e = d & collectColorBlueTransformsBlueMask; ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = e.AsUInt16(); @@ -109,16 +109,16 @@ public static void CollectColorBlueTransforms(Span bgra, int stride, int t int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); + CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); } } else { - CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); + CollectColorBlueTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); } } - private static void CollectColorBlueTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) + private static void CollectColorBlueTransformsScalar(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) { int pos = 0; while (tileHeight-- > 0) @@ -135,11 +135,11 @@ private static void CollectColorBlueTransformsNoneVectorized(Span bgra, in public static void CollectColorRedTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) { - if (Avx2.IsSupported && tileWidth >= 16) + if (Vector256.IsHardwareAccelerated && tileWidth >= 16) { Vector256 collectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte(); Vector256 collectColorRedTransformsAndMask256 = 
Vector256.Create((short)0xff).AsByte(); - var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed)); + Vector256 multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed)); const int span = 16; Span values = stackalloc ushort[span]; for (int y = 0; y < tileHeight; y++) @@ -152,15 +152,15 @@ public static void CollectColorRedTransforms(Span bgra, int stride, int ti nuint input1Idx = x + (span / 2); Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector256 g0 = Avx2.And(input0, collectColorRedTransformsGreenMask256); // 0 0 | g 0 - Vector256 g1 = Avx2.And(input1, collectColorRedTransformsGreenMask256); - Vector256 g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 - Vector256 a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r - Vector256 a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16); - Vector256 a = Avx2.PackUnsignedSaturate(a0, a1); // x r - Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr - Vector256 c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r' - Vector256 d = Avx2.And(c, collectColorRedTransformsAndMask256); // 0 r' + Vector256 g0 = input0 & collectColorRedTransformsGreenMask256; // 0 0 | g 0 + Vector256 g1 = input1 & collectColorRedTransformsGreenMask256; + Vector256 g = Vector256_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 + Vector256 a0 = Vector256.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r + Vector256 a1 = Vector256.ShiftRightLogical(input1.AsInt32(), 16); + Vector256 a = Vector256_.PackUnsignedSaturate(a0, a1); // x r + Vector256 b = Vector256_.MultiplyHigh(g.AsInt16(), multsg); // x dr + Vector256 c = a.AsByte() - b.AsByte(); // x r' + Vector256 d = c & collectColorRedTransformsAndMask256; // 0 r' ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = d.AsUInt16(); @@ -175,14 +175,14 @@ public static void 
CollectColorRedTransforms(Span bgra, int stride, int ti int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); + CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); } } - else if (Sse41.IsSupported) + else if (Vector128.IsHardwareAccelerated) { Vector128 collectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte(); Vector128 collectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte(); - var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed)); + Vector128 multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed)); const int span = 8; Span values = stackalloc ushort[span]; for (int y = 0; y < tileHeight; y++) @@ -195,15 +195,15 @@ public static void CollectColorRedTransforms(Span bgra, int stride, int ti nuint input1Idx = x + (span / 2); Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector128 g0 = Sse2.And(input0, collectColorRedTransformsGreenMask); // 0 0 | g 0 - Vector128 g1 = Sse2.And(input1, collectColorRedTransformsGreenMask); - Vector128 g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 - Vector128 a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r - Vector128 a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16); - Vector128 a = Sse41.PackUnsignedSaturate(a0, a1); // x r - Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr - Vector128 c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r' - Vector128 d = Sse2.And(c, collectColorRedTransformsAndMask); // 0 r' + Vector128 g0 = input0 & collectColorRedTransformsGreenMask; // 0 0 | g 0 + Vector128 g1 = input1 & collectColorRedTransformsGreenMask; + Vector128 g = Vector128_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 + Vector128 a0 = 
Vector128.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r + Vector128 a1 = Vector128.ShiftRightLogical(input1.AsInt32(), 16); + Vector128 a = Vector128_.PackUnsignedSaturate(a0, a1); // x r + Vector128 b = Vector128_.MultiplyHigh(g.AsInt16(), multsg); // x dr + Vector128 c = a.AsByte() - b.AsByte(); // x r' + Vector128 d = c & collectColorRedTransformsAndMask; // 0 r' ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = d.AsUInt16(); @@ -218,16 +218,16 @@ public static void CollectColorRedTransforms(Span bgra, int stride, int ti int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); + CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); } } else { - CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo); + CollectColorRedTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToRed, histo); } } - private static void CollectColorRedTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) + private static void CollectColorRedTransformsScalar(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) { int pos = 0; while (tileHeight-- > 0) diff --git a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs index c5e8c975f1..6073888fe0 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs @@ -71,17 +71,17 @@ private static void RunCollectColorRedTransformsTest() public void CollectColorBlueTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.AllowAll); 
[Fact] - public void CollectColorBlueTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41); + public void CollectColorBlueTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41); [Fact] - public void CollectColorBlueTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2); + public void CollectColorBlueTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2); [Fact] public void CollectColorRedTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.AllowAll); [Fact] - public void CollectColorRedTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41); + public void CollectColorRedTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41); [Fact] - public void CollectColorRedTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2); + public void CollectColorRedTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2); } From c490bc6f66f9fda79456aef28094314ff0070b84 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 30 May 2025 23:37:07 +1000 Subject: [PATCH 02/20] Port TTransformSse41 --- src/ImageSharp/Common/Helpers/Numerics.cs | 17 -- .../Common/Helpers/Vector128Utilities.cs | 227 ++++++++++++++++++ .../Formats/Webp/Lossy/LossyUtils.cs | 145 ++++++----- 3 files changed, 308 insertions(+), 81 
deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index ca14ae4c38..5f91dcd998 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -884,23 +884,6 @@ public static void Accumulate(ref Vector accumulator, Vector values) accumulator += intHigh; } - /// - /// Reduces elements of the vector into one sum. - /// - /// The accumulator to reduce. - /// The sum of all elements. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int ReduceSum(Vector128 accumulator) - { - // Add odd to even. - Vector128 vsum = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0b_11_11_01_01)); - - // Add high to low. - vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10)); - - return Sse2.ConvertToInt32(vsum); - } - /// /// Reduces elements of the vector into one sum. /// diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index f89900d7e5..2c37a493ea 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -88,6 +88,30 @@ public static Vector128 ShuffleNative(Vector128 vector, [ConstantE return Vector128.Shuffle(vector, indices); } + /// + /// Creates a new vector by selecting values from an input vector using the control. + /// + /// The input vector from which values are selected. + /// The shuffle control byte. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ShuffleNative(Vector128 vector, [ConstantExpected] byte control) + { + if (Sse2.IsSupported) + { + return Sse2.Shuffle(vector, control); + } + + // Don't use InverseMMShuffle here as we want to avoid the cast. 
+ Vector128 indices = Vector128.Create( + control & 0x3, + (control >> 2) & 0x3, + (control >> 4) & 0x3, + (control >> 6) & 0x3); + + return Vector128.Shuffle(vector, indices); + } + /// /// Creates a new vector by selecting values from an input vector using a set of indices. /// @@ -412,6 +436,31 @@ public static Vector128 MultiplyLow(Vector128 left, Vector128 MultiplyAddAdjacent(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyAddAdjacent(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); + (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLower * rightLower; + Vector128 prodHi = leftUpper * rightUpper; + + // Extract the low and high parts of the products shuffling them to form a result we can add together. + // Use out-of-bounds to zero out the unused lanes. + Vector128 v0 = Vector128.Shuffle(prodLo, Vector128.Create(0, 2, 8, 8)); + Vector128 v1 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 0, 2)); + Vector128 v2 = Vector128.Shuffle(prodLo, Vector128.Create(1, 3, 8, 8)); + Vector128 v3 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 1, 3)); + + return v0 + v1 + v2 + v3; + } + /// /// Multiply the packed 16-bit integers in and , producing /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result. @@ -450,6 +499,184 @@ public static Vector128 MultiplyHigh(Vector128 left, Vector128 + /// Unpack and interleave 64-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 64-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 64-bit integers to unpack from the high half. 
+ /// + /// + /// A vector containing the unpacked and interleaved 64-bit integers from the high + /// halves of and . + /// + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + return Vector128.Create(left.GetUpper(), right.GetUpper()); + } + + /// + /// Unpack and interleave 64-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 64-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 64-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 64-bit integers from the low + /// halves of and . + /// + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + return Vector128.Create(left.GetLower(), right.GetLower()); + } + + /// + /// Unpack and interleave 32-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 32-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 32-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 32-bit integers from the high + /// halves of and . 
+ /// + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 2, 1, 3)); + } + + /// + /// Unpack and interleave 32-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 32-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 32-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 32-bit integers from the low + /// halves of and . + /// + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 2, 1, 3)); + } + + /// + /// Unpack and interleave 16-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 16-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 16-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 16-bit integers from the high + /// halves of and . 
+ /// + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7)); + } + + /// + /// Unpack and interleave 16-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 16-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 16-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 16-bit integers from the low + /// halves of and . + /// + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7)); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index aae4181ce0..9c73a32072 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -7,6 +7,7 @@ using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Webp.Lossy; @@ -127,7 +128,7 @@ public static int Vp8_Sse4x4(Span a, Span b) Vector128 e1 = Sse2.MultiplyAddAdjacent(d1, d1); Vector128 sum = Sse2.Add(e0, e1); - return 
Numerics.ReduceSum(sum); + return ReduceSum(sum); } if (AdvSimd.IsSupported) @@ -174,12 +175,12 @@ private static int Vp8_Sse16xN_Sse2(Span a, Span b, int numPairs) Vector128 sum1 = SubtractAndAccumulate(a0, b0); Vector128 sum2 = SubtractAndAccumulate(a1, b1); - sum = Sse2.Add(sum, Sse2.Add(sum1, sum2)); + sum += sum1 + sum2; offset += 2 * WebpConstants.Bps; } - return Numerics.ReduceSum(sum); + return ReduceSum(sum); } [MethodImpl(InliningOptions.ShortMethod)] @@ -378,17 +379,16 @@ public static int Vp8Disto16X16(Span a, Span b, Span w, Span [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8Disto4X4(Span a, Span b, Span w, Span scratch) { - if (Sse41.IsSupported) + if (Vector128.IsHardwareAccelerated) { - int diffSum = TTransformSse41(a, b, w); + int diffSum = TTransformVector128(a, b, w); return Math.Abs(diffSum) >> 5; } - else - { - int sum1 = TTransform(a, w, scratch); - int sum2 = TTransform(b, w, scratch); - return Math.Abs(sum2 - sum1) >> 5; - } + + int sum1 = TTransform(a, w, scratch); + int sum2 = TTransform(b, w, scratch); + + return Math.Abs(sum2 - sum1) >> 5; } public static void DC16(Span dst, Span yuv, int offset) @@ -905,7 +905,7 @@ public static int TTransform(Span input, Span w, Span scratch /// Returns the weighted sum of the absolute value of transformed coefficients. /// w[] contains a row-major 4 by 4 symmetric matrix. /// - public static int TTransformSse41(Span inputA, Span inputB, Span w) + public static int TTransformVector128(Span inputA, Span inputB, Span w) { // Load and combine inputs. Vector128 ina0 = Unsafe.As>(ref MemoryMarshal.GetReference(inputA)); @@ -918,14 +918,14 @@ public static int TTransformSse41(Span inputA, Span inputB, Span inb3 = Unsafe.As>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 3, 16))).AsInt64(); // Combine inA and inB (we'll do two transforms in parallel). 
- Vector128 inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32()); - Vector128 inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32()); - Vector128 inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32()); - Vector128 inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32()); - Vector128 tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte()); - Vector128 tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte()); - Vector128 tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte()); - Vector128 tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte()); + Vector128 inab0 = Vector128_.UnpackLow(ina0.AsInt32(), inb0.AsInt32()); + Vector128 inab1 = Vector128_.UnpackLow(ina1.AsInt32(), inb1.AsInt32()); + Vector128 inab2 = Vector128_.UnpackLow(ina2.AsInt32(), inb2.AsInt32()); + Vector128 inab3 = Vector128_.UnpackLow(ina3.AsInt32(), inb3.AsInt32()); + Vector128 tmp0 = Vector128.WidenLower(inab0.AsByte()).AsInt16(); + Vector128 tmp1 = Vector128.WidenLower(inab1.AsByte()).AsInt16(); + Vector128 tmp2 = Vector128.WidenLower(inab2.AsByte()).AsInt16(); + Vector128 tmp3 = Vector128.WidenLower(inab3.AsByte()).AsInt16(); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 @@ -934,14 +934,14 @@ public static int TTransformSse41(Span inputA, Span inputB, Span a0 = Sse2.Add(tmp0, tmp2); - Vector128 a1 = Sse2.Add(tmp1, tmp3); - Vector128 a2 = Sse2.Subtract(tmp1, tmp3); - Vector128 a3 = Sse2.Subtract(tmp0, tmp2); - Vector128 b0 = Sse2.Add(a0, a1); - Vector128 b1 = Sse2.Add(a3, a2); - Vector128 b2 = Sse2.Subtract(a3, a2); - Vector128 b3 = Sse2.Subtract(a0, a1); + Vector128 a0 = tmp0 + tmp2; + Vector128 a1 = tmp1 + tmp3; + Vector128 a2 = tmp1 - tmp3; + Vector128 a3 = tmp0 - tmp2; + Vector128 b0 = a0 + a1; + Vector128 b1 = a3 + a2; + Vector128 b2 = a3 - a2; + Vector128 b3 = a0 - a1; // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 @@ -959,38 +959,38 @@ public static int TTransformSse41(Span inputA, Span inputB, Span w8 = Unsafe.As>(ref MemoryMarshal.GetReference(w.Slice(8, 
8))); // Calculate a and b (two 4x4 at once). - a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16()); - a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16()); - a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16()); - a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16()); - b0 = Sse2.Add(a0, a1); - b1 = Sse2.Add(a3, a2); - b2 = Sse2.Subtract(a3, a2); - b3 = Sse2.Subtract(a0, a1); + a0 = output0.AsInt16() + output2.AsInt16(); + a1 = output1.AsInt16() + output3.AsInt16(); + a2 = output1.AsInt16() - output3.AsInt16(); + a3 = output0.AsInt16() - output2.AsInt16(); + b0 = a0 + a1; + b1 = a3 + a2; + b2 = a3 - a2; + b3 = a0 - a1; // Separate the transforms of inA and inB. - Vector128 ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64()); - Vector128 ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64()); - Vector128 bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64()); - Vector128 bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64()); + Vector128 ab0 = Vector128_.UnpackLow(b0.AsInt64(), b1.AsInt64()); + Vector128 ab2 = Vector128_.UnpackLow(b2.AsInt64(), b3.AsInt64()); + Vector128 bb0 = Vector128_.UnpackHigh(b0.AsInt64(), b1.AsInt64()); + Vector128 bb2 = Vector128_.UnpackHigh(b2.AsInt64(), b3.AsInt64()); - Vector128 ab0Abs = Ssse3.Abs(ab0.AsInt16()); - Vector128 ab2Abs = Ssse3.Abs(ab2.AsInt16()); - Vector128 b0Abs = Ssse3.Abs(bb0.AsInt16()); - Vector128 bb2Abs = Ssse3.Abs(bb2.AsInt16()); + Vector128 ab0Abs = Vector128.Abs(ab0.AsInt16()); + Vector128 ab2Abs = Vector128.Abs(ab2.AsInt16()); + Vector128 b0Abs = Vector128.Abs(bb0.AsInt16()); + Vector128 bb2Abs = Vector128.Abs(bb2.AsInt16()); // weighted sums. 
- Vector128 ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16()); - Vector128 ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16()); - Vector128 b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16()); - Vector128 bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16()); - Vector128 ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8); - Vector128 b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8); + Vector128 ab0mulw0 = Vector128_.MultiplyAddAdjacent(ab0Abs, w0.AsInt16()); + Vector128 ab2mulw8 = Vector128_.MultiplyAddAdjacent(ab2Abs, w8.AsInt16()); + Vector128 b0mulw0 = Vector128_.MultiplyAddAdjacent(b0Abs, w0.AsInt16()); + Vector128 bb2mulw8 = Vector128_.MultiplyAddAdjacent(bb2Abs, w8.AsInt16()); + Vector128 ab0ab2Sum = ab0mulw0 + ab2mulw8; + Vector128 b0w0bb2w8Sum = b0mulw0 + bb2mulw8; // difference of weighted sums. - Vector128 result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32()); + Vector128 result = ab0ab2Sum - b0w0bb2w8Sum; - return Numerics.ReduceSum(result); + return ReduceSum(result); } // Transpose two 4x4 16b matrices horizontally stored in registers. 
@@ -1002,28 +1002,28 @@ public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 - Vector128 transpose00 = Sse2.UnpackLow(b0, b1); - Vector128 transpose01 = Sse2.UnpackLow(b2, b3); - Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); - Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); + Vector128 transpose00 = Vector128_.UnpackLow(b0, b1); + Vector128 transpose01 = Vector128_.UnpackLow(b2, b3); + Vector128 transpose02 = Vector128_.UnpackHigh(b0, b1); + Vector128 transpose03 = Vector128_.UnpackHigh(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 - Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); - Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); + Vector128 transpose10 = Vector128_.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose11 = Vector128_.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); + Vector128 transpose12 = Vector128_.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose13 = Vector128_.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 - output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); - output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); - output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); - output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + output0 = Vector128_.UnpackLow(transpose10.AsInt64(), 
transpose11.AsInt64()); + output1 = Vector128_.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); + output2 = Vector128_.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); + output3 = Vector128_.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -1910,6 +1910,23 @@ public static void Mean16x4(Span input, Span dc) // Cost of coding one event with probability 'proba'. public static int Vp8BitCost(int bit, byte proba) => bit == 0 ? WebpLookupTables.Vp8EntropyCost[proba] : WebpLookupTables.Vp8EntropyCost[255 - proba]; + /// + /// Reduces elements of the vector into one sum. + /// + /// The accumulator to reduce. + /// The sum of all elements. + [MethodImpl(InliningOptions.ShortMethod)] + private static int ReduceSum(Vector128 accumulator) + { + // Add odd to even. + Vector128 vsum = accumulator + Vector128_.ShuffleNative(accumulator, 0b_11_11_01_01); + + // Add high to low. + vsum += Vector128_.ShuffleNative(vsum, 0b_11_10_11_10); + + return vsum.ToScalar(); + } + [MethodImpl(InliningOptions.ShortMethod)] private static void Put16(int v, Span dst) { From dd9bd0a6cf148ba19378d40e51598acc5bad32ef Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 30 May 2025 23:40:31 +1000 Subject: [PATCH 03/20] Use explicit type --- .../Formats/Webp/Lossy/LossyUtils.cs | 80 +++++++++---------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 9c73a32072..7bc995030e 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -67,16 +67,16 @@ public static int Vp8_Sse4x4(Span a, Span b) // Load values. 
ref byte aRef = ref MemoryMarshal.GetReference(a); ref byte bRef = ref MemoryMarshal.GetReference(b); - var a0 = Vector256.Create( + Vector256 a0 = Vector256.Create( Unsafe.As>(ref aRef), Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps))); - var a1 = Vector256.Create( + Vector256 a1 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2)), Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3))); - var b0 = Vector256.Create( + Vector256 b0 = Vector256.Create( Unsafe.As>(ref bRef), Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps))); - var b1 = Vector256.Create( + Vector256 b1 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2)), Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3))); @@ -193,16 +193,16 @@ private static int Vp8_Sse16xN_Avx2(Span a, Span b, int numPairs) for (int i = 0; i < numPairs; i++) { // Load values. - var a0 = Vector256.Create( + Vector256 a0 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref aRef, offset)), Unsafe.As>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps))); - var b0 = Vector256.Create( + Vector256 b0 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref bRef, offset)), Unsafe.As>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps))); - var a1 = Vector256.Create( + Vector256 a1 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref aRef, offset + (2 * WebpConstants.Bps))), Unsafe.As>(ref Unsafe.Add(ref aRef, offset + (3 * WebpConstants.Bps)))); - var b1 = Vector256.Create( + Vector256 b1 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (2 * WebpConstants.Bps))), Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (3 * WebpConstants.Bps)))); @@ -1057,24 +1057,24 @@ public static void TransformTwo(Span src, Span dst, Span scrat // Load and concatenate the transform coefficients (we'll do two transforms // in parallel). 
ref short srcRef = ref MemoryMarshal.GetReference(src); - var in0 = Vector128.Create(Unsafe.As(ref srcRef), 0); - var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 4)), 0); - var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 8)), 0); - var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 12)), 0); + Vector128 in0 = Vector128.Create(Unsafe.As(ref srcRef), 0); + Vector128 in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 4)), 0); + Vector128 in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 8)), 0); + Vector128 in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 12)), 0); // a00 a10 a20 a30 x x x x // a01 a11 a21 a31 x x x x // a02 a12 a22 a32 x x x x // a03 a13 a23 a33 x x x x - var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 16)), 0); - var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 20)), 0); - var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 24)), 0); - var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 28)), 0); + Vector128 inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 16)), 0); + Vector128 inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 20)), 0); + Vector128 inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 24)), 0); + Vector128 inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 28)), 0); - in0 = Sse2.UnpackLow(in0, inb0); - in1 = Sse2.UnpackLow(in1, inb1); - in2 = Sse2.UnpackLow(in2, inb2); - in3 = Sse2.UnpackLow(in3, inb3); + in0 = Vector128_.UnpackLow(in0, inb0); + in1 = Vector128_.UnpackLow(in1, inb1); + in2 = Vector128_.UnpackLow(in2, inb2); + in3 = Vector128_.UnpackLow(in3, inb3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -1086,8 +1086,8 @@ public static void TransformTwo(Span src, Span dst, Span scrat Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); - var k1 = 
Vector128.Create((short)20091); - var k2 = Vector128.Create((short)-30068); + Vector128 k1 = Vector128.Create((short)20091); + Vector128 k2 = Vector128.Create((short)-30068); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2); @@ -1193,10 +1193,10 @@ public static void TransformOne(Span src, Span dst, Span scrat { // Load and concatenate the transform coefficients. ref short srcRef = ref MemoryMarshal.GetReference(src); - var in0 = Vector128.Create(Unsafe.As(ref srcRef), 0); - var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 4)), 0); - var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 8)), 0); - var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 12)), 0); + Vector128 in0 = Vector128.Create(Unsafe.As(ref srcRef), 0); + Vector128 in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 4)), 0); + Vector128 in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 8)), 0); + Vector128 in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 12)), 0); // a00 a10 a20 a30 x x x x // a01 a11 a21 a31 x x x x @@ -1208,8 +1208,8 @@ public static void TransformOne(Span src, Span dst, Span scrat Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); - var k1 = Vector128.Create((short)20091); - var k2 = Vector128.Create((short)-30068); + Vector128 k1 = Vector128.Create((short)20091); + Vector128 k2 = Vector128.Create((short)-30068); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2); @@ -2034,7 +2034,7 @@ private static void DoFilter2(Span p, int offset, int step) // Applies filter on 2 pixels (p0 and q0) private static void DoFilter2Sse2(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int thresh) { - var signBit = Vector128.Create((byte)0x80); + Vector128 signBit = Vector128.Create((byte)0x80); 
// Convert p1/q1 to byte (for GetBaseDelta). Vector128 p1s = Sse2.Xor(p1, signBit); @@ -2063,7 +2063,7 @@ private static void DoFilter4Sse2(ref Vector128 p1, ref Vector128 p0 // Compute hev mask. Vector128 notHev = GetNotHev(ref p1, ref p0, ref q0, ref q1, tresh); - var signBit = Vector128.Create((byte)0x80); + Vector128 signBit = Vector128.Create((byte)0x80); // Convert to signed values. p1 = Sse2.Xor(p1, signBit); @@ -2107,7 +2107,7 @@ private static void DoFilter6Sse2(ref Vector128 p2, ref Vector128 p1 Vector128 notHev = GetNotHev(ref p1, ref p0, ref q0, ref q1, tresh); // Convert to signed values. - var signBit = Vector128.Create((byte)0x80); + Vector128 signBit = Vector128.Create((byte)0x80); p1 = Sse2.Xor(p1, signBit); p0 = Sse2.Xor(p0, signBit); q0 = Sse2.Xor(q0, signBit); @@ -2128,11 +2128,11 @@ private static void DoFilter6Sse2(ref Vector128 p2, ref Vector128 p1 Vector128 flow = Sse2.UnpackLow(Vector128.Zero, f); Vector128 fhigh = Sse2.UnpackHigh(Vector128.Zero, f); - var nine = Vector128.Create((short)0x0900); + Vector128 nine = Vector128.Create((short)0x0900); Vector128 f9Low = Sse2.MultiplyHigh(flow.AsInt16(), nine); // Filter (lo) * 9 Vector128 f9High = Sse2.MultiplyHigh(fhigh.AsInt16(), nine); // Filter (hi) * 9 - var sixtyThree = Vector128.Create((short)63); + Vector128 sixtyThree = Vector128.Create((short)63); Vector128 a2Low = Sse2.Add(f9Low, sixtyThree); // Filter * 9 + 63 Vector128 a2High = Sse2.Add(f9High, sixtyThree); // Filter * 9 + 63 @@ -2163,7 +2163,7 @@ private static Vector128 GetNotHev(ref Vector128 p1, ref Vector128 t1 = Abs(p1, p0); Vector128 t2 = Abs(q1, q0); - var h = Vector128.Create((byte)hevThresh); + Vector128 h = Vector128.Create((byte)hevThresh); Vector128 tMax = Sse2.Max(t1, t2); Vector128 tMaxH = Sse2.SubtractSaturate(tMax, h); @@ -2252,9 +2252,9 @@ private static bool NeedsFilter2(Span p, int offset, int step, int t, int private static Vector128 NeedsFilter(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh) 
{ - var mthresh = Vector128.Create((byte)thresh); + Vector128 mthresh = Vector128.Create((byte)thresh); Vector128 t1 = Abs(p1, q1); // abs(p1 - q1) - var fe = Vector128.Create((byte)0xFE); + Vector128 fe = Vector128.Create((byte)0xFE); Vector128 t2 = Sse2.And(t1, fe); // set lsb of each byte to zero. Vector128 t3 = Sse2.ShiftRightLogical(t2.AsInt16(), 1); // abs(p1 - q1) / 2 @@ -2400,7 +2400,7 @@ private static Vector128 SignedShift8b(Vector128 x) [MethodImpl(InliningOptions.ShortMethod)] private static void ComplexMask(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh, int ithresh, ref Vector128 mask) { - var it = Vector128.Create((byte)ithresh); + Vector128 it = Vector128.Create((byte)ithresh); Vector128 diff = Sse2.SubtractSaturate(mask, it); Vector128 threshMask = Sse2.CompareEqual(diff, Vector128.Zero); Vector128 filterMask = NeedsFilter(p1, p0, q0, q1, thresh); @@ -2414,7 +2414,7 @@ private static void ComplexMask(Vector128 p1, Vector128 p0, Vector12 // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip). 
private static void Update2Pixels(ref Vector128 pi, ref Vector128 qi, Vector128 a0Low, Vector128 a0High) { - var signBit = Vector128.Create((byte)0x80); + Vector128 signBit = Vector128.Create((byte)0x80); Vector128 a1Low = Sse2.ShiftRightArithmetic(a0Low, 7); Vector128 a1High = Sse2.ShiftRightArithmetic(a0High, 7); Vector128 delta = Sse2.PackSignedSaturate(a1Low, a1High); @@ -2427,8 +2427,8 @@ private static void Update2Pixels(ref Vector128 pi, ref Vector128 qi [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 LoadUvEdge(ref byte uRef, ref byte vRef, int offset) { - var uVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref uRef, (uint)offset)), 0); - var vVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref vRef, (uint)offset)), 0); + Vector128 uVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref uRef, (uint)offset)), 0); + Vector128 vVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref vRef, (uint)offset)), 0); return Sse2.UnpackLow(uVec, vVec).AsByte(); } From 3be2b6a7fc21da49bb0ca824d0cc36b945c1b479 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 31 May 2025 00:08:24 +1000 Subject: [PATCH 04/20] Port TransformTwo --- .../Common/Helpers/Vector128Utilities.cs | 32 ++++++ .../Formats/Webp/Lossy/LossyUtils.cs | 100 +++++++++--------- 2 files changed, 82 insertions(+), 50 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 2c37a493ea..3076788d1b 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -677,6 +677,38 @@ public static Vector128 UnpackLow(Vector128 left, Vector128 return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7)); } + /// + /// Unpack and interleave 8-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit integers to unpack from the low half. 
+ /// + /// + /// The second vector containing packed 8-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit integers from the low + /// halves of and . + /// + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle( + unpacked, + Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15))); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 7bc995030e..7d186cd651 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -1035,7 +1035,7 @@ public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 // Does two transforms. public static void TransformTwo(Span src, Span dst, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: @@ -1083,64 +1083,64 @@ public static void TransformTwo(Span src, Span dst, Span scrat // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. 
- Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + Vector128 a = in0.AsInt16() + in2.AsInt16(); + Vector128 b = in0.AsInt16() - in2.AsInt16(); Vector128 k1 = Vector128.Create((short)20091); Vector128 k2 = Vector128.Create((short)-30068); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3.AsInt16(), c4); + Vector128 c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2); + Vector128 c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1); + Vector128 c3 = in1.AsInt16() - in3.AsInt16(); + Vector128 c4 = c1 - c2; + Vector128 c = c3.AsInt16() + c4; // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); + Vector128 d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1); + Vector128 d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2); + Vector128 d3 = in1.AsInt16() + in3.AsInt16(); + Vector128 d4 = d1 + d2; + Vector128 d = d3 + d4; // Second pass. - Vector128 tmp0 = Sse2.Add(a.AsInt16(), d); - Vector128 tmp1 = Sse2.Add(b.AsInt16(), c); - Vector128 tmp2 = Sse2.Subtract(b.AsInt16(), c); - Vector128 tmp3 = Sse2.Subtract(a.AsInt16(), d); + Vector128 tmp0 = a.AsInt16() + d; + Vector128 tmp1 = b.AsInt16() + c; + Vector128 tmp2 = b.AsInt16() - c; + Vector128 tmp3 = a.AsInt16() - d; // Transpose the two 4x4. Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. 
// First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4)); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); + Vector128 dc = t0.AsInt16() + Vector128.Create((short)4); + a = dc + t2.AsInt16(); + b = dc - t2.AsInt16(); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); + c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2); + c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1); + c3 = t1.AsInt16() - t3.AsInt16(); + c4 = c1 - c2; + c = c3 + c4; // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); + d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1); + d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2); + d3 = t1.AsInt16() + t3.AsInt16(); + d4 = d1 + d2; + d = d3 + d4; // Second pass. - tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + tmp0 = a + d; + tmp1 = b + c; + tmp2 = b - c; + tmp3 = a - d; + Vector128 shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3); // Transpose the two 4x4. 
Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); @@ -1155,22 +1155,22 @@ public static void TransformTwo(Span src, Span dst, Span scrat Vector128 dst3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3)), 0).AsByte(); // Convert to 16b. - dst0 = Sse2.UnpackLow(dst0, Vector128.Zero); - dst1 = Sse2.UnpackLow(dst1, Vector128.Zero); - dst2 = Sse2.UnpackLow(dst2, Vector128.Zero); - dst3 = Sse2.UnpackLow(dst3, Vector128.Zero); + dst0 = Vector128_.UnpackLow(dst0, Vector128.Zero); + dst1 = Vector128_.UnpackLow(dst1, Vector128.Zero); + dst2 = Vector128_.UnpackLow(dst2, Vector128.Zero); + dst3 = Vector128_.UnpackLow(dst3, Vector128.Zero); // Add the inverse transform(s). - dst0 = Sse2.Add(dst0.AsInt16(), t0.AsInt16()).AsByte(); - dst1 = Sse2.Add(dst1.AsInt16(), t1.AsInt16()).AsByte(); - dst2 = Sse2.Add(dst2.AsInt16(), t2.AsInt16()).AsByte(); - dst3 = Sse2.Add(dst3.AsInt16(), t3.AsInt16()).AsByte(); + dst0 = (dst0.AsInt16() + t0.AsInt16()).AsByte(); + dst1 = (dst1.AsInt16() + t1.AsInt16()).AsByte(); + dst2 = (dst2.AsInt16() + t2.AsInt16()).AsByte(); + dst3 = (dst3.AsInt16() + t3.AsInt16()).AsByte(); // Unsigned saturate to 8b. - dst0 = Sse2.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); - dst1 = Sse2.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); - dst2 = Sse2.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); - dst3 = Sse2.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); + dst0 = Vector128_.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); + dst1 = Vector128_.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); + dst2 = Vector128_.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); + dst3 = Vector128_.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); // Store the results. // Store eight bytes/pixels per line. 
From 0a9c407ed46e22789d962eae88efced5f8e67386 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 31 May 2025 00:39:32 +1000 Subject: [PATCH 05/20] Add explicit AdvSimd to MultiplyAddAdjacent --- .../Common/Helpers/Vector128Utilities.cs | 53 ++++++++++++++----- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 3076788d1b..a96f5fa73a 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -436,6 +436,20 @@ public static Vector128 MultiplyLow(Vector128 left, Vector128 + /// Multiply packed signed 16-bit integers in and , producing + /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and + /// pack the results. + /// + /// + /// The first vector containing packed signed 16-bit integers to multiply and add. + /// + /// + /// The second vector containing packed signed 16-bit integers to multiply and add. 
+ /// + /// + /// A vector containing the results of multiplying and adding adjacent pairs of packed signed 16-bit integers + /// public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -443,22 +457,35 @@ public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector12 return Sse2.MultiplyAddAdjacent(left, right); } - // Widen each half of the short vectors into two int vectors - (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); - (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + if (AdvSimd.IsSupported) + { + Vector128 prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower()); + Vector128 prodHi = AdvSimd.MultiplyWideningLower(left.GetUpper(), right.GetUpper()); - // Elementwise multiply: each int lane now holds the full 32-bit product - Vector128 prodLo = leftLower * rightLower; - Vector128 prodHi = leftUpper * rightUpper; + Vector128 v0 = AdvSimd.AddPairwiseWidening(prodLo); + Vector128 v1 = AdvSimd.AddPairwiseWidening(prodHi); - // Extract the low and high parts of the products shuffling them to form a result we can add together. - // Use out-of-bounds to zero out the unused lanes. 
- Vector128 v0 = Vector128.Shuffle(prodLo, Vector128.Create(0, 2, 8, 8)); - Vector128 v1 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 0, 2)); - Vector128 v2 = Vector128.Shuffle(prodLo, Vector128.Create(1, 3, 8, 8)); - Vector128 v3 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 1, 3)); + return Vector128.Narrow(v0, v1); + } - return v0 + v1 + v2 + v3; + { + // Widen each half of the short vectors into two int vectors + (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); + (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLower * rightLower; + Vector128 prodHi = leftUpper * rightUpper; + + // Extract the low and high parts of the products shuffling them to form a result we can add together. + // Use out-of-bounds to zero out the unused lanes. + Vector128 v0 = Vector128.Shuffle(prodLo, Vector128.Create(0, 2, 8, 8)); + Vector128 v1 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 0, 2)); + Vector128 v2 = Vector128.Shuffle(prodLo, Vector128.Create(1, 3, 8, 8)); + Vector128 v3 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 1, 3)); + + return v0 + v1 + v2 + v3; + } } /// From cfad39b8fbda99896ec16069e2c3bcd663961c2c Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 31 May 2025 01:04:06 +1000 Subject: [PATCH 06/20] Add XPlat V128 SubtractSaturate --- .../Common/Helpers/Vector128Utilities.cs | 143 +++++++++++++----- 1 file changed, 101 insertions(+), 42 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index a96f5fa73a..c160b9560b 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -402,40 +402,6 @@ public static Vector128 PackSignedSaturate(Vector128 left, Vector128 public static Vector128 Clamp(Vector128 value, Vector128 min, Vector128 max) => 
Vector128.Min(Vector128.Max(value, min), max); - /// - /// Multiply the packed 16-bit integers in and , producing - /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result. - /// - /// - /// The first vector containing packed 16-bit integers to multiply. - /// - /// - /// The second vector containing packed 16-bit integers to multiply. - /// - /// - /// A vector containing the low 16 bits of the products of the packed 16-bit integers - /// from and . - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 MultiplyLow(Vector128 left, Vector128 right) - { - if (Sse2.IsSupported) - { - return Sse2.MultiplyLow(left, right); - } - - // Widen each half of the short vectors into two int vectors - (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); - (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); - - // Elementwise multiply: each int lane now holds the full 32-bit product - Vector128 prodLo = leftLower * rightLower; - Vector128 prodHi = leftUpper * rightUpper; - - // Narrow the two int vectors back into one short vector - return Vector128.Narrow(prodLo, prodHi); - } - /// /// Multiply packed signed 16-bit integers in and , producing /// intermediate signed 32-bit integers. 
Horizontally add adjacent pairs of intermediate 32-bit integers, and @@ -450,6 +416,7 @@ public static Vector128 MultiplyLow(Vector128 left, Vector128 /// A vector containing the results of multiplying and adding adjacent pairs of packed signed 16-bit integers /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -470,12 +437,12 @@ public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector12 { // Widen each half of the short vectors into two int vectors - (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); - (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); // Elementwise multiply: each int lane now holds the full 32-bit product - Vector128 prodLo = leftLower * rightLower; - Vector128 prodHi = leftUpper * rightUpper; + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; // Extract the low and high parts of the products shuffling them to form a result we can add together. // Use out-of-bounds to zero out the unused lanes. @@ -488,6 +455,40 @@ public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector12 } } + /// + /// Multiply the packed 16-bit integers in and , producing + /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. + /// + /// + /// A vector containing the low 16 bits of the products of the packed 16-bit integers + /// from and . 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyLow(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; + + // Narrow the two int vectors back into one short vector + return Vector128.Narrow(prodLo, prodHi); + } + /// /// Multiply the packed 16-bit integers in and , producing /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result. @@ -511,12 +512,12 @@ public static Vector128 MultiplyHigh(Vector128 left, Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); - (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); // Elementwise multiply: each int lane now holds the full 32-bit product - Vector128 prodLo = leftLower * rightLower; - Vector128 prodHi = leftUpper * rightUpper; + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; // Arithmetic shift right by 16 bits to extract the high word prodLo >>= 16; @@ -540,6 +541,7 @@ public static Vector128 MultiplyHigh(Vector128 left, Vector128 and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackHigh(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -569,6 +571,7 @@ public static Vector128 UnpackHigh(Vector128 left, Vector128 r /// A vector containing the unpacked and interleaved 64-bit integers from the low /// halves of and . 
/// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackLow(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -598,6 +601,7 @@ public static Vector128 UnpackLow(Vector128 left, Vector128 ri /// A vector containing the unpacked and interleaved 32-bit integers from the high /// halves of and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackHigh(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -628,6 +632,7 @@ public static Vector128 UnpackHigh(Vector128 left, Vector128 righ /// A vector containing the unpacked and interleaved 32-bit integers from the low /// halves of and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackLow(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -658,6 +663,7 @@ public static Vector128 UnpackLow(Vector128 left, Vector128 right /// A vector containing the unpacked and interleaved 16-bit integers from the high /// halves of and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackHigh(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -688,6 +694,7 @@ public static Vector128 UnpackHigh(Vector128 left, Vector128 and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackLow(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -718,6 +725,7 @@ public static Vector128 UnpackLow(Vector128 left, Vector128 /// A vector containing the unpacked and interleaved 8-bit integers from the low /// halves of and . 
/// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackLow(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -736,6 +744,57 @@ public static Vector128 UnpackLow(Vector128 left, Vector128 ri Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15))); } + /// + /// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 8-bit integers to subtract from. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to subtract. + /// + /// + /// A vector containing the results of subtracting packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 SubtractSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.SubtractSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.SubtractSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.SubtractSaturate(left, right); + } + + // Widen inputs to 16-bit to safely compute unsigned differences without underflow + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Subtract + Vector128 diffLo = leftLo - rightLo; + Vector128 diffHi = leftHi - rightHi; + + // Mask lanes where left >= right to preserve the result + // All other lanes are zeroed (saturate to 0) + Vector128 maskLo = Vector128.GreaterThanOrEqual(leftLo, rightLo).AsUInt16(); + Vector128 maskHi = Vector128.GreaterThanOrEqual(leftHi, rightHi).AsUInt16(); + + diffLo &= maskLo; + diffHi &= maskHi; + + // Narrow back to bytes + return Vector128.Narrow(diffLo, diffHi); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } From 
7223e90bb441b4d14871a37f7ed2237218bc7b30 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 2 Jun 2025 14:47:36 +1000 Subject: [PATCH 07/20] Port Vp8_Sse16x16 --- .../Common/Helpers/Vector128Utilities.cs | 95 +++++++++- .../Common/Helpers/Vector256Utilities.cs | 163 ++++++++++++++++++ .../Formats/Webp/Lossy/LossyUtils.cs | 126 ++++++++------ 3 files changed, 324 insertions(+), 60 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index c160b9560b..c5e16faf99 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -711,6 +711,39 @@ public static Vector128 UnpackLow(Vector128 left, Vector128 return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7)); } + /// + /// Unpack and interleave 8-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 8-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit integers from the high + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); + return Vector128.Shuffle( + unpacked, + Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15))); + } + /// /// Unpack and interleave 8-bit integers from the low half of and /// and store the results in the result. 
@@ -744,6 +777,56 @@ public static Vector128 UnpackLow(Vector128 left, Vector128 ri Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15))); } + /// + /// Subtract packed signed 16-bit integers in from packed signed 16-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed signed 16-bit integers to subtract from. + /// + /// + /// The second vector containing packed signed 16-bit integers to subtract. + /// + /// + /// A vector containing the results of subtracting packed signed 16-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 SubtractSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.SubtractSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.SubtractSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.SubtractSaturate(left, right); + } + + // Widen inputs to 32-bit signed + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Subtract + Vector128 diffLo = leftLo - rightLo; + Vector128 diffHi = leftHi - rightHi; + + // Clamp to signed 16-bit range + Vector128 shortMin = Vector128.Create((int)short.MinValue); + Vector128 shortMax = Vector128.Create((int)short.MaxValue); + + diffLo = Clamp(diffLo, shortMin, shortMax); + diffHi = Clamp(diffHi, shortMin, shortMax); + + // Narrow back to 16 bit signed. + return Vector128.Narrow(diffLo, diffHi); + } + /// + /// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers + /// in using saturation, and store the results. 
@@ -775,7 +858,7 @@ public static Vector128 SubtractSaturate(Vector128 left, Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); @@ -783,13 +866,11 @@ public static Vector128 SubtractSaturate(Vector128 left, Vector128 diffLo = leftLo - rightLo; Vector128 diffHi = leftHi - rightHi; - // Mask lanes where left >= right to preserve the result - // All other lanes are zeroed (saturate to 0) - Vector128 maskLo = Vector128.GreaterThanOrEqual(leftLo, rightLo).AsUInt16(); - Vector128 maskHi = Vector128.GreaterThanOrEqual(leftHi, rightHi).AsUInt16(); + // Clamp to unsigned 8-bit range + Vector128 max = Vector128.Create((ushort)byte.MaxValue); - diffLo &= maskLo; - diffHi &= maskHi; + diffLo = Clamp(diffLo, Vector128.Zero, max); + diffHi = Clamp(diffHi, Vector128.Zero, max); // Narrow back to bytes return Vector128.Narrow(diffLo, diffHi); diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index dfefd2d346..71dfadc399 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -162,6 +162,33 @@ public static Vector256 MultiplySubtract( return (vm0 * vm1) - vs; } + /// + /// Multiply packed signed 16-bit integers in and , producing + /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and + /// pack the results. + /// + /// + /// The first vector containing packed signed 16-bit integers to multiply and add. + /// + /// + /// The second vector containing packed signed 16-bit integers to multiply and add. 
+ /// + /// + /// A vector containing the results of multiplying and adding adjacent pairs of packed signed 16-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplyAddAdjacent(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.MultiplyAddAdjacent(left, right); + } + + return Vector256.Create( + Vector128_.MultiplyAddAdjacent(left.GetLower(), right.GetLower()), + Vector128_.MultiplyAddAdjacent(left.GetUpper(), right.GetUpper())); + } + /// /// Packs signed 32-bit integers to signed 16-bit integers and saturates. /// @@ -303,6 +330,142 @@ public static Vector256 MultiplyHigh(Vector256 left, Vector256 + /// Unpack and interleave 32-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 32-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 32-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 32-bit integers from the low + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 UnpackLow(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.UnpackLow(left, right); + } + + Vector128 lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower()); + Vector128 hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper()); + + return Vector256.Create(lo, hi); + } + + /// + /// Unpack and interleave 8-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 8-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit integers from the high + /// halves of and . 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 UnpackHigh(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.UnpackHigh(left, right); + } + + Vector128 lo = Vector128_.UnpackHigh(left.GetLower(), right.GetLower()); + Vector128 hi = Vector128_.UnpackHigh(left.GetUpper(), right.GetUpper()); + + return Vector256.Create(lo, hi); + } + + /// + /// Unpack and interleave 8-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 8-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit integers from the low + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 UnpackLow(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.UnpackLow(left, right); + } + + Vector128 lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower()); + Vector128 hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper()); + + return Vector256.Create(lo, hi); + } + + /// + /// Subtract packed signed 16-bit integers in from packed signed 16-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed signed 16-bit integers to subtract from. + /// + /// + /// The second vector containing packed signed 16-bit integers to subtract. 
+ /// + /// + /// A vector containing the results of subtracting packed signed 16-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 SubtractSaturate(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.SubtractSaturate(left, right); + } + + return Vector256.Create( + Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()), + Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper())); + } + + /// + /// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 8-bit integers to subtract from. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to subtract. + /// + /// + /// A vector containing the results of subtracting packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 SubtractSaturate(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.SubtractSaturate(left, right); + } + + return Vector256.Create( + Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()), + Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper())); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 7d186cd651..4e61242c06 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -20,12 +20,12 @@ public static int Vp8_Sse16x16(Span a, Span b) { if (Avx2.IsSupported) { - return Vp8_Sse16xN_Avx2(a, b, 4); + return Vp8_Sse16xN_Vector256(a, b, 4); } - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { - return Vp8_Sse16xN_Sse2(a, b, 8); + return Vp8_16xN_Vector128(a, b, 8); } if (AdvSimd.IsSupported) @@ -40,14 
+40,14 @@ public static int Vp8_Sse16x16(Span a, Span b) [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8_Sse16x8(Span a, Span b) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { - return Vp8_Sse16xN_Avx2(a, b, 2); + return Vp8_Sse16xN_Vector256(a, b, 2); } - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { - return Vp8_Sse16xN_Sse2(a, b, 4); + return Vp8_16xN_Vector128(a, b, 4); } if (AdvSimd.IsSupported) @@ -81,21 +81,21 @@ public static int Vp8_Sse4x4(Span a, Span b) Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3))); // Combine pair of lines. - Vector256 a01 = Avx2.UnpackLow(a0.AsInt32(), a1.AsInt32()); - Vector256 b01 = Avx2.UnpackLow(b0.AsInt32(), b1.AsInt32()); + Vector256 a01 = Vector256_.UnpackLow(a0.AsInt32(), a1.AsInt32()); + Vector256 b01 = Vector256_.UnpackLow(b0.AsInt32(), b1.AsInt32()); // Convert to 16b. - Vector256 a01s = Avx2.UnpackLow(a01.AsByte(), Vector256.Zero); - Vector256 b01s = Avx2.UnpackLow(b01.AsByte(), Vector256.Zero); + Vector256 a01s = Vector256_.UnpackLow(a01.AsByte(), Vector256.Zero); + Vector256 b01s = Vector256_.UnpackLow(b01.AsByte(), Vector256.Zero); // subtract, square and accumulate. - Vector256 d0 = Avx2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16()); - Vector256 e0 = Avx2.MultiplyAddAdjacent(d0, d0); + Vector256 d0 = Vector256_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16()); + Vector256 e0 = Vector256_.MultiplyAddAdjacent(d0, d0); - return Numerics.ReduceSum(e0); + return ReduceSumVector256(e0); } - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load values. ref byte aRef = ref MemoryMarshal.GetReference(a); @@ -110,25 +110,25 @@ public static int Vp8_Sse4x4(Span a, Span b) Vector128 b3 = Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3)); // Combine pair of lines. 
- Vector128 a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32()); - Vector128 a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32()); - Vector128 b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32()); - Vector128 b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32()); + Vector128 a01 = Vector128_.UnpackLow(a0.AsInt32(), a1.AsInt32()); + Vector128 a23 = Vector128_.UnpackLow(a2.AsInt32(), a3.AsInt32()); + Vector128 b01 = Vector128_.UnpackLow(b0.AsInt32(), b1.AsInt32()); + Vector128 b23 = Vector128_.UnpackLow(b2.AsInt32(), b3.AsInt32()); // Convert to 16b. - Vector128 a01s = Sse2.UnpackLow(a01.AsByte(), Vector128.Zero); - Vector128 a23s = Sse2.UnpackLow(a23.AsByte(), Vector128.Zero); - Vector128 b01s = Sse2.UnpackLow(b01.AsByte(), Vector128.Zero); - Vector128 b23s = Sse2.UnpackLow(b23.AsByte(), Vector128.Zero); + Vector128 a01s = Vector128_.UnpackLow(a01.AsByte(), Vector128.Zero); + Vector128 a23s = Vector128_.UnpackLow(a23.AsByte(), Vector128.Zero); + Vector128 b01s = Vector128_.UnpackLow(b01.AsByte(), Vector128.Zero); + Vector128 b23s = Vector128_.UnpackLow(b23.AsByte(), Vector128.Zero); // subtract, square and accumulate. 
- Vector128 d0 = Sse2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16()); + Vector128 d0 = Vector128_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16()); Vector128 d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16()); Vector128 e0 = Sse2.MultiplyAddAdjacent(d0, d0); Vector128 e1 = Sse2.MultiplyAddAdjacent(d1, d1); Vector128 sum = Sse2.Add(e0, e1); - return ReduceSum(sum); + return ReduceSumVector128(sum); } if (AdvSimd.IsSupported) @@ -159,7 +159,7 @@ public static int Vp8_SseNxN(Span a, Span b, int w, int h) } [MethodImpl(InliningOptions.ShortMethod)] - private static int Vp8_Sse16xN_Sse2(Span a, Span b, int numPairs) + private static int Vp8_16xN_Vector128(Span a, Span b, int numPairs) { Vector128 sum = Vector128.Zero; nuint offset = 0; @@ -173,18 +173,18 @@ private static int Vp8_Sse16xN_Sse2(Span a, Span b, int numPairs) Vector128 a1 = Unsafe.As>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps)); Vector128 b1 = Unsafe.As>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps)); - Vector128 sum1 = SubtractAndAccumulate(a0, b0); - Vector128 sum2 = SubtractAndAccumulate(a1, b1); + Vector128 sum1 = SubtractAndAccumulateVector128(a0, b0); + Vector128 sum2 = SubtractAndAccumulateVector128(a1, b1); sum += sum1 + sum2; offset += 2 * WebpConstants.Bps; } - return ReduceSum(sum); + return ReduceSumVector128(sum); } [MethodImpl(InliningOptions.ShortMethod)] - private static int Vp8_Sse16xN_Avx2(Span a, Span b, int numPairs) + private static int Vp8_Sse16xN_Vector256(Span a, Span b, int numPairs) { Vector256 sum = Vector256.Zero; nuint offset = 0; @@ -206,14 +206,14 @@ private static int Vp8_Sse16xN_Avx2(Span a, Span b, int numPairs) Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (2 * WebpConstants.Bps))), Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (3 * WebpConstants.Bps)))); - Vector256 sum1 = SubtractAndAccumulate(a0, b0); - Vector256 sum2 = SubtractAndAccumulate(a1, b1); - sum = Avx2.Add(sum, Avx2.Add(sum1, sum2)); + Vector256 sum1 = 
SubtractAndAccumulateVector256(a0, b0); + Vector256 sum2 = SubtractAndAccumulateVector256(a1, b1); + sum += sum1 + sum2; offset += 4 * WebpConstants.Bps; } - return Numerics.ReduceSum(sum); + return ReduceSumVector256(sum); } [MethodImpl(InliningOptions.ShortMethod)] @@ -306,41 +306,41 @@ private static unsafe Vector128 AccumulateSSE16Neon(byte* a, byte* b, Vect } [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 b) + private static Vector128 SubtractAndAccumulateVector128(Vector128 a, Vector128 b) { // Take abs(a-b) in 8b. - Vector128 ab = Sse2.SubtractSaturate(a, b); - Vector128 ba = Sse2.SubtractSaturate(b, a); - Vector128 absAb = Sse2.Or(ab, ba); + Vector128 ab = Vector128_.SubtractSaturate(a, b); + Vector128 ba = Vector128_.SubtractSaturate(b, a); + Vector128 absAb = ab | ba; // Zero-extend to 16b. - Vector128 c0 = Sse2.UnpackLow(absAb, Vector128.Zero); - Vector128 c1 = Sse2.UnpackHigh(absAb, Vector128.Zero); + Vector128 c0 = Vector128_.UnpackLow(absAb, Vector128.Zero); + Vector128 c1 = Vector128_.UnpackHigh(absAb, Vector128.Zero); // Multiply with self. - Vector128 sum1 = Sse2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); - Vector128 sum2 = Sse2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); + Vector128 sum1 = Vector128_.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); + Vector128 sum2 = Vector128_.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); - return Sse2.Add(sum1, sum2); + return sum1 + sum2; } [MethodImpl(InliningOptions.ShortMethod)] - private static Vector256 SubtractAndAccumulate(Vector256 a, Vector256 b) + private static Vector256 SubtractAndAccumulateVector256(Vector256 a, Vector256 b) { // Take abs(a-b) in 8b. - Vector256 ab = Avx2.SubtractSaturate(a, b); - Vector256 ba = Avx2.SubtractSaturate(b, a); + Vector256 ab = Vector256_.SubtractSaturate(a, b); + Vector256 ba = Vector256_.SubtractSaturate(b, a); Vector256 absAb = Avx2.Or(ab, ba); // Zero-extend to 16b. 
- Vector256 c0 = Avx2.UnpackLow(absAb, Vector256.Zero); - Vector256 c1 = Avx2.UnpackHigh(absAb, Vector256.Zero); + Vector256 c0 = Vector256_.UnpackLow(absAb, Vector256.Zero); + Vector256 c1 = Vector256_.UnpackHigh(absAb, Vector256.Zero); // Multiply with self. - Vector256 sum1 = Avx2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); - Vector256 sum2 = Avx2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); + Vector256 sum1 = Vector256_.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); + Vector256 sum2 = Vector256_.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); - return Avx2.Add(sum1, sum2); + return sum1 + sum2; } [MethodImpl(InliningOptions.ShortMethod)] @@ -990,7 +990,7 @@ public static int TTransformVector128(Span inputA, Span inputB, Span // difference of weighted sums. Vector128 result = ab0ab2Sum - b0w0bb2w8Sum; - return ReduceSum(result); + return ReduceSumVector128(result); } // Transpose two 4x4 16b matrices horizontally stored in registers. @@ -1916,7 +1916,27 @@ public static void Mean16x4(Span input, Span dc) /// The accumulator to reduce. /// The sum of all elements. [MethodImpl(InliningOptions.ShortMethod)] - private static int ReduceSum(Vector128 accumulator) + public static int ReduceSumVector256(Vector256 accumulator) + { + // Add upper lane to lower lane. + Vector128 vsum = accumulator.GetLower() + accumulator.GetUpper(); + + // Add odd to even. + vsum += Vector128_.ShuffleNative(vsum, 0b_11_11_01_01); + + // Add high to low. + vsum += Vector128_.ShuffleNative(vsum, 0b_11_10_11_10); + + return vsum.ToScalar(); + } + + /// + /// Reduces elements of the vector into one sum. + /// + /// The accumulator to reduce. + /// The sum of all elements. + [MethodImpl(InliningOptions.ShortMethod)] + private static int ReduceSumVector128(Vector128 accumulator) { // Add odd to even. 
Vector128 vsum = accumulator + Vector128_.ShuffleNative(accumulator, 0b_11_11_01_01); From e6168448a38ed5b35e3853719b0b08cfcc73a860 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 2 Jun 2025 22:53:42 +1000 Subject: [PATCH 08/20] Remove all v128 util restrictions --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 24 +- .../Common/Helpers/Vector128Utilities.cs | 289 ++++++++++++++--- .../Formats/Webp/Lossy/LossyUtils.cs | 302 +++++++++--------- 3 files changed, 410 insertions(+), 205 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 96ddb7976c..0f399d2de0 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -114,7 +114,7 @@ public static void Shuffle4Reduce( { if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) || (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) || - (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)) + Vector128.IsHardwareAccelerated) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -158,7 +158,7 @@ public static void Shuffle3Reduce( ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { int remainder = source.Length % (Vector128.Count * 3); @@ -190,7 +190,7 @@ public static void Pad3Shuffle4Reduce( ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated) { int remainder = source.Length % (Vector128.Count * 3); @@ -223,7 +223,7 @@ public static void Shuffle4Slice3Reduce( ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && 
Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated) { int remainder = source.Length & ((Vector128.Count * 4) - 1); // bit-hack for modulo @@ -405,7 +405,7 @@ private static void Shuffle4( } } } - else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte) + else if (Vector128.IsHardwareAccelerated) { Span temp = stackalloc byte[Vector128.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -445,9 +445,7 @@ private static void Shuffle3( Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && - Vector128_.SupportsShuffleNativeByte && - Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); @@ -507,10 +505,7 @@ private static void Pad3Shuffle4( Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && - Vector128_.SupportsShuffleNativeByte && - Vector128_.SupportsShiftByte && - Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 fill = Vector128.Create(0xff000000ff000000ul).AsByte(); @@ -553,10 +548,7 @@ private static void Shuffle4Slice3( Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && - Vector128_.SupportsShuffleNativeByte && - Vector128_.SupportsShiftByte && - Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); Vector128 maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index c5e16faf99..a6359e6e91 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. 
// Licensed under the Six Labors Split License. -using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -46,24 +45,6 @@ public static bool SupportsShuffleNativeByte } } - /// - /// Gets a value indicating whether right align operations are supported. - /// - public static bool SupportsAlignRight - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Ssse3.IsSupported || AdvSimd.IsSupported; - } - - /// - /// Gets a value indicating whether right or left byte shift operations are supported. - /// - public static bool SupportsShiftByte - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Sse2.IsSupported || AdvSimd.IsSupported; - } - /// /// Creates a new vector by selecting values from an input vector using the control. /// @@ -157,8 +138,7 @@ public static Vector128 ShiftRightBytesInVector(Vector128 value, [Co return AdvSimd.ExtractVector128(value, Vector128.Zero, numBytes); } - ThrowUnreachableException(); - return default; + return Vector128.Shuffle(value, Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + Vector128.Create(numBytes)); } /// @@ -182,8 +162,7 @@ public static Vector128 ShiftLeftBytesInVector(Vector128 value, [Con #pragma warning restore CA1857 // A constant is expected for the parameter } - ThrowUnreachableException(); - return default; + return Vector128.Shuffle(value, Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) - Vector128.Create(numBytes)); } /// @@ -206,8 +185,9 @@ public static Vector128 AlignRight(Vector128 left, Vector128 r return AdvSimd.ExtractVector128(right, left, mask); } - ThrowUnreachableException(); - return default; +#pragma warning disable CA1857 // A constant is expected for the parameter + return ShiftLeftBytesInVector(left, (byte)(Vector128.Count - mask)) | ShiftRightBytesInVector(right, mask); +#pragma warning restore CA1857 // A constant is expected for 
the parameter } /// @@ -390,6 +370,37 @@ public static Vector128 PackSignedSaturate(Vector128 left, Vector128 return Vector128.Narrow(lefClamped, rightClamped); } + /// + /// Packs signed 16-bit integers to signed 8-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 PackSignedSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.PackSignedSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateSigned(left, right); + } + + Vector128 min = Vector128.Create((short)sbyte.MinValue); + Vector128 max = Vector128.Create((short)sbyte.MaxValue); + Vector128 lefClamped = Clamp(left, min, max); + Vector128 rightClamped = Clamp(right, min, max); + return Vector128.Narrow(lefClamped, rightClamped); + } + /// /// Restricts a vector between a minimum and a maximum value. 
/// @@ -739,9 +750,7 @@ public static Vector128 UnpackHigh(Vector128 left, Vector128 r } Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); - return Vector128.Shuffle( - unpacked, - Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15))); + return Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); } /// @@ -772,9 +781,69 @@ public static Vector128 UnpackLow(Vector128 left, Vector128 ri } Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); - return Vector128.Shuffle( - unpacked, - Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15))); + return Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); + } + + /// + /// Unpack and interleave 8-bit signed integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit signed integers to unpack from the high half. + /// + /// + /// The second vector containing packed 8-bit signed integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit signed integers from the high + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); + } + + /// + /// Unpack and interleave 8-bit signed integers from the low half of and + /// and store the results in the result. 
+ /// + /// + /// The first vector containing packed 8-bit signed integers to unpack from the low half. + /// + /// + /// The second vector containing packed 8-bit signed integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit signed integers from the low + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); } /// @@ -817,16 +886,65 @@ public static Vector128 SubtractSaturate(Vector128 left, Vector128 Vector128 diffHi = leftHi - rightHi; // Clamp to signed 16-bit range - Vector128 shortMin = Vector128.Create((int)short.MinValue); - Vector128 shortMax = Vector128.Create((int)short.MaxValue); + Vector128 min = Vector128.Create((int)short.MinValue); + Vector128 max = Vector128.Create((int)short.MaxValue); - diffLo = Clamp(diffLo, shortMin, shortMax); - diffHi = Clamp(diffHi, shortMin, shortMax); + diffLo = Clamp(diffLo, min, max); + diffHi = Clamp(diffHi, min, max); // Narrow back to 16 bit signed. return Vector128.Narrow(diffLo, diffHi); } + /// + /// Add packed unsigned 8-bit integers in from packed unsigned 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 8-bit integers to add to. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to add. 
+ /// + /// + /// A vector containing the results of adding packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 AddSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.AddSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.AddSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.AddSaturate(left, right); + } + + // Widen inputs to 16-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Add + Vector128 sumLo = leftLo + rightLo; + Vector128 sumHi = leftHi + rightHi; + + // Clamp to signed 8-bit range + Vector128 max = Vector128.Create((ushort)byte.MaxValue); + + sumLo = Clamp(sumLo, Vector128.Zero, max); + sumHi = Clamp(sumHi, Vector128.Zero, max); + + // Narrow back to bytes + return Vector128.Narrow(sumLo, sumHi); + } + /// /// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers /// in using saturation, and store the results. @@ -876,6 +994,103 @@ public static Vector128 SubtractSaturate(Vector128 left, Vector128 throw new UnreachableException(); + /// + /// Add packed unsigned 8-bit integers in from packed unsigned 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 8-bit integers to add to. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to add. 
+ /// + /// + /// A vector containing the results of adding packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 AddSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.AddSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.AddSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.AddSaturate(left, right); + } + + // Widen inputs to 16-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Add + Vector128 sumLo = leftLo + rightLo; + Vector128 sumHi = leftHi + rightHi; + + // Clamp to signed 8-bit range + Vector128 min = Vector128.Create((short)sbyte.MinValue); + Vector128 max = Vector128.Create((short)sbyte.MaxValue); + + sumLo = Clamp(sumLo, min, max); + sumHi = Clamp(sumHi, min, max); + + // Narrow back to signed bytes + return Vector128.Narrow(sumLo, sumHi); + } + + /// + /// Subtract packed signed 8-bit integers in from packed signed 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed signed 8-bit integers to subtract from. + /// + /// + /// The second vector containing packed signed 8-bit integers to subtract. 
+ /// + /// + /// A vector containing the results of subtracting packed signed 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 SubtractSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.SubtractSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.SubtractSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.SubtractSaturate(left, right); + } + + // Widen inputs to 16-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Subtract + Vector128 diffLo = leftLo - rightLo; + Vector128 diffHi = leftHi - rightHi; + + // Clamp to signed 8-bit range + Vector128 min = Vector128.Create((short)sbyte.MinValue); + Vector128 max = Vector128.Create((short)sbyte.MaxValue); + + diffLo = Clamp(diffLo, min, max); + diffHi = Clamp(diffHi, min, max); + + // Narrow back to signed bytes + return Vector128.Narrow(diffLo, diffHi); + } } diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 4e61242c06..b21e3c02ba 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -1521,20 +1521,20 @@ public static void VFilter16(Span p, int offset, int stride, int thresh, i Vector128 p1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset - (2 * stride)))); Vector128 p0 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset - stride))); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t1, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Vector128.Max(mask, AbsVector128(t1, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); Vector128 q0 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)offset)); Vector128 q1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + stride))); Vector128 q2 = Unsafe.As>(ref 
Unsafe.Add(ref pRef, (uint)(offset + (2 * stride)))); t1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (3 * stride)))); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t1, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t1, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); // Store. @@ -1561,17 +1561,17 @@ public static void HFilter16(Span p, int offset, int stride, int thresh, i ref byte bRef = ref Unsafe.Add(ref pRef, (uint)offset - 4); Load16x4(ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, p1)); Load16x4(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(q3, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Sse2.Max(mask, AbsVector128(q1, q0)); + mask = Sse2.Max(mask, AbsVector128(q3, q2)); + mask = Sse2.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); Store16x4(p3, p2, p1, p0, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); @@ -1599,22 +1599,22 @@ public static void VFilter16i(Span p, int offset, int stride, int thresh, Span b 
= p[(offset + (2 * stride))..]; offset += 4 * stride; - Vector128 mask = Abs(p0, p1); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p0, p1); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, p1)); p3 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)offset)); p2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + stride))); Vector128 tmp1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (2 * stride)))); Vector128 tmp2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (3 * stride)))); - mask = Sse2.Max(mask, Abs(tmp1, tmp2)); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, tmp1)); + mask = Sse2.Max(mask, AbsVector128(tmp1, tmp2)); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, tmp1)); // p3 and p2 are not just temporary variables here: they will be // re-used for next span. And q2/q3 will become p1/p0 accordingly. - ComplexMask(p1, p0, p3, p2, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, p3, p2, thresh, ithresh, ref mask); DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); // Store. @@ -1656,17 +1656,17 @@ public static void HFilter16i(Span p, int offset, int stride, int thresh, offset += 4; // Compute partial mask. 
- mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, p1)); Load16x4(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out p3, out p2, out Vector128 tmp1, out Vector128 tmp2); - mask = Sse2.Max(mask, Abs(tmp1, tmp2)); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, tmp1)); + mask = Sse2.Max(mask, AbsVector128(tmp1, tmp2)); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, tmp1)); - ComplexMask(p1, p0, p3, p2, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, p3, p2, thresh, ithresh, ref mask); DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); Store16x4(p1, p0, p3, p2, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); @@ -1695,34 +1695,34 @@ public static void VFilter8(Span u, Span v, int offset, int stride, // Load uv h-edges. 
ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - Vector128 t1 = LoadUvEdge(ref uRef, ref vRef, offset - (4 * stride)); - Vector128 p2 = LoadUvEdge(ref uRef, ref vRef, offset - (3 * stride)); - Vector128 p1 = LoadUvEdge(ref uRef, ref vRef, offset - (2 * stride)); - Vector128 p0 = LoadUvEdge(ref uRef, ref vRef, offset - stride); + Vector128 t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - (4 * stride)); + Vector128 p2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - (3 * stride)); + Vector128 p1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - (2 * stride)); + Vector128 p0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - stride); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t1, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(t1, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, p1)); - Vector128 q0 = LoadUvEdge(ref uRef, ref vRef, offset); - Vector128 q1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); - Vector128 q2 = LoadUvEdge(ref uRef, ref vRef, offset + (2 * stride)); - t1 = LoadUvEdge(ref uRef, ref vRef, offset + (3 * stride)); + Vector128 q0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset); + Vector128 q1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + stride); + Vector128 q2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (2 * stride)); + t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (3 * stride)); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t1, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Sse2.Max(mask, AbsVector128(q1, q0)); + mask = Sse2.Max(mask, AbsVector128(t1, q2)); + mask = Sse2.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); // Store. 
- StoreUv(p2, ref uRef, ref vRef, offset - (3 * stride)); - StoreUv(p1, ref uRef, ref vRef, offset - (2 * stride)); - StoreUv(p0, ref uRef, ref vRef, offset - stride); - StoreUv(q0, ref uRef, ref vRef, offset); - StoreUv(q1, ref uRef, ref vRef, offset + (1 * stride)); - StoreUv(q2, ref uRef, ref vRef, offset + (2 * stride)); + StoreUvVector128(p2, ref uRef, ref vRef, offset - (3 * stride)); + StoreUvVector128(p1, ref uRef, ref vRef, offset - (2 * stride)); + StoreUvVector128(p0, ref uRef, ref vRef, offset - stride); + StoreUvVector128(q0, ref uRef, ref vRef, offset); + StoreUvVector128(q1, ref uRef, ref vRef, offset + (1 * stride)); + StoreUvVector128(q2, ref uRef, ref vRef, offset + (2 * stride)); } else { @@ -1740,17 +1740,17 @@ public static void HFilter8(Span u, Span v, int offset, int stride, ref byte vRef = ref MemoryMarshal.GetReference(v); Load16x4(ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, p1)); Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(q3, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Sse2.Max(mask, AbsVector128(q1, q0)); + mask = Sse2.Max(mask, AbsVector128(q3, q2)); + mask = Sse2.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); Store16x4(p3, p2, p1, p0, ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref 
vRef, (uint)offset - 4), stride); @@ -1771,34 +1771,34 @@ public static void VFilter8i(Span u, Span v, int offset, int stride, // Load uv h-edges. ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - Vector128 t2 = LoadUvEdge(ref uRef, ref vRef, offset); - Vector128 t1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); - Vector128 p1 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 2)); - Vector128 p0 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 3)); + Vector128 t2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset); + Vector128 t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + stride); + Vector128 p1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 2)); + Vector128 p0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 3)); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(t2, t1)); + mask = Sse2.Max(mask, AbsVector128(t1, p1)); offset += 4 * stride; - Vector128 q0 = LoadUvEdge(ref uRef, ref vRef, offset); - Vector128 q1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); - t1 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 2)); - t2 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 3)); + Vector128 q0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset); + Vector128 q1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + stride); + t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 2)); + t2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 3)); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, q1)); + mask = Sse2.Max(mask, AbsVector128(q1, q0)); + mask = Sse2.Max(mask, AbsVector128(t2, t1)); + mask = Sse2.Max(mask, AbsVector128(t1, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); 
DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); // Store. - StoreUv(p1, ref uRef, ref vRef, offset + (-2 * stride)); - StoreUv(p0, ref uRef, ref vRef, offset + (-1 * stride)); - StoreUv(q0, ref uRef, ref vRef, offset); - StoreUv(q1, ref uRef, ref vRef, offset + stride); + StoreUvVector128(p1, ref uRef, ref vRef, offset + (-2 * stride)); + StoreUvVector128(p0, ref uRef, ref vRef, offset + (-1 * stride)); + StoreUvVector128(q0, ref uRef, ref vRef, offset); + StoreUvVector128(q1, ref uRef, ref vRef, offset + stride); } else { @@ -1817,20 +1817,20 @@ public static void HFilter8i(Span u, Span v, int offset, int stride, ref byte vRef = ref MemoryMarshal.GetReference(v); Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 t2, out Vector128 t1, out Vector128 p1, out Vector128 p0); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(t2, t1)); + mask = Sse2.Max(mask, AbsVector128(t1, p1)); // Beginning of q0. offset += 4; Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out t1, out t2); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, q1)); + mask = Sse2.Max(mask, AbsVector128(q1, q0)); + mask = Sse2.Max(mask, AbsVector128(t2, t1)); + mask = Sse2.Max(mask, AbsVector128(t1, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); // Beginning of p1. @@ -2057,24 +2057,24 @@ private static void DoFilter2Sse2(ref Vector128 p1, ref Vector128 p0 Vector128 signBit = Vector128.Create((byte)0x80); // Convert p1/q1 to byte (for GetBaseDelta). 
- Vector128 p1s = Sse2.Xor(p1, signBit); - Vector128 q1s = Sse2.Xor(q1, signBit); - Vector128 mask = NeedsFilter(p1, p0, q0, q1, thresh); + Vector128 p1s = p1 ^ signBit; + Vector128 q1s = q1 ^ signBit; + Vector128 mask = NeedsFilterVector128(p1, p0, q0, q1, thresh); // Flip sign. - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); + p0 ^= signBit; + q0 ^= signBit; - Vector128 a = GetBaseDelta(p1s.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1s.AsSByte()).AsByte(); + Vector128 a = GetBaseDeltaVector128(p1s.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1s.AsSByte()).AsByte(); // Mask filter values we don't care about. - a = Sse2.And(a, mask); + a &= mask; DoSimpleFilterSse2(ref p0, ref q0, a); // Flip sign. - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); + p0 ^= signBit; + q0 ^= signBit; } // Applies filter on 4 pixels (p1, p0, q0 and q1) @@ -2101,8 +2101,8 @@ private static void DoFilter4Sse2(ref Vector128 p1, ref Vector128 p0 t2 = Sse2.AddSaturate(t1, Vector128.Create((byte)3).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 3 Vector128 t3 = Sse2.AddSaturate(t1, Vector128.Create((byte)4).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 4 - t2 = SignedShift8b(t2.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 - t3 = SignedShift8b(t3.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 + t2 = SignedShift8bVector128(t2.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 + t3 = SignedShift8bVector128(t3.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 p0 = Sse2.AddSaturate(p0.AsSByte(), t2).AsByte(); // p0 += t2 q0 = Sse2.SubtractSaturate(q0.AsSByte(), t3).AsByte(); // q0 -= t3 p0 = Sse2.Xor(p0, signBit); @@ -2135,7 +2135,7 @@ private static void DoFilter6Sse2(ref Vector128 p2, ref Vector128 p1 p2 = Sse2.Xor(p2, signBit); q2 = Sse2.Xor(q2, signBit); - Vector128 a = GetBaseDelta(p1.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1.AsSByte()); + Vector128 a = GetBaseDeltaVector128(p1.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1.AsSByte()); // Do simple filter on 
pixels with hev. Vector128 m = Sse2.AndNot(notHev, mask); @@ -2162,9 +2162,9 @@ private static void DoFilter6Sse2(ref Vector128 p2, ref Vector128 p1 Vector128 a0Low = Sse2.Add(a1Low, f9Low); // Filter * 27 + 63 Vector128 a0High = Sse2.Add(a1High, f9High); // Filter * 27 + 63 - Update2Pixels(ref p2, ref q2, a2Low, a2High); - Update2Pixels(ref p1, ref q1, a1Low, a1High); - Update2Pixels(ref p0, ref q0, a0Low, a0High); + Update2PixelsVector128(ref p2, ref q2, a2Low, a2High); + Update2PixelsVector128(ref p1, ref q1, a1Low, a1High); + Update2PixelsVector128(ref p0, ref q0, a0Low, a0High); } private static void DoSimpleFilterSse2(ref Vector128 p0, ref Vector128 q0, Vector128 fl) @@ -2172,16 +2172,16 @@ private static void DoSimpleFilterSse2(ref Vector128 p0, ref Vector128 v3 = Sse2.AddSaturate(fl.AsSByte(), Vector128.Create((byte)3).AsSByte()); Vector128 v4 = Sse2.AddSaturate(fl.AsSByte(), Vector128.Create((byte)4).AsSByte()); - v4 = SignedShift8b(v4.AsByte()).AsSByte(); // v4 >> 3 - v3 = SignedShift8b(v3.AsByte()).AsSByte(); // v3 >> 3 + v4 = SignedShift8bVector128(v4.AsByte()).AsSByte(); // v4 >> 3 + v3 = SignedShift8bVector128(v3.AsByte()).AsSByte(); // v3 >> 3 q0 = Sse2.SubtractSaturate(q0.AsSByte(), v4).AsByte(); // q0 -= v4 p0 = Sse2.AddSaturate(p0.AsSByte(), v3).AsByte(); // p0 += v3 } private static Vector128 GetNotHev(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int hevThresh) { - Vector128 t1 = Abs(p1, p0); - Vector128 t2 = Abs(q1, q0); + Vector128 t1 = AbsVector128(p1, p0); + Vector128 t2 = AbsVector128(q1, q0); Vector128 h = Vector128.Create((byte)hevThresh); Vector128 tMax = Sse2.Max(t1, t2); @@ -2270,21 +2270,21 @@ private static bool NeedsFilter2(Span p, int offset, int step, int t, int WebpLookupTables.Abs0(q2 - q1) <= it && WebpLookupTables.Abs0(q1 - q0) <= it; } - private static Vector128 NeedsFilter(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh) + private static Vector128 NeedsFilterVector128(Vector128 
p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh) { Vector128 mthresh = Vector128.Create((byte)thresh); - Vector128 t1 = Abs(p1, q1); // abs(p1 - q1) + Vector128 t1 = AbsVector128(p1, q1); // abs(p1 - q1) Vector128 fe = Vector128.Create((byte)0xFE); - Vector128 t2 = Sse2.And(t1, fe); // set lsb of each byte to zero. - Vector128 t3 = Sse2.ShiftRightLogical(t2.AsInt16(), 1); // abs(p1 - q1) / 2 + Vector128 t2 = t1 & fe; // set lsb of each byte to zero. + Vector128 t3 = Vector128.ShiftRightLogical(t2.AsInt16(), 1); // abs(p1 - q1) / 2 - Vector128 t4 = Abs(p0, q0); // abs(p0 - q0) - Vector128 t5 = Sse2.AddSaturate(t4, t4); // abs(p0 - q0) * 2 - Vector128 t6 = Sse2.AddSaturate(t5.AsByte(), t3.AsByte()); // abs(p0-q0)*2 + abs(p1-q1)/2 + Vector128 t4 = AbsVector128(p0, q0); // abs(p0 - q0) + Vector128 t5 = Vector128_.AddSaturate(t4, t4); // abs(p0 - q0) * 2 + Vector128 t6 = Vector128_.AddSaturate(t5.AsByte(), t3.AsByte()); // abs(p0-q0)*2 + abs(p1-q1)/2 - Vector128 t7 = Sse2.SubtractSaturate(t6, mthresh.AsByte()); // mask <= m_thresh + Vector128 t7 = Vector128_.SubtractSaturate(t6, mthresh.AsByte()); // mask <= m_thresh - return Sse2.CompareEqual(t7, Vector128.Zero); + return Vector128.Equals(t7, Vector128.Zero); } private static void Load16x4(ref byte r0, ref byte r8, int stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1) @@ -2304,8 +2304,8 @@ private static void Load16x4(ref byte r0, ref byte r8, int stride, out Vector128 // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - Load8x4(ref r0, (uint)stride, out Vector128 t1, out Vector128 t2); - Load8x4(ref r8, (uint)stride, out p0, out q1); + Load8x4Vector128(ref r0, (uint)stride, out Vector128 t1, out Vector128 t2); + Load8x4Vector128(ref r8, (uint)stride, out p0, out q1); // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 
31 21 11 01 @@ -2318,7 +2318,7 @@ private static void Load16x4(ref byte r0, ref byte r8, int stride, out Vector128 } // Reads 8 rows across a vertical edge. - private static void Load8x4(ref byte bRef, nuint stride, out Vector128 p, out Vector128 q) + private static void Load8x4Vector128(ref byte bRef, nuint stride, out Vector128 p, out Vector128 q) { // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00 // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10 @@ -2335,18 +2335,18 @@ private static void Load8x4(ref byte bRef, nuint stride, out Vector128 p, // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 - Vector128 b0 = Sse2.UnpackLow(a0.AsSByte(), a1.AsSByte()); - Vector128 b1 = Sse2.UnpackHigh(a0.AsSByte(), a1.AsSByte()); + Vector128 b0 = Vector128_.UnpackLow(a0.AsSByte(), a1.AsSByte()); + Vector128 b1 = Vector128_.UnpackHigh(a0.AsSByte(), a1.AsSByte()); // C0 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 // C1 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - Vector128 c0 = Sse2.UnpackLow(b0.AsInt16(), b1.AsInt16()); - Vector128 c1 = Sse2.UnpackHigh(b0.AsInt16(), b1.AsInt16()); + Vector128 c0 = Vector128_.UnpackLow(b0.AsInt16(), b1.AsInt16()); + Vector128 c1 = Vector128_.UnpackHigh(b0.AsInt16(), b1.AsInt16()); // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - p = Sse2.UnpackLow(c0.AsInt32(), c1.AsInt32()).AsByte(); - q = Sse2.UnpackHigh(c0.AsInt32(), c1.AsInt32()).AsByte(); + p = Vector128_.UnpackLow(c0.AsInt32(), c1.AsInt32()).AsByte(); + q = Vector128_.UnpackHigh(c0.AsInt32(), c1.AsInt32()).AsByte(); } // Transpose back and store @@ -2393,67 +2393,65 @@ private static void Store4x4(Vector128 x, ref byte dstRef, int stride) } [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 GetBaseDelta(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1) + private static Vector128 GetBaseDeltaVector128(Vector128 p1, 
Vector128 p0, Vector128 q0, Vector128 q1) { // Beware of addition order, for saturation! - Vector128 p1q1 = Sse2.SubtractSaturate(p1, q1); // p1 - q1 - Vector128 q0p0 = Sse2.SubtractSaturate(q0, p0); // q0 - p0 - Vector128 s1 = Sse2.AddSaturate(p1q1, q0p0); // p1 - q1 + 1 * (q0 - p0) - Vector128 s2 = Sse2.AddSaturate(q0p0, s1); // p1 - q1 + 2 * (q0 - p0) - Vector128 s3 = Sse2.AddSaturate(q0p0, s2); // p1 - q1 + 3 * (q0 - p0) - - return s3; + Vector128 p1q1 = Vector128_.SubtractSaturate(p1, q1); // p1 - q1 + Vector128 q0p0 = Vector128_.SubtractSaturate(q0, p0); // q0 - p0 + Vector128 s1 = Vector128_.AddSaturate(p1q1, q0p0); // p1 - q1 + 1 * (q0 - p0) + Vector128 s2 = Vector128_.AddSaturate(q0p0, s1); // p1 - q1 + 2 * (q0 - p0) + return Vector128_.AddSaturate(q0p0, s2); // p1 - q1 + 3 * (q0 - p0) } // Shift each byte of "x" by 3 bits while preserving by the sign bit. [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 SignedShift8b(Vector128 x) + private static Vector128 SignedShift8bVector128(Vector128 x) { - Vector128 low0 = Sse2.UnpackLow(Vector128.Zero, x); - Vector128 high0 = Sse2.UnpackHigh(Vector128.Zero, x); - Vector128 low1 = Sse2.ShiftRightArithmetic(low0.AsInt16(), 3 + 8); - Vector128 high1 = Sse2.ShiftRightArithmetic(high0.AsInt16(), 3 + 8); + Vector128 low0 = Vector128_.UnpackLow(Vector128.Zero, x); + Vector128 high0 = Vector128_.UnpackHigh(Vector128.Zero, x); + Vector128 low1 = Vector128.ShiftRightArithmetic(low0.AsInt16(), 3 + 8); + Vector128 high1 = Vector128.ShiftRightArithmetic(high0.AsInt16(), 3 + 8); - return Sse2.PackSignedSaturate(low1, high1); + return Vector128_.PackSignedSaturate(low1, high1); } [MethodImpl(InliningOptions.ShortMethod)] - private static void ComplexMask(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh, int ithresh, ref Vector128 mask) + private static void ComplexMaskVector128(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh, int ithresh, ref Vector128 mask) { Vector128 it 
= Vector128.Create((byte)ithresh); - Vector128 diff = Sse2.SubtractSaturate(mask, it); - Vector128 threshMask = Sse2.CompareEqual(diff, Vector128.Zero); - Vector128 filterMask = NeedsFilter(p1, p0, q0, q1, thresh); + Vector128 diff = Vector128_.SubtractSaturate(mask, it); + Vector128 threshMask = Vector128.Equals(diff, Vector128.Zero); + Vector128 filterMask = NeedsFilterVector128(p1, p0, q0, q1, thresh); - mask = Sse2.And(threshMask, filterMask); + mask = threshMask & filterMask; } // Updates values of 2 pixels at MB edge during complex filtering. // Update operations: // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)] // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip). - private static void Update2Pixels(ref Vector128 pi, ref Vector128 qi, Vector128 a0Low, Vector128 a0High) + private static void Update2PixelsVector128(ref Vector128 pi, ref Vector128 qi, Vector128 a0Low, Vector128 a0High) { Vector128 signBit = Vector128.Create((byte)0x80); - Vector128 a1Low = Sse2.ShiftRightArithmetic(a0Low, 7); - Vector128 a1High = Sse2.ShiftRightArithmetic(a0High, 7); - Vector128 delta = Sse2.PackSignedSaturate(a1Low, a1High); - pi = Sse2.AddSaturate(pi.AsSByte(), delta).AsByte(); - qi = Sse2.SubtractSaturate(qi.AsSByte(), delta).AsByte(); - pi = Sse2.Xor(pi, signBit.AsByte()); - qi = Sse2.Xor(qi, signBit.AsByte()); + Vector128 a1Low = Vector128.ShiftRightArithmetic(a0Low, 7); + Vector128 a1High = Vector128.ShiftRightArithmetic(a0High, 7); + Vector128 delta = Vector128_.PackSignedSaturate(a1Low, a1High); + pi = Vector128_.AddSaturate(pi.AsSByte(), delta).AsByte(); + qi = Vector128_.SubtractSaturate(qi.AsSByte(), delta).AsByte(); + pi ^= signBit.AsByte(); + qi ^= signBit.AsByte(); } [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 LoadUvEdge(ref byte uRef, ref byte vRef, int offset) + private static Vector128 LoadUvEdgeVector128(ref byte uRef, ref byte vRef, int offset) { Vector128 uVec = 
Vector128.Create(Unsafe.As(ref Unsafe.Add(ref uRef, (uint)offset)), 0); Vector128 vVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref vRef, (uint)offset)), 0); - return Sse2.UnpackLow(uVec, vVec).AsByte(); + return Vector128_.UnpackLow(uVec, vVec).AsByte(); } [MethodImpl(InliningOptions.ShortMethod)] - private static void StoreUv(Vector128 x, ref byte uRef, ref byte vRef, int offset) + private static void StoreUvVector128(Vector128 x, ref byte uRef, ref byte vRef, int offset) { Unsafe.As>(ref Unsafe.Add(ref uRef, (uint)offset)) = x.GetLower(); Unsafe.As>(ref Unsafe.Add(ref vRef, (uint)offset)) = x.GetUpper(); @@ -2461,8 +2459,8 @@ private static void StoreUv(Vector128 x, ref byte uRef, ref byte vRef, int // Compute abs(p - q) = subs(p - q) OR subs(q - p) [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 Abs(Vector128 p, Vector128 q) - => Sse2.Or(Sse2.SubtractSaturate(q, p), Sse2.SubtractSaturate(p, q)); + private static Vector128 AbsVector128(Vector128 p, Vector128 q) + => Vector128_.SubtractSaturate(q, p) | Vector128_.SubtractSaturate(p, q); [MethodImpl(InliningOptions.ShortMethod)] private static bool Hev(Span p, int offset, int step, int thresh) @@ -2511,5 +2509,5 @@ private static void Put8x8uv(byte value, Span dst) private static void Memset(Span dst, byte value, int startIdx, int count) => dst.Slice(startIdx, count).Fill(value); [MethodImpl(InliningOptions.ShortMethod)] - private static int Clamp255(int x) => x < 0 ? 0 : x > 255 ? 
255 : x; + private static int Clamp255(int x) => Numerics.Clamp(x, 0, 255); } From f0c6f4c908a1a7b026dbce9a5066aa83677fb1ac Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 2 Jun 2025 23:11:58 +1000 Subject: [PATCH 09/20] Port load/store --- .../Common/Helpers/Vector128Utilities.cs | 22 --- .../Formats/Webp/Lossy/LossyUtils.cs | 150 +++++++++--------- 2 files changed, 75 insertions(+), 97 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index a6359e6e91..9b0c1d68d8 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -23,28 +23,6 @@ namespace SixLabors.ImageSharp.Common.Helpers; internal static class Vector128_ #pragma warning restore SA1649 // File name should match first type name { - /// - /// Gets a value indicating whether shuffle operations are supported. - /// - public static bool SupportsShuffleNativeByte - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get - { - if (Vector128.IsHardwareAccelerated) - { - if (RuntimeInformation.ProcessArchitecture is Architecture.X86 or Architecture.X64) - { - return Ssse3.IsSupported; - } - - return true; - } - - return false; - } - } - /// /// Creates a new vector by selecting values from an input vector using the control. 
/// diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index b21e3c02ba..5b85e39987 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -1451,9 +1451,9 @@ public static void SimpleHFilter16(Span p, int offset, int stride, int thr // Beginning of p1 ref byte pRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), (uint)(offset - 2)); - Load16x4(ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1); + Load16x4Vector128(ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1); DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh); - Store16x4(p1, p0, q0, q1, ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride); + Store16x4Vector128(p1, p0, q0, q1, ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride); } else { @@ -1535,7 +1535,7 @@ public static void VFilter16(Span p, int offset, int stride, int thresh, i mask = Vector128.Max(mask, AbsVector128(q2, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); // Store. 
ref byte outputRef = ref MemoryMarshal.GetReference(p); @@ -1559,23 +1559,23 @@ public static void HFilter16(Span p, int offset, int stride, int thresh, i { ref byte pRef = ref MemoryMarshal.GetReference(p); ref byte bRef = ref Unsafe.Add(ref pRef, (uint)offset - 4); - Load16x4(ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); + Load16x4Vector128(ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); Vector128 mask = AbsVector128(p1, p0); mask = Sse2.Max(mask, AbsVector128(p3, p2)); mask = Sse2.Max(mask, AbsVector128(p2, p1)); - Load16x4(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); + Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); mask = Sse2.Max(mask, AbsVector128(q1, q0)); mask = Sse2.Max(mask, AbsVector128(q3, q2)); mask = Sse2.Max(mask, AbsVector128(q2, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); - Store16x4(p3, p2, p1, p0, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); - Store16x4(q0, q1, q2, q3, ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride); + Store16x4Vector128(p3, p2, p1, p0, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); + Store16x4Vector128(q0, q1, q2, q3, ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride); } else { @@ -1644,7 +1644,7 @@ public static void HFilter16i(Span p, int 
offset, int stride, int thresh, if (Sse2.IsSupported) { ref byte pRef = ref MemoryMarshal.GetReference(p); - Load16x4(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); + Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); Vector128 mask; for (int k = 3; k > 0; k--) @@ -1660,7 +1660,7 @@ public static void HFilter16i(Span p, int offset, int stride, int thresh, mask = Sse2.Max(mask, AbsVector128(p3, p2)); mask = Sse2.Max(mask, AbsVector128(p2, p1)); - Load16x4(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out p3, out p2, out Vector128 tmp1, out Vector128 tmp2); + Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out p3, out p2, out Vector128 tmp1, out Vector128 tmp2); mask = Sse2.Max(mask, AbsVector128(tmp1, tmp2)); mask = Sse2.Max(mask, AbsVector128(p3, p2)); @@ -1669,7 +1669,7 @@ public static void HFilter16i(Span p, int offset, int stride, int thresh, ComplexMaskVector128(p1, p0, p3, p2, thresh, ithresh, ref mask); DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); - Store16x4(p1, p0, p3, p2, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); + Store16x4Vector128(p1, p0, p3, p2, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); // Rotate samples. p1 = tmp1; @@ -1714,7 +1714,7 @@ public static void VFilter8(Span u, Span v, int offset, int stride, mask = Sse2.Max(mask, AbsVector128(q2, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); // Store. 
StoreUvVector128(p2, ref uRef, ref vRef, offset - (3 * stride)); @@ -1738,23 +1738,23 @@ public static void HFilter8(Span u, Span v, int offset, int stride, { ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - Load16x4(ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); + Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); Vector128 mask = AbsVector128(p1, p0); mask = Sse2.Max(mask, AbsVector128(p3, p2)); mask = Sse2.Max(mask, AbsVector128(p2, p1)); - Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); + Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); mask = Sse2.Max(mask, AbsVector128(q1, q0)); mask = Sse2.Max(mask, AbsVector128(q3, q2)); mask = Sse2.Max(mask, AbsVector128(q2, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); - Store16x4(p3, p2, p1, p0, ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride); - Store16x4(q0, q1, q2, q3, ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride); + Store16x4Vector128(p3, p2, p1, p0, ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride); + Store16x4Vector128(q0, q1, q2, q3, ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride); } else { @@ 
-1815,7 +1815,7 @@ public static void HFilter8i(Span u, Span v, int offset, int stride, { ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 t2, out Vector128 t1, out Vector128 p1, out Vector128 p0); + Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 t2, out Vector128 t1, out Vector128 p1, out Vector128 p0); Vector128 mask = AbsVector128(p1, p0); mask = Sse2.Max(mask, AbsVector128(t2, t1)); @@ -1824,7 +1824,7 @@ public static void HFilter8i(Span u, Span v, int offset, int stride, // Beginning of q0. offset += 4; - Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out t1, out t2); + Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out t1, out t2); mask = Sse2.Max(mask, AbsVector128(q1, q0)); mask = Sse2.Max(mask, AbsVector128(t2, t1)); @@ -1835,7 +1835,7 @@ public static void HFilter8i(Span u, Span v, int offset, int stride, // Beginning of p1. offset -= 2; - Store16x4(p1, p0, q0, q1, ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride); + Store16x4Vector128(p1, p0, q0, q1, ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride); } else { @@ -2070,7 +2070,7 @@ private static void DoFilter2Sse2(ref Vector128 p1, ref Vector128 p0 // Mask filter values we don't care about. a &= mask; - DoSimpleFilterSse2(ref p0, ref q0, a); + DoSimpleFilterVector128(ref p0, ref q0, a); // Flip sign. 
p0 ^= signBit; @@ -2081,7 +2081,7 @@ private static void DoFilter2Sse2(ref Vector128 p1, ref Vector128 p0 private static void DoFilter4Sse2(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, Vector128 mask, int tresh) { // Compute hev mask. - Vector128 notHev = GetNotHev(ref p1, ref p0, ref q0, ref q1, tresh); + Vector128 notHev = GetNotHevVector128(ref p1, ref p0, ref q0, ref q1, tresh); Vector128 signBit = Vector128.Create((byte)0x80); @@ -2121,75 +2121,75 @@ private static void DoFilter4Sse2(ref Vector128 p1, ref Vector128 p0 } // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) - private static void DoFilter6Sse2(ref Vector128 p2, ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, ref Vector128 q2, Vector128 mask, int tresh) + private static void DoFilter6Vector128(ref Vector128 p2, ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, ref Vector128 q2, Vector128 mask, int tresh) { // Compute hev mask. - Vector128 notHev = GetNotHev(ref p1, ref p0, ref q0, ref q1, tresh); + Vector128 notHev = GetNotHevVector128(ref p1, ref p0, ref q0, ref q1, tresh); // Convert to signed values. Vector128 signBit = Vector128.Create((byte)0x80); - p1 = Sse2.Xor(p1, signBit); - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); - q1 = Sse2.Xor(q1, signBit); - p2 = Sse2.Xor(p2, signBit); - q2 = Sse2.Xor(q2, signBit); + p1 ^= signBit; + p0 ^= signBit; + q0 ^= signBit; + q1 ^= signBit; + p2 ^= signBit; + q2 ^= signBit; Vector128 a = GetBaseDeltaVector128(p1.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1.AsSByte()); // Do simple filter on pixels with hev. - Vector128 m = Sse2.AndNot(notHev, mask); - Vector128 f = Sse2.And(a.AsByte(), m); - DoSimpleFilterSse2(ref p0, ref q0, f); + Vector128 m = ~notHev & mask; + Vector128 f = a.AsByte() & m; + DoSimpleFilterVector128(ref p0, ref q0, f); // Do strong filter on pixels with not hev. 
- m = Sse2.And(notHev, mask); - f = Sse2.And(a.AsByte(), m); - Vector128 flow = Sse2.UnpackLow(Vector128.Zero, f); - Vector128 fhigh = Sse2.UnpackHigh(Vector128.Zero, f); + m = notHev & mask; + f = a.AsByte() & m; + Vector128 flow = Vector128_.UnpackLow(Vector128.Zero, f); + Vector128 fhigh = Vector128_.UnpackHigh(Vector128.Zero, f); Vector128 nine = Vector128.Create((short)0x0900); - Vector128 f9Low = Sse2.MultiplyHigh(flow.AsInt16(), nine); // Filter (lo) * 9 - Vector128 f9High = Sse2.MultiplyHigh(fhigh.AsInt16(), nine); // Filter (hi) * 9 + Vector128 f9Low = Vector128_.MultiplyHigh(flow.AsInt16(), nine); // Filter (lo) * 9 + Vector128 f9High = Vector128_.MultiplyHigh(fhigh.AsInt16(), nine); // Filter (hi) * 9 Vector128 sixtyThree = Vector128.Create((short)63); - Vector128 a2Low = Sse2.Add(f9Low, sixtyThree); // Filter * 9 + 63 - Vector128 a2High = Sse2.Add(f9High, sixtyThree); // Filter * 9 + 63 + Vector128 a2Low = f9Low + sixtyThree; // Filter * 9 + 63 + Vector128 a2High = f9High + sixtyThree; // Filter * 9 + 63 - Vector128 a1Low = Sse2.Add(a2Low, f9Low); // Filter * 18 + 63 - Vector128 a1High = Sse2.Add(a2High, f9High); // // Filter * 18 + 63 + Vector128 a1Low = a2Low + f9Low; // Filter * 18 + 63 + Vector128 a1High = a2High + f9High; // // Filter * 18 + 63 - Vector128 a0Low = Sse2.Add(a1Low, f9Low); // Filter * 27 + 63 - Vector128 a0High = Sse2.Add(a1High, f9High); // Filter * 27 + 63 + Vector128 a0Low = a1Low + f9Low; // Filter * 27 + 63 + Vector128 a0High = a1High + f9High; // Filter * 27 + 63 Update2PixelsVector128(ref p2, ref q2, a2Low, a2High); Update2PixelsVector128(ref p1, ref q1, a1Low, a1High); Update2PixelsVector128(ref p0, ref q0, a0Low, a0High); } - private static void DoSimpleFilterSse2(ref Vector128 p0, ref Vector128 q0, Vector128 fl) + private static void DoSimpleFilterVector128(ref Vector128 p0, ref Vector128 q0, Vector128 fl) { - Vector128 v3 = Sse2.AddSaturate(fl.AsSByte(), Vector128.Create((byte)3).AsSByte()); - Vector128 v4 = 
Sse2.AddSaturate(fl.AsSByte(), Vector128.Create((byte)4).AsSByte()); + Vector128 v3 = Vector128_.AddSaturate(fl.AsSByte(), Vector128.Create((byte)3).AsSByte()); + Vector128 v4 = Vector128_.AddSaturate(fl.AsSByte(), Vector128.Create((byte)4).AsSByte()); v4 = SignedShift8bVector128(v4.AsByte()).AsSByte(); // v4 >> 3 v3 = SignedShift8bVector128(v3.AsByte()).AsSByte(); // v3 >> 3 - q0 = Sse2.SubtractSaturate(q0.AsSByte(), v4).AsByte(); // q0 -= v4 - p0 = Sse2.AddSaturate(p0.AsSByte(), v3).AsByte(); // p0 += v3 + q0 = Vector128_.SubtractSaturate(q0.AsSByte(), v4).AsByte(); // q0 -= v4 + p0 = Vector128_.AddSaturate(p0.AsSByte(), v3).AsByte(); // p0 += v3 } - private static Vector128 GetNotHev(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int hevThresh) + private static Vector128 GetNotHevVector128(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int hevThresh) { Vector128 t1 = AbsVector128(p1, p0); Vector128 t2 = AbsVector128(q1, q0); Vector128 h = Vector128.Create((byte)hevThresh); - Vector128 tMax = Sse2.Max(t1, t2); + Vector128 tMax = Vector128.Max(t1, t2); - Vector128 tMaxH = Sse2.SubtractSaturate(tMax, h); + Vector128 tMaxH = Vector128_.SubtractSaturate(tMax, h); // not_hev <= t1 && not_hev <= t2 - return Sse2.CompareEqual(tMaxH, Vector128.Zero); + return Vector128.Equals(tMaxH, Vector128.Zero); } // Applies filter on 4 pixels (p1, p0, q0 and q1) @@ -2287,7 +2287,7 @@ private static Vector128 NeedsFilterVector128(Vector128 p1, Vector12 return Vector128.Equals(t7, Vector128.Zero); } - private static void Load16x4(ref byte r0, ref byte r8, int stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1) + private static void Load16x4Vector128(ref byte r0, ref byte r8, int stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1) { // Assume the pixels around the edge (|) are numbered as follows // 00 01 | 02 03 @@ -2311,10 +2311,10 @@ private static void Load16x4(ref byte r0, ref 
byte r8, int stride, out Vector128 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - p1 = Sse2.UnpackLow(t1.AsInt64(), p0.AsInt64()).AsByte(); - p0 = Sse2.UnpackHigh(t1.AsInt64(), p0.AsInt64()).AsByte(); - q0 = Sse2.UnpackLow(t2.AsInt64(), q1.AsInt64()).AsByte(); - q1 = Sse2.UnpackHigh(t2.AsInt64(), q1.AsInt64()).AsByte(); + p1 = Vector128_.UnpackLow(t1.AsInt64(), p0.AsInt64()).AsByte(); + p0 = Vector128_.UnpackHigh(t1.AsInt64(), p0.AsInt64()).AsByte(); + q0 = Vector128_.UnpackLow(t2.AsInt64(), q1.AsInt64()).AsByte(); + q1 = Vector128_.UnpackHigh(t2.AsInt64(), q1.AsInt64()).AsByte(); } // Reads 8 rows across a vertical edge. @@ -2350,44 +2350,44 @@ private static void Load8x4Vector128(ref byte bRef, nuint stride, out Vector128< } // Transpose back and store - private static void Store16x4(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, ref byte r0Ref, ref byte r8Ref, int stride) + private static void Store16x4Vector128(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, ref byte r0Ref, ref byte r8Ref, int stride) { // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - Vector128 p0s = Sse2.UnpackLow(p1, p0); - Vector128 p1s = Sse2.UnpackHigh(p1, p0); + Vector128 p0s = Vector128_.UnpackLow(p1, p0); + Vector128 p1s = Vector128_.UnpackHigh(p1, p0); // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - Vector128 q0s = Sse2.UnpackLow(q0, q1); - Vector128 q1s = Sse2.UnpackHigh(q0, q1); + Vector128 q0s = Vector128_.UnpackLow(q0, q1); + Vector128 q1s = Vector128_.UnpackHigh(q0, q1); // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 Vector128 t1 = p0s; - p0s = Sse2.UnpackLow(t1.AsInt16(), q0s.AsInt16()).AsByte(); - q0s = Sse2.UnpackHigh(t1.AsInt16(), 
q0s.AsInt16()).AsByte(); + p0s = Vector128_.UnpackLow(t1.AsInt16(), q0s.AsInt16()).AsByte(); + q0s = Vector128_.UnpackHigh(t1.AsInt16(), q0s.AsInt16()).AsByte(); // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 t1 = p1s; - p1s = Sse2.UnpackLow(t1.AsInt16(), q1s.AsInt16()).AsByte(); - q1s = Sse2.UnpackHigh(t1.AsInt16(), q1s.AsInt16()).AsByte(); + p1s = Vector128_.UnpackLow(t1.AsInt16(), q1s.AsInt16()).AsByte(); + q1s = Vector128_.UnpackHigh(t1.AsInt16(), q1s.AsInt16()).AsByte(); - Store4x4(p0s, ref r0Ref, stride); - Store4x4(q0s, ref Unsafe.Add(ref r0Ref, 4 * (uint)stride), stride); + Store4x4Vector128(p0s, ref r0Ref, stride); + Store4x4Vector128(q0s, ref Unsafe.Add(ref r0Ref, 4 * (uint)stride), stride); - Store4x4(p1s, ref r8Ref, stride); - Store4x4(q1s, ref Unsafe.Add(ref r8Ref, 4 * (uint)stride), stride); + Store4x4Vector128(p1s, ref r8Ref, stride); + Store4x4Vector128(q1s, ref Unsafe.Add(ref r8Ref, 4 * (uint)stride), stride); } - private static void Store4x4(Vector128 x, ref byte dstRef, int stride) + private static void Store4x4Vector128(Vector128 x, ref byte dstRef, int stride) { int offset = 0; for (int i = 0; i < 4; i++) { - Unsafe.As(ref Unsafe.Add(ref dstRef, (uint)offset)) = Sse2.ConvertToInt32(x.AsInt32()); - x = Sse2.ShiftRightLogical128BitLane(x, 4); + Unsafe.As(ref Unsafe.Add(ref dstRef, (uint)offset)) = x.AsInt32().ToScalar(); + x = Vector128_.ShiftRightBytesInVector(x, 4); offset += stride; } } From f2e4257d6675a131d93067a14e9e0a5d36716079 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 3 Jun 2025 00:15:44 +1000 Subject: [PATCH 10/20] Port filters --- .../Common/Helpers/Vector128Utilities.cs | 67 +++++- .../Formats/Webp/Lossy/LossyUtils.cs | 202 +++++++++--------- 2 files changed, 167 insertions(+), 102 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 9b0c1d68d8..bfd237a2d7 100644 --- 
a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -3,7 +3,6 @@ using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.Wasm; @@ -23,6 +22,35 @@ namespace SixLabors.ImageSharp.Common.Helpers; internal static class Vector128_ #pragma warning restore SA1649 // File name should match first type name { + /// + /// Average packed unsigned 8-bit integers in and , and store the results. + /// + /// + /// The first vector containing packed unsigned 8-bit integers to average. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to average. + /// + /// + /// A vector containing the average of the packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Average(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.Average(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.FusedAddRoundedHalving(left, right); + } + + // Portable fallback: (a + b + 1) >> 1 + return (left + right + Vector128.Create((byte)1)) >> 1; + } + /// /// Creates a new vector by selecting values from an input vector using the control. /// @@ -444,6 +472,43 @@ public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector12 } } + /// + /// Horizontally add adjacent pairs of 16-bit integers in and , and + /// pack the signed 16-bit results. + /// + /// + /// The first vector containing packed signed 16-bit integers to add. + /// + /// + /// The second vector containing packed signed 16-bit integers to add. 
+ /// + /// + /// A vector containing the results of horizontally adding adjacent pairs of packed signed 16-bit integers + /// + public static Vector128 HorizontalAdd(Vector128 left, Vector128 right) + { + if (Ssse3.IsSupported) + { + return Ssse3.HorizontalAdd(left, right); + } + + if (AdvSimd.Arm64.IsSupported) + { + return AdvSimd.Arm64.AddPairwise(left, right); + } + + // Extract the low and high parts of the products shuffling them to form a result we can add together. + // Use out-of-bounds to zero out the unused lanes. + Vector128 even = Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8); + Vector128 odd = Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8); + Vector128 v0 = Vector128.Shuffle(right, even); + Vector128 v1 = Vector128.Shuffle(right, odd); + Vector128 v2 = Vector128.Shuffle(left, even); + Vector128 v3 = Vector128.Shuffle(left, odd); + + return v0 + v1 + v2 + v3; + } + /// /// Multiply the packed 16-bit integers in and , producing /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result. diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 5b85e39987..b8c4c9c312 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -1413,7 +1413,7 @@ public static void TransformDcuv(Span src, Span dst) // Simple In-loop filtering (Paragraph 15.2) public static void SimpleVFilter16(Span p, int offset, int stride, int thresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load. ref byte pRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), (uint)offset); @@ -1423,7 +1423,7 @@ public static void SimpleVFilter16(Span p, int offset, int stride, int thr Vector128 q0 = Unsafe.As>(ref pRef); Vector128 q1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)stride)); - DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh); + DoFilter2Vector128(ref p1, ref p0, ref q0, ref q1, thresh); // Store. 
ref byte outputRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), (uint)offset); @@ -1446,13 +1446,13 @@ public static void SimpleVFilter16(Span p, int offset, int stride, int thr public static void SimpleHFilter16(Span p, int offset, int stride, int thresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Beginning of p1 ref byte pRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), (uint)(offset - 2)); Load16x4Vector128(ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1); - DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh); + DoFilter2Vector128(ref p1, ref p0, ref q0, ref q1, thresh); Store16x4Vector128(p1, p0, q0, q1, ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride); } else @@ -1471,7 +1471,7 @@ public static void SimpleHFilter16(Span p, int offset, int stride, int thr public static void SimpleVFilter16i(Span p, int offset, int stride, int thresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { for (int k = 3; k > 0; k--) { @@ -1491,7 +1491,7 @@ public static void SimpleVFilter16i(Span p, int offset, int stride, int th public static void SimpleHFilter16i(Span p, int offset, int stride, int thresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { for (int k = 3; k > 0; k--) { @@ -1513,7 +1513,7 @@ public static void SimpleHFilter16i(Span p, int offset, int stride, int th [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter16(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); Vector128 t1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset - (4 * stride)))); @@ -1555,21 +1555,21 @@ public static void VFilter16(Span p, int offset, int stride, int thresh, i [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter16(Span p, int offset, int 
stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); ref byte bRef = ref Unsafe.Add(ref pRef, (uint)offset - 4); Load16x4Vector128(ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); Vector128 mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, p1)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); - mask = Sse2.Max(mask, AbsVector128(q1, q0)); - mask = Sse2.Max(mask, AbsVector128(q3, q2)); - mask = Sse2.Max(mask, AbsVector128(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(q3, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); @@ -1585,7 +1585,7 @@ public static void HFilter16(Span p, int offset, int stride, int thresh, i public static void VFilter16i(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); Vector128 p3 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)offset)); @@ -1600,22 +1600,22 @@ public static void VFilter16i(Span p, int offset, int stride, int thresh, offset += 4 * stride; Vector128 mask = AbsVector128(p0, p1); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, p1)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); 
p3 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)offset)); p2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + stride))); Vector128 tmp1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (2 * stride)))); Vector128 tmp2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (3 * stride)))); - mask = Sse2.Max(mask, AbsVector128(tmp1, tmp2)); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, tmp1)); + mask = Vector128.Max(mask, AbsVector128(tmp1, tmp2)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, tmp1)); // p3 and p2 are not just temporary variables here: they will be // re-used for next span. And q2/q3 will become p1/p0 accordingly. ComplexMaskVector128(p1, p0, p3, p2, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); + DoFilter4Vector128(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); // Store. ref byte outputRef = ref MemoryMarshal.GetReference(b); @@ -1641,7 +1641,7 @@ public static void VFilter16i(Span p, int offset, int stride, int thresh, public static void HFilter16i(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); @@ -1657,17 +1657,17 @@ public static void HFilter16i(Span p, int offset, int stride, int thresh, // Compute partial mask. 
mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, p1)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out p3, out p2, out Vector128 tmp1, out Vector128 tmp2); - mask = Sse2.Max(mask, AbsVector128(tmp1, tmp2)); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, tmp1)); + mask = Vector128.Max(mask, AbsVector128(tmp1, tmp2)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, tmp1)); ComplexMaskVector128(p1, p0, p3, p2, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); + DoFilter4Vector128(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); Store16x4Vector128(p1, p0, p3, p2, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); @@ -1690,7 +1690,7 @@ public static void HFilter16i(Span p, int offset, int stride, int thresh, [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter8(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load uv h-edges. 
ref byte uRef = ref MemoryMarshal.GetReference(u); @@ -1701,17 +1701,17 @@ public static void VFilter8(Span u, Span v, int offset, int stride, Vector128 p0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - stride); Vector128 mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(t1, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, p1)); + mask = Vector128.Max(mask, AbsVector128(t1, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); Vector128 q0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset); Vector128 q1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + stride); Vector128 q2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (2 * stride)); t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (3 * stride)); - mask = Sse2.Max(mask, AbsVector128(q1, q0)); - mask = Sse2.Max(mask, AbsVector128(t1, q2)); - mask = Sse2.Max(mask, AbsVector128(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t1, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); @@ -1734,21 +1734,21 @@ public static void VFilter8(Span u, Span v, int offset, int stride, [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter8(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); Vector128 mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(p3, p2)); - mask = Sse2.Max(mask, AbsVector128(p2, p1)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, 
AbsVector128(p2, p1)); Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); - mask = Sse2.Max(mask, AbsVector128(q1, q0)); - mask = Sse2.Max(mask, AbsVector128(q3, q2)); - mask = Sse2.Max(mask, AbsVector128(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(q3, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); @@ -1766,7 +1766,7 @@ public static void HFilter8(Span u, Span v, int offset, int stride, [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter8i(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load uv h-edges. ref byte uRef = ref MemoryMarshal.GetReference(u); @@ -1777,8 +1777,8 @@ public static void VFilter8i(Span u, Span v, int offset, int stride, Vector128 p0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 3)); Vector128 mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(t2, t1)); - mask = Sse2.Max(mask, AbsVector128(t1, p1)); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, p1)); offset += 4 * stride; @@ -1787,12 +1787,12 @@ public static void VFilter8i(Span u, Span v, int offset, int stride, t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 2)); t2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 3)); - mask = Sse2.Max(mask, AbsVector128(q1, q0)); - mask = Sse2.Max(mask, AbsVector128(t2, t1)); - mask = Sse2.Max(mask, AbsVector128(t1, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, q1)); 
ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); + DoFilter4Vector128(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); // Store. StoreUvVector128(p1, ref uRef, ref vRef, offset + (-2 * stride)); @@ -1811,27 +1811,27 @@ public static void VFilter8i(Span u, Span v, int offset, int stride, [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter8i(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 t2, out Vector128 t1, out Vector128 p1, out Vector128 p0); Vector128 mask = AbsVector128(p1, p0); - mask = Sse2.Max(mask, AbsVector128(t2, t1)); - mask = Sse2.Max(mask, AbsVector128(t1, p1)); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, p1)); // Beginning of q0. offset += 4; Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out t1, out t2); - mask = Sse2.Max(mask, AbsVector128(q1, q0)); - mask = Sse2.Max(mask, AbsVector128(t2, t1)); - mask = Sse2.Max(mask, AbsVector128(t1, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, q1)); ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); + DoFilter4Vector128(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); // Beginning of p1. 
offset -= 2; @@ -1847,7 +1847,7 @@ public static void HFilter8i(Span u, Span v, int offset, int stride, public static void Mean16x4(Span input, Span dc) { - if (Ssse3.IsSupported) + if (Vector128.IsHardwareAccelerated) { Vector128 mean16x4Mask = Vector128.Create((short)0x00ff).AsByte(); @@ -1855,23 +1855,23 @@ public static void Mean16x4(Span input, Span dc) Vector128 a1 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16))); Vector128 a2 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16))); Vector128 a3 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 3, 16))); - Vector128 b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte - Vector128 b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8); - Vector128 b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8); - Vector128 b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8); - Vector128 c0 = Sse2.And(a0, mean16x4Mask); // lo byte - Vector128 c1 = Sse2.And(a1, mean16x4Mask); - Vector128 c2 = Sse2.And(a2, mean16x4Mask); - Vector128 c3 = Sse2.And(a3, mean16x4Mask); - Vector128 d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32()); - Vector128 d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32()); - Vector128 d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32()); - Vector128 d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32()); - Vector128 e0 = Sse2.Add(d0, d1); - Vector128 e1 = Sse2.Add(d2, d3); - Vector128 f0 = Sse2.Add(e0, e1); - Vector128 hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16()); - Vector128 wide = Sse2.UnpackLow(hadd, Vector128.Zero).AsUInt32(); + Vector128 b0 = Vector128.ShiftRightLogical(a0.AsInt16(), 8); // hi byte + Vector128 b1 = Vector128.ShiftRightLogical(a1.AsInt16(), 8); + Vector128 b2 = Vector128.ShiftRightLogical(a2.AsInt16(), 8); + Vector128 b3 = Vector128.ShiftRightLogical(a3.AsInt16(), 8); + Vector128 c0 = a0 & mean16x4Mask; // lo byte + Vector128 c1 = a1 & mean16x4Mask; + Vector128 c2 = a2 & mean16x4Mask; + Vector128 c3 = a3 & mean16x4Mask; + Vector128 d0 = b0.AsInt32() + 
c0.AsInt32(); + Vector128 d1 = b1.AsInt32() + c1.AsInt32(); + Vector128 d2 = b2.AsInt32() + c2.AsInt32(); + Vector128 d3 = b3.AsInt32() + c3.AsInt32(); + Vector128 e0 = d0 + d1; + Vector128 e1 = d2 + d3; + Vector128 f0 = e0 + e1; + Vector128 hadd = Vector128_.HorizontalAdd(f0.AsInt16(), f0.AsInt16()); + Vector128 wide = Vector128_.UnpackLow(hadd, Vector128.Zero).AsUInt32(); ref uint outputRef = ref MemoryMarshal.GetReference(dc); Unsafe.As>(ref outputRef) = wide; @@ -2052,7 +2052,7 @@ private static void DoFilter2(Span p, int offset, int step) } // Applies filter on 2 pixels (p0 and q0) - private static void DoFilter2Sse2(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int thresh) + private static void DoFilter2Vector128(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int thresh) { Vector128 signBit = Vector128.Create((byte)0x80); @@ -2078,7 +2078,7 @@ private static void DoFilter2Sse2(ref Vector128 p1, ref Vector128 p0 } // Applies filter on 4 pixels (p1, p0, q0 and q1) - private static void DoFilter4Sse2(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, Vector128 mask, int tresh) + private static void DoFilter4Vector128(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, Vector128 mask, int tresh) { // Compute hev mask. Vector128 notHev = GetNotHevVector128(ref p1, ref p0, ref q0, ref q1, tresh); @@ -2086,38 +2086,38 @@ private static void DoFilter4Sse2(ref Vector128 p1, ref Vector128 p0 Vector128 signBit = Vector128.Create((byte)0x80); // Convert to signed values. 
- p1 = Sse2.Xor(p1, signBit); - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); - q1 = Sse2.Xor(q1, signBit); - - Vector128 t1 = Sse2.SubtractSaturate(p1.AsSByte(), q1.AsSByte()); // p1 - q1 - t1 = Sse2.AndNot(notHev, t1.AsByte()).AsSByte(); // hev(p1 - q1) - Vector128 t2 = Sse2.SubtractSaturate(q0.AsSByte(), p0.AsSByte()); // q0 - p0 - t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0) - t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0) - t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0) - t1 = Sse2.And(t1.AsByte(), mask).AsSByte(); // mask filter values we don't care about. - - t2 = Sse2.AddSaturate(t1, Vector128.Create((byte)3).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 3 - Vector128 t3 = Sse2.AddSaturate(t1, Vector128.Create((byte)4).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 4 + p1 ^= signBit; + p0 ^= signBit; + q0 ^= signBit; + q1 ^= signBit; + + Vector128 t1 = Vector128_.SubtractSaturate(p1.AsSByte(), q1.AsSByte()); // p1 - q1 + t1 = (~notHev & t1.AsByte()).AsSByte(); // hev(p1 - q1) + Vector128 t2 = Vector128_.SubtractSaturate(q0.AsSByte(), p0.AsSByte()); // q0 - p0 + t1 = Vector128_.AddSaturate(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0) + t1 = Vector128_.AddSaturate(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0) + t1 = Vector128_.AddSaturate(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0) + t1 = (t1.AsByte() & mask).AsSByte(); // mask filter values we don't care about. 
+ + t2 = Vector128_.AddSaturate(t1, Vector128.Create((byte)3).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 3 + Vector128 t3 = Vector128_.AddSaturate(t1, Vector128.Create((byte)4).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 4 t2 = SignedShift8bVector128(t2.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 t3 = SignedShift8bVector128(t3.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 - p0 = Sse2.AddSaturate(p0.AsSByte(), t2).AsByte(); // p0 += t2 - q0 = Sse2.SubtractSaturate(q0.AsSByte(), t3).AsByte(); // q0 -= t3 - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); + p0 = Vector128_.AddSaturate(p0.AsSByte(), t2).AsByte(); // p0 += t2 + q0 = Vector128_.SubtractSaturate(q0.AsSByte(), t3).AsByte(); // q0 -= t3 + p0 ^= signBit; + q0 ^= signBit; // This is equivalent to signed (a + 1) >> 1 calculation. - t2 = Sse2.Add(t3, signBit.AsSByte()); - t3 = Sse2.Average(t2.AsByte(), Vector128.Zero).AsSByte(); - t3 = Sse2.Subtract(t3, Vector128.Create((sbyte)64)); - - t3 = Sse2.And(notHev, t3.AsByte()).AsSByte(); // if !hev - q1 = Sse2.SubtractSaturate(q1.AsSByte(), t3).AsByte(); // q1 -= t3 - p1 = Sse2.AddSaturate(p1.AsSByte(), t3).AsByte(); // p1 += t3 - p1 = Sse2.Xor(p1.AsByte(), signBit); - q1 = Sse2.Xor(q1.AsByte(), signBit); + t2 = t3 + signBit.AsSByte(); + t3 = Vector128_.Average(t2.AsByte(), Vector128.Zero).AsSByte(); + t3 -= Vector128.Create((sbyte)64); + + t3 = (notHev & t3.AsByte()).AsSByte(); // if !hev + q1 = Vector128_.SubtractSaturate(q1.AsSByte(), t3).AsByte(); // q1 -= t3 + p1 = Vector128_.AddSaturate(p1.AsSByte(), t3).AsByte(); // p1 += t3 + p1 = p1.AsByte() ^ signBit; + q1 = q1.AsByte() ^ signBit; } // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) From 217450eb00a76bd07ebbd004f3c677888790d085 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 3 Jun 2025 18:40:47 +1000 Subject: [PATCH 11/20] Complete LossyUtils port --- .../Formats/Webp/Lossy/LossyUtils.cs | 248 +++++------------- .../Formats/Webp/Lossy/Vp8Encoding.cs | 8 
+- 2 files changed, 75 insertions(+), 181 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index b8c4c9c312..c65861c4b5 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -5,8 +5,6 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.Arm; -using System.Runtime.Intrinsics.X86; using SixLabors.ImageSharp.Common.Helpers; // ReSharper disable InconsistentNaming @@ -18,7 +16,7 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8_Sse16x16(Span a, Span b) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { return Vp8_Sse16xN_Vector256(a, b, 4); } @@ -28,11 +26,6 @@ public static int Vp8_Sse16x16(Span a, Span b) return Vp8_16xN_Vector128(a, b, 8); } - if (AdvSimd.IsSupported) - { - return Vp8_Sse16x16_Neon(a, b); - } - return Vp8_SseNxN(a, b, 16, 16); } @@ -50,11 +43,6 @@ public static int Vp8_Sse16x8(Span a, Span b) return Vp8_16xN_Vector128(a, b, 4); } - if (AdvSimd.IsSupported) - { - return Vp8_Sse16x8_Neon(a, b); - } - return Vp8_SseNxN(a, b, 16, 8); } @@ -62,7 +50,7 @@ public static int Vp8_Sse16x8(Span a, Span b) [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8_Sse4x4(Span a, Span b) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { // Load values. ref byte aRef = ref MemoryMarshal.GetReference(a); @@ -123,19 +111,14 @@ public static int Vp8_Sse4x4(Span a, Span b) // subtract, square and accumulate. 
Vector128 d0 = Vector128_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16()); - Vector128 d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16()); - Vector128 e0 = Sse2.MultiplyAddAdjacent(d0, d0); - Vector128 e1 = Sse2.MultiplyAddAdjacent(d1, d1); - Vector128 sum = Sse2.Add(e0, e1); + Vector128 d1 = Vector128_.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16()); + Vector128 e0 = Vector128_.MultiplyAddAdjacent(d0, d0); + Vector128 e1 = Vector128_.MultiplyAddAdjacent(d1, d1); + Vector128 sum = e0 + e1; return ReduceSumVector128(sum); } - if (AdvSimd.IsSupported) - { - return Vp8_Sse4x4_Neon(a, b); - } - return Vp8_SseNxN(a, b, 4, 4); } @@ -216,95 +199,6 @@ private static int Vp8_Sse16xN_Vector256(Span a, Span b, int numPair return ReduceSumVector256(sum); } - [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe int Vp8_Sse16x16_Neon(Span a, Span b) - { - Vector128 sum = Vector128.Zero; - fixed (byte* aRef = &MemoryMarshal.GetReference(a)) - { - fixed (byte* bRef = &MemoryMarshal.GetReference(b)) - { - for (int y = 0; y < 16; y++) - { - sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum); - } - } - } - - return (int)Vector128.Sum(sum); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe int Vp8_Sse16x8_Neon(Span a, Span b) - { - Vector128 sum = Vector128.Zero; - fixed (byte* aRef = &MemoryMarshal.GetReference(a)) - { - fixed (byte* bRef = &MemoryMarshal.GetReference(b)) - { - for (int y = 0; y < 8; y++) - { - sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum); - } - } - } - - return (int)Vector128.Sum(sum); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static int Vp8_Sse4x4_Neon(Span a, Span b) - { - Vector128 a0 = Load4x4Neon(a).AsByte(); - Vector128 b0 = Load4x4Neon(b).AsByte(); - Vector128 absDiff = AdvSimd.AbsoluteDifference(a0, b0); - Vector64 absDiffLower = absDiff.GetLower().AsByte(); - Vector64 absDiffUpper = 
absDiff.GetUpper().AsByte(); - Vector128 prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); - Vector128 prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper); - - // pair-wise adds and widen. - Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); - Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); - - Vector128 sum = AdvSimd.Add(sum1, sum2); - - return (int)Vector128.Sum(sum); - } - - // Load all 4x4 pixels into a single Vector128 - [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe Vector128 Load4x4Neon(Span src) - { - fixed (byte* srcRef = &MemoryMarshal.GetReference(src)) - { - Vector128 output = Vector128.Zero; - output = AdvSimd.LoadAndInsertScalar(output, 0, (uint*)srcRef); - output = AdvSimd.LoadAndInsertScalar(output, 1, (uint*)(srcRef + WebpConstants.Bps)); - output = AdvSimd.LoadAndInsertScalar(output, 2, (uint*)(srcRef + (WebpConstants.Bps * 2))); - output = AdvSimd.LoadAndInsertScalar(output, 3, (uint*)(srcRef + (WebpConstants.Bps * 3))); - return output; - } - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe Vector128 AccumulateSSE16Neon(byte* a, byte* b, Vector128 sum) - { - Vector128 a0 = AdvSimd.LoadVector128(a); - Vector128 b0 = AdvSimd.LoadVector128(b); - - Vector128 absDiff = AdvSimd.AbsoluteDifference(a0, b0); - Vector64 absDiffLower = absDiff.GetLower(); - Vector64 absDiffUpper = absDiff.GetUpper(); - Vector128 prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); - Vector128 prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper); - - // pair-wise adds and widen. 
- Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); - Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); - return AdvSimd.Add(sum, AdvSimd.Add(sum1, sum2)); - } - [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 SubtractAndAccumulateVector128(Vector128 a, Vector128 b) { @@ -330,7 +224,7 @@ private static Vector256 SubtractAndAccumulateVector256(Vector256 a, // Take abs(a-b) in 8b. Vector256 ab = Vector256_.SubtractSaturate(a, b); Vector256 ba = Vector256_.SubtractSaturate(b, a); - Vector256 absAb = Avx2.Or(ab, ba); + Vector256 absAb = ab | ba; // Zero-extend to 16b. Vector256 c0 = Vector256_.UnpackLow(absAb, Vector256.Zero); @@ -948,7 +842,7 @@ public static int TTransformVector128(Span inputA, Span inputB, Span // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3); + Vp8Transpose_2_4x4_16bVector128(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -995,7 +889,7 @@ public static int TTransformVector128(Span inputA, Span inputB, Span // Transpose two 4x4 16b matrices horizontally stored in registers. [MethodImpl(InliningOptions.ShortMethod)] - public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) + public static void Vp8Transpose_2_4x4_16bVector128(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) { // Transpose the two 4x4. // a00 a01 a02 a03 b00 b01 b02 b03 @@ -1110,7 +1004,7 @@ public static void TransformTwo(Span src, Span dst, Span scrat Vector128 tmp3 = a.AsInt16() - d; // Transpose the two 4x4. 
- Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. @@ -1143,7 +1037,7 @@ public static void TransformTwo(Span src, Span dst, Span scrat Vector128 shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3); // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'dst' and store. // Load the reference(s). @@ -1189,7 +1083,7 @@ public static void TransformTwo(Span src, Span dst, Span scrat public static void TransformOne(Span src, Span dst, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load and concatenate the transform coefficients. ref short srcRef = ref MemoryMarshal.GetReference(src); @@ -1205,102 +1099,102 @@ public static void TransformOne(Span src, Span dst, Span scrat // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. 
- Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + Vector128 a = in0.AsInt16() + in2.AsInt16(); + Vector128 b = in0.AsInt16() - in2.AsInt16(); Vector128 k1 = Vector128.Create((short)20091); Vector128 k2 = Vector128.Create((short)-30068); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3.AsInt16(), c4); + Vector128 c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2); + Vector128 c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1); + Vector128 c3 = in1.AsInt16() - in3.AsInt16(); + Vector128 c4 = c1 - c2; + Vector128 c = c3.AsInt16() + c4; // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); + Vector128 d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1); + Vector128 d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2); + Vector128 d3 = in1.AsInt16() + in3.AsInt16(); + Vector128 d4 = d1 + d2; + Vector128 d = d3 + d4; // Second pass. - Vector128 tmp0 = Sse2.Add(a.AsInt16(), d); - Vector128 tmp1 = Sse2.Add(b.AsInt16(), c); - Vector128 tmp2 = Sse2.Subtract(b.AsInt16(), c); - Vector128 tmp3 = Sse2.Subtract(a.AsInt16(), d); + Vector128 tmp0 = a.AsInt16() + d; + Vector128 tmp1 = b.AsInt16() + c; + Vector128 tmp2 = b.AsInt16() - c; + Vector128 tmp3 = a.AsInt16() - d; // Transpose the two 4x4. 
- Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4)); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); + Vector128 dc = t0.AsInt16() + Vector128.Create((short)4); + a = dc + t2.AsInt16(); + b = dc - t2.AsInt16(); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); + c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2); + c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1); + c3 = t1.AsInt16() - t3.AsInt16(); + c4 = c1 - c2; + c = c3 + c4; // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); + d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1); + d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2); + d3 = t1.AsInt16() + t3.AsInt16(); + d4 = d1 + d2; + d = d3 + d4; // Second pass. 
- tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + tmp0 = a + d; + tmp1 = b + c; + tmp2 = b - c; + tmp3 = a - d; + Vector128 shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3); // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'dst' and store. // Load the reference(s). // Load four bytes/pixels per line. ref byte dstRef = ref MemoryMarshal.GetReference(dst); - Vector128 dst0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref dstRef)).AsByte(); - Vector128 dst1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps))).AsByte(); - Vector128 dst2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 2))).AsByte(); - Vector128 dst3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3))).AsByte(); + Vector128 dst0 = Vector128.CreateScalar(Unsafe.As(ref dstRef)).AsByte(); + Vector128 dst1 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps))).AsByte(); + Vector128 dst2 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 2))).AsByte(); + Vector128 dst3 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3))).AsByte(); // Convert to 16b. 
- dst0 = Sse2.UnpackLow(dst0, Vector128.Zero); - dst1 = Sse2.UnpackLow(dst1, Vector128.Zero); - dst2 = Sse2.UnpackLow(dst2, Vector128.Zero); - dst3 = Sse2.UnpackLow(dst3, Vector128.Zero); + dst0 = Vector128_.UnpackLow(dst0, Vector128.Zero); + dst1 = Vector128_.UnpackLow(dst1, Vector128.Zero); + dst2 = Vector128_.UnpackLow(dst2, Vector128.Zero); + dst3 = Vector128_.UnpackLow(dst3, Vector128.Zero); // Add the inverse transform(s). - dst0 = Sse2.Add(dst0.AsInt16(), t0.AsInt16()).AsByte(); - dst1 = Sse2.Add(dst1.AsInt16(), t1.AsInt16()).AsByte(); - dst2 = Sse2.Add(dst2.AsInt16(), t2.AsInt16()).AsByte(); - dst3 = Sse2.Add(dst3.AsInt16(), t3.AsInt16()).AsByte(); + dst0 = (dst0.AsInt16() + t0.AsInt16()).AsByte(); + dst1 = (dst1.AsInt16() + t1.AsInt16()).AsByte(); + dst2 = (dst2.AsInt16() + t2.AsInt16()).AsByte(); + dst3 = (dst3.AsInt16() + t3.AsInt16()).AsByte(); // Unsigned saturate to 8b. - dst0 = Sse2.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); - dst1 = Sse2.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); - dst2 = Sse2.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); - dst3 = Sse2.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); + dst0 = Vector128_.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); + dst1 = Vector128_.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); + dst2 = Vector128_.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); + dst3 = Vector128_.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); // Store the results. // Store four bytes/pixels per line. 
ref byte outputRef = ref MemoryMarshal.GetReference(dst); - int output0 = Sse2.ConvertToInt32(dst0.AsInt32()); - int output1 = Sse2.ConvertToInt32(dst1.AsInt32()); - int output2 = Sse2.ConvertToInt32(dst2.AsInt32()); - int output3 = Sse2.ConvertToInt32(dst3.AsInt32()); + int output0 = dst0.AsInt32().ToScalar(); + int output1 = dst1.AsInt32().ToScalar(); + int output2 = dst2.AsInt32().ToScalar(); + int output3 = dst3.AsInt32().ToScalar(); Unsafe.As(ref outputRef) = output0; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index c645816d4b..fd8d48dd00 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -131,14 +131,14 @@ public static void ITransformTwo(Span reference, Span input, Span tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. - LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. - LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'ref' and store. // Load the reference(s). 
@@ -210,14 +210,14 @@ public static void ITransformOne(Span reference, Span input, Span tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. - LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. - LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'ref' and store. // Load the reference(s). From 85d6a2b1efb33181e2acf705a5ad14960078fe5a Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 3 Jun 2025 19:25:31 +1000 Subject: [PATCH 12/20] Port Vp8Encoding --- .../Common/Helpers/Vector128Utilities.cs | 27 ++ .../Formats/Webp/Lossy/Vp8Encoding.cs | 337 +++++++++--------- 2 files changed, 192 insertions(+), 172 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index bfd237a2d7..50eeb8e0a7 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -99,6 +99,33 @@ public static Vector128 ShuffleNative(Vector128 vector, [ConstantExpec return Vector128.Shuffle(vector, indices); } + /// + /// Shuffle 16-bit integers in the high 64 bits of using the control in . + /// Store the results in the high 64 bits of the destination, with the low 64 bits being copied from . 
+ /// + /// The input vector containing packed 16-bit integers to shuffle. + /// The shuffle control byte. + /// + /// A vector containing the shuffled 16-bit integers in the high 64 bits, with the low 64 bits copied from . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ShuffleHigh(Vector128 value, [ConstantExpected] byte control) + { + if (Sse2.IsSupported) + { + return Sse2.ShuffleHigh(value, control); + } + + // Don't use InverseMMShuffle here as we want to avoid the cast. + Vector64 indices = Vector64.Create( + (short)(control & 0x3), + (short)((control >> 2) & 0x3), + (short)((control >> 4) & 0x3), + (short)((control >> 6) & 0x3)); + + return Vector128.Create(value.GetLower(), Vector64.Shuffle(value.GetUpper(), indices)); + } + /// /// Creates a new vector by selecting values from an input vector using a set of indices. /// diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index fd8d48dd00..72420a0947 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -2,10 +2,11 @@ // Licensed under the Six Labors Split License. using System.Buffers.Binary; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Webp.Lossy; @@ -78,7 +79,7 @@ private static byte[] GetClip1() // Does two inverse transforms. 
public static void ITransformTwo(Span reference, Span input, Span dst, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: @@ -116,10 +117,10 @@ public static void ITransformTwo(Span reference, Span input, Span inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); Vector128 inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); - in0 = Sse2.UnpackLow(in0, inb0); - in1 = Sse2.UnpackLow(in1, inb1); - in2 = Sse2.UnpackLow(in2, inb2); - in3 = Sse2.UnpackLow(in3, inb3); + in0 = Vector128_.UnpackLow(in0, inb0); + in1 = Vector128_.UnpackLow(in1, inb1); + in2 = Vector128_.UnpackLow(in2, inb2); + in3 = Vector128_.UnpackLow(in3, inb3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -128,49 +129,45 @@ public static void ITransformTwo(Span reference, Span input, Span tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); + InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); + InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'ref' and store. // Load the reference(s). 
- Vector128 ref0 = Vector128.Zero; - Vector128 ref1 = Vector128.Zero; - Vector128 ref2 = Vector128.Zero; - Vector128 ref3 = Vector128.Zero; ref byte referenceRef = ref MemoryMarshal.GetReference(reference); // Load eight bytes/pixels per line. - ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); - ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); - ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); - ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); + Vector128 ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); + Vector128 ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); + Vector128 ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); + Vector128 ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); // Convert to 16b. - ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); - ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); - ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); - ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + ref0 = Vector128_.UnpackLow(ref0, Vector128.Zero); + ref1 = Vector128_.UnpackLow(ref1, Vector128.Zero); + ref2 = Vector128_.UnpackLow(ref2, Vector128.Zero); + ref3 = Vector128_.UnpackLow(ref3, Vector128.Zero); // Add the inverse transform(s). 
- Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); - Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); - Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); - Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); + Vector128 ref0InvAdded = ref0.AsInt16() + t0.AsInt16(); + Vector128 ref1InvAdded = ref1.AsInt16() + t1.AsInt16(); + Vector128 ref2InvAdded = ref2.AsInt16() + t2.AsInt16(); + Vector128 ref3InvAdded = ref3.AsInt16() + t3.AsInt16(); // Unsigned saturate to 8b. - ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); - ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); - ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); - ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); + ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); // Store eight bytes/pixels per line. ref byte outputRef = ref MemoryMarshal.GetReference(dst); @@ -188,7 +185,7 @@ public static void ITransformTwo(Span reference, Span input, Span reference, Span input, Span dst, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load and concatenate the transform coefficients (we'll do two inverse // transforms in parallel). In the case of only one inverse transform, the @@ -207,58 +204,54 @@ public static void ITransformOne(Span reference, Span input, Span tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); + InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. 
LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); + InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'ref' and store. // Load the reference(s). - Vector128 ref0 = Vector128.Zero; - Vector128 ref1 = Vector128.Zero; - Vector128 ref2 = Vector128.Zero; - Vector128 ref3 = Vector128.Zero; ref byte referenceRef = ref MemoryMarshal.GetReference(reference); // Load four bytes/pixels per line. - ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); - ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); - ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); - ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); + Vector128 ref0 = Vector128.CreateScalar(Unsafe.As(ref referenceRef)).AsByte(); + Vector128 ref1 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); + Vector128 ref2 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); + Vector128 ref3 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); // Convert to 16b. 
- ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); - ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); - ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); - ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + ref0 = Vector128_.UnpackLow(ref0, Vector128.Zero); + ref1 = Vector128_.UnpackLow(ref1, Vector128.Zero); + ref2 = Vector128_.UnpackLow(ref2, Vector128.Zero); + ref3 = Vector128_.UnpackLow(ref3, Vector128.Zero); // Add the inverse transform(s). - Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); - Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); - Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); - Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); + Vector128 ref0InvAdded = ref0.AsInt16() + t0.AsInt16(); + Vector128 ref1InvAdded = ref1.AsInt16() + t1.AsInt16(); + Vector128 ref2InvAdded = ref2.AsInt16() + t2.AsInt16(); + Vector128 ref3InvAdded = ref3.AsInt16() + t3.AsInt16(); // Unsigned saturate to 8b. - ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); - ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); - ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); - ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); + ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); // Unsigned saturate to 8b. ref byte outputRef = ref MemoryMarshal.GetReference(dst); // Store four bytes/pixels per line. 
- int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); - int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); - int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); - int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); + int output0 = ref0.AsInt32().ToScalar(); + int output1 = ref1.AsInt32().ToScalar(); + int output2 = ref2.AsInt32().ToScalar(); + int output3 = ref3.AsInt32().ToScalar(); Unsafe.As(ref outputRef) = output0; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; @@ -302,72 +295,72 @@ public static void ITransformOne(Span reference, Span input, Span in0, Vector128 in2, Vector128 in1, Vector128 in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3) + private static void InverseTransformVerticalPassVector128(Vector128 in0, Vector128 in2, Vector128 in1, Vector128 in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3) { - Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + Vector128 a = in0.AsInt16() + in2.AsInt16(); + Vector128 b = in0.AsInt16() - in2.AsInt16(); Vector128 k1 = Vector128.Create((short)20091).AsInt16(); Vector128 k2 = Vector128.Create((short)-30068).AsInt16(); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3, c4); + Vector128 c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2); + Vector128 c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1); + Vector128 c3 = in1.AsInt16() - in3.AsInt16(); + Vector128 c4 = c1 - c2; + Vector128 c = c3 + c4; // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2); - Vector128 d3 = 
Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); + Vector128 d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1); + Vector128 d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2); + Vector128 d3 = in1.AsInt16() + in3.AsInt16(); + Vector128 d4 = d1 + d2; + Vector128 d = d3 + d4; // Second pass. - tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); + tmp0 = a + d; + tmp1 = b + c; + tmp2 = b - c; + tmp3 = a - d; } - private static void InverseTransformHorizontalPass(Vector128 t0, Vector128 t2, Vector128 t1, Vector128 t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3) + private static void InverseTransformHorizontalPassVector128(Vector128 t0, Vector128 t2, Vector128 t1, Vector128 t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3) { - Vector128 dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4)); - Vector128 a = Sse2.Add(dc, t2.AsInt16()); - Vector128 b = Sse2.Subtract(dc, t2.AsInt16()); + Vector128 dc = t0.AsInt16() + Vector128.Create((short)4); + Vector128 a = dc + t2.AsInt16(); + Vector128 b = dc - t2.AsInt16(); Vector128 k1 = Vector128.Create((short)20091).AsInt16(); Vector128 k2 = Vector128.Create((short)-30068).AsInt16(); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - Vector128 c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2); - Vector128 c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1); - Vector128 c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3, c4); + Vector128 c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2); + Vector128 c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1); + Vector128 c3 = t1.AsInt16() - t3.AsInt16(); + Vector128 c4 = c1 - c2; + Vector128 c = c3 + c4; // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - Vector128 d1 = Sse2.MultiplyHigh(t1.AsInt16(), 
k1); - Vector128 d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2); - Vector128 d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); + Vector128 d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1); + Vector128 d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2); + Vector128 d3 = t1.AsInt16() + t3.AsInt16(); + Vector128 d4 = d1 + d2; + Vector128 d = d3 + d4; // Second pass. - Vector128 tmp0 = Sse2.Add(a, d); - Vector128 tmp1 = Sse2.Add(b, c); - Vector128 tmp2 = Sse2.Subtract(b, c); - Vector128 tmp3 = Sse2.Subtract(a, d); - shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + Vector128 tmp0 = a + d; + Vector128 tmp1 = b + c; + Vector128 tmp2 = b - c; + Vector128 tmp3 = a - d; + shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3); + shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3); + shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3); + shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3); } public static void FTransform2(Span src, Span reference, Span output, Span output2, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte srcRef = ref MemoryMarshal.GetReference(src); ref byte referenceRef = ref MemoryMarshal.GetReference(reference); @@ -385,38 +378,38 @@ public static void FTransform2(Span src, Span reference, Span Vector128 ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0); // Convert both to 16 bit. 
- Vector128 srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128.Zero); - Vector128 srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128.Zero); - Vector128 srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128.Zero); - Vector128 srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128.Zero); - Vector128 refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128.Zero); - Vector128 refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128.Zero); - Vector128 refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128.Zero); - Vector128 refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128.Zero); + Vector128 srcLow0 = Vector128_.UnpackLow(src0.AsByte(), Vector128.Zero); + Vector128 srcLow1 = Vector128_.UnpackLow(src1.AsByte(), Vector128.Zero); + Vector128 srcLow2 = Vector128_.UnpackLow(src2.AsByte(), Vector128.Zero); + Vector128 srcLow3 = Vector128_.UnpackLow(src3.AsByte(), Vector128.Zero); + Vector128 refLow0 = Vector128_.UnpackLow(ref0.AsByte(), Vector128.Zero); + Vector128 refLow1 = Vector128_.UnpackLow(ref1.AsByte(), Vector128.Zero); + Vector128 refLow2 = Vector128_.UnpackLow(ref2.AsByte(), Vector128.Zero); + Vector128 refLow3 = Vector128_.UnpackLow(ref3.AsByte(), Vector128.Zero); // Compute difference. -> 00 01 02 03 00' 01' 02' 03' - Vector128 diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16()); - Vector128 diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16()); - Vector128 diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16()); - Vector128 diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16()); + Vector128 diff0 = srcLow0.AsInt16() - refLow0.AsInt16(); + Vector128 diff1 = srcLow1.AsInt16() - refLow1.AsInt16(); + Vector128 diff2 = srcLow2.AsInt16() - refLow2.AsInt16(); + Vector128 diff3 = srcLow3.AsInt16() - refLow3.AsInt16(); // Unpack and shuffle. 
// 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 - Vector128 shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32()); - Vector128 shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32()); - Vector128 shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32()); - Vector128 shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32()); + Vector128 shuf01l = Vector128_.UnpackLow(diff0.AsInt32(), diff1.AsInt32()); + Vector128 shuf23l = Vector128_.UnpackLow(diff2.AsInt32(), diff3.AsInt32()); + Vector128 shuf01h = Vector128_.UnpackHigh(diff0.AsInt32(), diff1.AsInt32()); + Vector128 shuf23h = Vector128_.UnpackHigh(diff2.AsInt32(), diff3.AsInt32()); // First pass. - FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128 v01l, out Vector128 v32l); - FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128 v01h, out Vector128 v32h); + FTransformPass1Vector128(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128 v01l, out Vector128 v32l); + FTransformPass1Vector128(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128 v01h, out Vector128 v32h); // Second pass. - FTransformPass2SSE2(v01l, v32l, output); - FTransformPass2SSE2(v01h, v32h, output2); + FTransformPass2Vector128(v01l, v32l, output); + FTransformPass2Vector128(v01h, v32h, output2); } else { @@ -427,7 +420,7 @@ public static void FTransform2(Span src, Span reference, Span public static void FTransform(Span src, Span reference, Span output, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte srcRef = ref MemoryMarshal.GetReference(src); ref byte referenceRef = ref MemoryMarshal.GetReference(reference); @@ -449,29 +442,29 @@ public static void FTransform(Span src, Span reference, Span // 20 21 22 23 * // 30 31 32 33 * // Shuffle. 
- Vector128 srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16()); - Vector128 srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16()); - Vector128 refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16()); - Vector128 refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16()); + Vector128 srcLow0 = Vector128_.UnpackLow(src0.AsInt16(), src1.AsInt16()); + Vector128 srcLow1 = Vector128_.UnpackLow(src2.AsInt16(), src3.AsInt16()); + Vector128 refLow0 = Vector128_.UnpackLow(ref0.AsInt16(), ref1.AsInt16()); + Vector128 refLow1 = Vector128_.UnpackLow(ref2.AsInt16(), ref3.AsInt16()); // 00 01 10 11 02 03 12 13 * * ... // 20 21 30 31 22 22 32 33 * * ... // Convert both to 16 bit. - Vector128 src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128.Zero); - Vector128 src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128.Zero); - Vector128 ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128.Zero); - Vector128 ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128.Zero); + Vector128 src0_16b = Vector128_.UnpackLow(srcLow0.AsByte(), Vector128.Zero); + Vector128 src1_16b = Vector128_.UnpackLow(srcLow1.AsByte(), Vector128.Zero); + Vector128 ref0_16b = Vector128_.UnpackLow(refLow0.AsByte(), Vector128.Zero); + Vector128 ref1_16b = Vector128_.UnpackLow(refLow1.AsByte(), Vector128.Zero); // Compute the difference. - Vector128 row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16()); - Vector128 row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16()); + Vector128 row01 = src0_16b.AsInt16() - ref0_16b.AsInt16(); + Vector128 row23 = src1_16b.AsInt16() - ref1_16b.AsInt16(); // First pass. - FTransformPass1SSE2(row01, row23, out Vector128 v01, out Vector128 v32); + FTransformPass1Vector128(row01, row23, out Vector128 v01, out Vector128 v32); // Second pass. 
- FTransformPass2SSE2(v01, v32, output); + FTransformPass2Vector128(v01, v32, output); } else { @@ -517,88 +510,88 @@ public static void FTransform(Span src, Span reference, Span } } - public static void FTransformPass1SSE2(Vector128 row01, Vector128 row23, out Vector128 out01, out Vector128 out32) + public static void FTransformPass1Vector128(Vector128 row01, Vector128 row23, out Vector128 out01, out Vector128 out32) { // *in01 = 00 01 10 11 02 03 12 13 // *in23 = 20 21 30 31 22 23 32 33 - Vector128 shuf01_p = Sse2.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301); - Vector128 shuf32_p = Sse2.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301); + Vector128 shuf01_p = Vector128_.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301); + Vector128 shuf32_p = Vector128_.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301); // 00 01 10 11 03 02 13 12 // 20 21 30 31 23 22 33 32 - Vector128 s01 = Sse2.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64()); - Vector128 s32 = Sse2.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64()); + Vector128 s01 = Vector128_.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64()); + Vector128 s32 = Vector128_.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64()); // 00 01 10 11 20 21 30 31 // 03 02 13 12 23 22 33 32 - Vector128 a01 = Sse2.Add(s01.AsInt16(), s32.AsInt16()); - Vector128 a32 = Sse2.Subtract(s01.AsInt16(), s32.AsInt16()); + Vector128 a01 = s01.AsInt16() + s32.AsInt16(); + Vector128 a32 = s01.AsInt16() - s32.AsInt16(); // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] // [ (a0 + a1) << 3, ... ] - Vector128 tmp0 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p + Vector128 tmp0 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p // [ (a0 - a1) << 3, ... 
] - Vector128 tmp2 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16()); // K88m - Vector128 tmp11 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16()); // K5352_2217p - Vector128 tmp31 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16()); // K5352_2217m - Vector128 tmp12 = Sse2.Add(tmp11, Vector128.Create(1812)); - Vector128 tmp32 = Sse2.Add(tmp31, Vector128.Create(937)); - Vector128 tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9); - Vector128 tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9); - Vector128 s03 = Sse2.PackSignedSaturate(tmp0, tmp2); - Vector128 s12 = Sse2.PackSignedSaturate(tmp1, tmp3); - Vector128 slo = Sse2.UnpackLow(s03, s12); // 0 1 0 1 0 1... - Vector128 shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3 - Vector128 v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32()); - out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32()); - out32 = Sse2.Shuffle(v23, SimdUtils.Shuffle.MMShuffle1032); + Vector128 tmp2 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16()); // K88m + Vector128 tmp11 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16()); // K5352_2217p + Vector128 tmp31 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16()); // K5352_2217m + Vector128 tmp12 = tmp11 + Vector128.Create(1812); + Vector128 tmp32 = tmp31 + Vector128.Create(937); + Vector128 tmp1 = Vector128.ShiftRightArithmetic(tmp12, 9); + Vector128 tmp3 = Vector128.ShiftRightArithmetic(tmp32, 9); + Vector128 s03 = Vector128_.PackSignedSaturate(tmp0, tmp2); + Vector128 s12 = Vector128_.PackSignedSaturate(tmp1, tmp3); + Vector128 slo = 
Vector128_.UnpackLow(s03, s12); // 0 1 0 1 0 1... + Vector128 shi = Vector128_.UnpackHigh(s03, s12); // 2 3 2 3 2 3 + Vector128 v23 = Vector128_.UnpackHigh(slo.AsInt32(), shi.AsInt32()); + out01 = Vector128_.UnpackLow(slo.AsInt32(), shi.AsInt32()); + out32 = Vector128_.ShuffleNative(v23, SimdUtils.Shuffle.MMShuffle1032); } - public static void FTransformPass2SSE2(Vector128 v01, Vector128 v32, Span output) + public static void FTransformPass2Vector128(Vector128 v01, Vector128 v32, Span output) { // Same operations are done on the (0,3) and (1,2) pairs. // a3 = v0 - v3 // a2 = v1 - v2 - Vector128 a32 = Sse2.Subtract(v01.AsInt16(), v32.AsInt16()); - Vector128 a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64()); + Vector128 a32 = v01.AsInt16() - v32.AsInt16(); + Vector128 a22 = Vector128_.UnpackHigh(a32.AsInt64(), a32.AsInt64()); - Vector128 b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16()); - Vector128 c1 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16()); // K5352_2217 - Vector128 c3 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16()); // K2217_5352 - Vector128 d1 = Sse2.Add(c1, Vector128.Create(12000 + (1 << 16))); // K12000PlusOne - Vector128 d3 = Sse2.Add(c3, Vector128.Create(51000)); - Vector128 e1 = Sse2.ShiftRightArithmetic(d1, 16); - Vector128 e3 = Sse2.ShiftRightArithmetic(d3, 16); + Vector128 b23 = Vector128_.UnpackLow(a22.AsInt16(), a32.AsInt16()); + Vector128 c1 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16()); // K5352_2217 + Vector128 c3 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16()); // K2217_5352 + Vector128 d1 = c1 + Vector128.Create(12000 + (1 << 16)); // K12000PlusOne + Vector128 d3 = c3 + Vector128.Create(51000); + Vector128 e1 
= Vector128.ShiftRightArithmetic(d1, 16); + Vector128 e3 = Vector128.ShiftRightArithmetic(d3, 16); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) - Vector128 f1 = Sse2.PackSignedSaturate(e1, e1); - Vector128 f3 = Sse2.PackSignedSaturate(e3, e3); + Vector128 f1 = Vector128_.PackSignedSaturate(e1, e1); + Vector128 f3 = Vector128_.PackSignedSaturate(e3, e3); // g1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. // -> g1 = f1 + 1 - (a3 == 0) - Vector128 g1 = Sse2.Add(f1, Sse2.CompareEqual(a32, Vector128.Zero)); + Vector128 g1 = f1 + Vector128.Equals(a32, Vector128.Zero); // a0 = v0 + v3 // a1 = v1 + v2 - Vector128 a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16()); - Vector128 a01Plus7 = Sse2.Add(a01.AsInt16(), Vector128.Create((short)7)); - Vector128 a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16(); - Vector128 c0 = Sse2.Add(a01Plus7, a11); - Vector128 c2 = Sse2.Subtract(a01Plus7, a11); + Vector128 a01 = v01.AsInt16() + v32.AsInt16(); + Vector128 a01Plus7 = a01.AsInt16() + Vector128.Create((short)7); + Vector128 a11 = Vector128_.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16(); + Vector128 c0 = a01Plus7 + a11; + Vector128 c2 = a01Plus7 - a11; // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; - Vector128 d0 = Sse2.ShiftRightArithmetic(c0, 4); - Vector128 d2 = Sse2.ShiftRightArithmetic(c2, 4); + Vector128 d0 = Vector128.ShiftRightArithmetic(c0, 4); + Vector128 d2 = Vector128.ShiftRightArithmetic(c2, 4); - Vector128 d0g1 = Sse2.UnpackLow(d0.AsInt64(), g1.AsInt64()); - Vector128 d2f3 = Sse2.UnpackLow(d2.AsInt64(), f3.AsInt64()); + Vector128 d0g1 = Vector128_.UnpackLow(d0.AsInt64(), g1.AsInt64()); + Vector128 d2f3 = Vector128_.UnpackLow(d2.AsInt64(), f3.AsInt64()); ref short outputRef = ref MemoryMarshal.GetReference(output); Unsafe.As>(ref outputRef) = d0g1.AsInt16(); From 
b5fe86c08ca7553f40519e24f8540307377b2b62 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 3 Jun 2025 20:20:57 +1000 Subject: [PATCH 13/20] Port YuvConversion --- .../Common/Helpers/Vector128Utilities.cs | 141 +++++++++++- .../Formats/Webp/Lossy/YuvConversion.cs | 209 +++++++++--------- 2 files changed, 244 insertions(+), 106 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 50eeb8e0a7..760296c9d3 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -608,6 +608,44 @@ public static Vector128 MultiplyHigh(Vector128 left, Vector128 + /// Multiply the packed 16-bit unsigned integers in and , producing + /// intermediate unsigned 32-bit integers, and store the high 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit unsigned integers to multiply. + /// + /// + /// The second vector containing packed 16-bit unsigned integers to multiply. + /// + /// + /// A vector containing the high 16 bits of the products of the packed 16-bit unsigned integers + /// from and . 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyHigh(left, right); + } + + // Widen each half of the short vectors into two uint vectors + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; + + // Arithmetic shift right by 16 bits to extract the high word + prodLo >>= 16; + prodHi >>= 16; + + // Narrow the two int vectors back into one short vector + return Vector128.Narrow(prodLo, prodHi); + } + /// /// Unpack and interleave 64-bit integers from the high half of and /// and store the results in the result. @@ -927,7 +965,7 @@ public static Vector128 UnpackLow(Vector128 left, Vector128 /// The second vector containing packed signed 16-bit integers to subtract. /// /// - /// A vector containing the results of subtracting packed unsigned 16-bit integers + /// A vector containing the results of subtracting packed signed 16-bit integers /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 SubtractSaturate(Vector128 left, Vector128 right) @@ -967,7 +1005,57 @@ public static Vector128 SubtractSaturate(Vector128 left, Vector128 } /// - /// Add packed unsigned 8-bit integers in from packed unsigned 8-bit integers + /// Subtract packed unsigned 16-bit integers in from packed unsigned 16-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 16-bit integers to subtract from. + /// + /// + /// The second vector containing packed unsigned 16-bit integers to subtract. 
+ /// + /// + /// A vector containing the results of subtracting packed unsigned 16-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 SubtractSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.SubtractSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.SubtractSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.SubtractSaturate(left, right); + } + + // Widen inputs to 32-bit signed + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Subtract + Vector128 diffLo = leftLo - rightLo; + Vector128 diffHi = leftHi - rightHi; + + // Clamp to signed 16-bit range + Vector128 min = Vector128.Create((uint)ushort.MinValue); + Vector128 max = Vector128.Create((uint)ushort.MaxValue); + + diffLo = Clamp(diffLo, min, max); + diffHi = Clamp(diffHi, min, max); + + // Narrow back to 16 bit signed. + return Vector128.Narrow(diffLo, diffHi); + } + + /// + /// Add packed unsigned 8-bit integers in to packed unsigned 8-bit integers /// in using saturation, and store the results. /// /// @@ -1015,6 +1103,55 @@ public static Vector128 AddSaturate(Vector128 left, Vector128 return Vector128.Narrow(sumLo, sumHi); } + /// + /// Add packed unsigned 16-bit integers in to packed unsigned 16-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 16-bit integers to add to. + /// + /// + /// The second vector containing packed unsigned 16-bit integers to add. 
+ /// + /// + /// A vector containing the results of adding packed unsigned 16-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 AddSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.AddSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.AddSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.AddSaturate(left, right); + } + + // Widen inputs to 32-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Add + Vector128 sumLo = leftLo + rightLo; + Vector128 sumHi = leftHi + rightHi; + + // Clamp to signed 16-bit range + Vector128 max = Vector128.Create((uint)ushort.MaxValue); + + sumLo = Clamp(sumLo, Vector128.Zero, max); + sumHi = Clamp(sumHi, Vector128.Zero, max); + + // Narrow back to 16 bit unsigned. + return Vector128.Narrow(sumLo, sumHi); + } + /// /// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers /// in using saturation, and store the results. 
diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 40146c6af8..d5f91b7c88 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -5,7 +5,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; @@ -29,9 +29,9 @@ internal static class YuvConversion // ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16 public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) { - if (Sse41.IsSupported) + if (Vector128.IsHardwareAccelerated) { - UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer); + UpSampleVector128(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer); } else { @@ -107,7 +107,7 @@ private static void UpSampleScalar(Span topY, Span bottomY, Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) + private static void UpSampleVector128(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) { const int xStep = 3; Array.Clear(uvBuffer); @@ -138,18 +138,18 @@ private static void UpSampleSse41(Span topY, Span bottomY, Span topY, Span bottomY, Span tmpBottomDst = tmpTopDst[(4 * 32)..]; Span tmpTop = tmpBottomDst[(4 * 32)..]; Span tmpBottom = bottomY.IsEmpty ? 
null : tmpTop[32..]; - UpSampleLastBlock(topU[uvPos..], curU[uvPos..], leftOver, ru); - UpSampleLastBlock(topV[uvPos..], curV[uvPos..], leftOver, rv); + UpSampleLastBlockVector128(topU[uvPos..], curU[uvPos..], leftOver, ru); + UpSampleLastBlockVector128(topV[uvPos..], curV[uvPos..], leftOver, rv); topY[pos..len].CopyTo(tmpTop); if (!bottomY.IsEmpty) { bottomY[pos..len].CopyTo(tmpBottom); - ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); + ConvertYuvToBgrWithBottomYVector128(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); } else { - ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep); + ConvertYuvToBgrVector128(tmpTop, tmpTopDst, ru, rv, 0, xStep); } tmpTopDst[..((len - pos) * xStep)].CopyTo(topDst[(pos * xStep)..]); @@ -184,7 +184,7 @@ private static void UpSampleSse41(Span topY, Span bottomY, Span output) + private static void UpSample32PixelsVector128(ref byte r1, ref byte r2, Span output) { // Load inputs. Vector128 a = Unsafe.As>(ref r1); @@ -192,28 +192,28 @@ private static void UpSample32Pixels(ref byte r1, ref byte r2, Span output Vector128 c = Unsafe.As>(ref r2); Vector128 d = Unsafe.As>(ref Unsafe.Add(ref r2, 1)); - Vector128 s = Sse2.Average(a, d); // s = (a + d + 1) / 2 - Vector128 t = Sse2.Average(b, c); // t = (b + c + 1) / 2 - Vector128 st = Sse2.Xor(s, t); // st = s^t + Vector128 s = Vector128_.Average(a, d); // s = (a + d + 1) / 2 + Vector128 t = Vector128_.Average(b, c); // t = (b + c + 1) / 2 + Vector128 st = s ^ t; // st = s^t - Vector128 ad = Sse2.Xor(a, d); // ad = a^d - Vector128 bc = Sse2.Xor(b, c); // bc = b^c + Vector128 ad = a ^ d; // ad = a^d + Vector128 bc = b ^ c; // bc = b^c - Vector128 t1 = Sse2.Or(ad, bc); // (a^d) | (b^c) - Vector128 t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t) - Vector128 t3 = Sse2.And(t2, Vector128.Create((byte)1)); // (a^d) | (b^c) | (s^t) & 1 - Vector128 t4 = Sse2.Average(s, t); - Vector128 k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4 
+ Vector128 t1 = ad | bc; // (a^d) | (b^c) + Vector128 t2 = t1 | st; // (a^d) | (b^c) | (s^t) + Vector128 t3 = t2 & Vector128.Create((byte)1); // (a^d) | (b^c) | (s^t) & 1 + Vector128 t4 = Vector128_.Average(s, t); + Vector128 k = t4 - t3; // k = (a + b + c + d) / 4 - Vector128 diag1 = GetM(k, st, bc, t); - Vector128 diag2 = GetM(k, st, ad, s); + Vector128 diag1 = GetMVector128(k, st, bc, t); + Vector128 diag2 = GetMVector128(k, st, ad, s); // Pack the alternate pixels. - PackAndStore(a, b, diag1, diag2, output); // store top. - PackAndStore(c, d, diag2, diag1, output[(2 * 32)..]); + PackAndStoreVector128(a, b, diag1, diag2, output); // store top. + PackAndStoreVector128(c, d, diag2, diag1, output[(2 * 32)..]); } - private static void UpSampleLastBlock(Span tb, Span bb, int numPixels, Span output) + private static void UpSampleLastBlockVector128(Span tb, Span bb, int numPixels, Span output) { Span r1 = stackalloc byte[17]; Span r2 = stackalloc byte[17]; @@ -230,27 +230,27 @@ private static void UpSampleLastBlock(Span tb, Span bb, int numPixel ref byte r1Ref = ref MemoryMarshal.GetReference(r1); ref byte r2Ref = ref MemoryMarshal.GetReference(r2); - UpSample32Pixels(ref r1Ref, ref r2Ref, output); + UpSample32PixelsVector128(ref r1Ref, ref r2Ref, output); } // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1 - private static Vector128 GetM(Vector128 k, Vector128 st, Vector128 ij, Vector128 input) + private static Vector128 GetMVector128(Vector128 k, Vector128 st, Vector128 ij, Vector128 input) { - Vector128 tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2 - Vector128 tmp1 = Sse2.And(ij, st); // (ij) & (s^t) - Vector128 tmp2 = Sse2.Xor(k, input); // (k^in) - Vector128 tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in) - Vector128 tmp4 = Sse2.And(tmp3, Vector128.Create((byte)1)); // & 1 -> lsb_correction + Vector128 tmp0 = Vector128_.Average(k, input); // (k + in + 1) / 2 + Vector128 tmp1 = ij & st; // (ij) & (s^t) + Vector128 tmp2 = k ^ input; // 
(k^in) + Vector128 tmp3 = tmp1 | tmp2; // ((ij) & (s^t)) | (k^in) + Vector128 tmp4 = tmp3 & Vector128.Create((byte)1); // & 1 -> lsb_correction - return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction + return tmp0 - tmp4; // (k + in + 1) / 2 - lsb_correction } - private static void PackAndStore(Vector128 a, Vector128 b, Vector128 da, Vector128 db, Span output) + private static void PackAndStoreVector128(Vector128 a, Vector128 b, Vector128 da, Vector128 db, Span output) { - Vector128 ta = Sse2.Average(a, da); // (9a + 3b + 3c + d + 8) / 16 - Vector128 tb = Sse2.Average(b, db); // (3a + 9b + c + 3d + 8) / 16 - Vector128 t1 = Sse2.UnpackLow(ta, tb); - Vector128 t2 = Sse2.UnpackHigh(ta, tb); + Vector128 ta = Vector128_.Average(a, da); // (9a + 3b + 3c + d + 8) / 16 + Vector128 tb = Vector128_.Average(b, db); // (3a + 9b + c + 3d + 8) / 16 + Vector128 t1 = Vector128_.UnpackLow(ta, tb); + Vector128 t2 = Vector128_.UnpackHigh(ta, tb); ref byte output0Ref = ref MemoryMarshal.GetReference(output); ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16); @@ -562,41 +562,42 @@ public static void YuvToBgr(int y, int u, int v, Span bgr) } [MethodImpl(InliningOptions.ShortMethod)] - private static void ConvertYuvToBgrSse41(Span topY, Span topDst, Span ru, Span rv, int curX, int step) => YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]); + private static void ConvertYuvToBgrVector128(Span topY, Span topDst, Span ru, Span rv, int curX, int step) + => YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]); [MethodImpl(InliningOptions.ShortMethod)] - private static void ConvertYuvToBgrWithBottomYSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) + private static void ConvertYuvToBgrWithBottomYVector128(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) { - YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]); - YuvToBgrSse41(bottomY[curX..], 
ru[64..], rv[64..], bottomDst[(curX * step)..]); + YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]); + YuvToBgrVector128(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]); } - private static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) + private static void YuvToBgrVector128(Span y, Span u, Span v, Span dst) { ref byte yRef = ref MemoryMarshal.GetReference(y); ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - ConvertYuv444ToBgrSse41(ref yRef, ref uRef, ref vRef, out Vector128 r0, out Vector128 g0, out Vector128 b0); - ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128 r1, out Vector128 g1, out Vector128 b1); - ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128 r2, out Vector128 g2, out Vector128 b2); - ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128 r3, out Vector128 g3, out Vector128 b3); + ConvertYuv444ToBgrVector128(ref yRef, ref uRef, ref vRef, out Vector128 r0, out Vector128 g0, out Vector128 b0); + ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128 r1, out Vector128 g1, out Vector128 b1); + ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128 r2, out Vector128 g2, out Vector128 b2); + ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128 r3, out Vector128 g3, out Vector128 b3); // Cast to 8b and store as BBBBGGGGRRRR. 
- Vector128 bgr0 = Sse2.PackUnsignedSaturate(b0, b1); - Vector128 bgr1 = Sse2.PackUnsignedSaturate(b2, b3); - Vector128 bgr2 = Sse2.PackUnsignedSaturate(g0, g1); - Vector128 bgr3 = Sse2.PackUnsignedSaturate(g2, g3); - Vector128 bgr4 = Sse2.PackUnsignedSaturate(r0, r1); - Vector128 bgr5 = Sse2.PackUnsignedSaturate(r2, r3); + Vector128 bgr0 = Vector128_.PackUnsignedSaturate(b0, b1); + Vector128 bgr1 = Vector128_.PackUnsignedSaturate(b2, b3); + Vector128 bgr2 = Vector128_.PackUnsignedSaturate(g0, g1); + Vector128 bgr3 = Vector128_.PackUnsignedSaturate(g2, g3); + Vector128 bgr4 = Vector128_.PackUnsignedSaturate(r0, r1); + Vector128 bgr5 = Vector128_.PackUnsignedSaturate(r2, r3); // Pack as BGRBGRBGRBGR. - PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst); + PlanarTo24bVector128(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst); } // Pack the planar buffers // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... - private static void PlanarTo24bSse41(Vector128 input0, Vector128 input1, Vector128 input2, Vector128 input3, Vector128 input4, Vector128 input5, Span rgb) + private static void PlanarTo24bVector128(Vector128 input0, Vector128 input1, Vector128 input2, Vector128 input3, Vector128 input4, Vector128 input5, Span rgb) { // The input is 6 registers of sixteen 8b but for the sake of explanation, // let's take 6 registers of four 8b values. @@ -612,7 +613,7 @@ private static void PlanarTo24bSse41(Vector128 input0, Vector128 inp // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 // Process R. - ChannelMixing( + ChannelMixingVector128( input0, input1, Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5), // PlanarTo24Shuffle0 @@ -627,7 +628,7 @@ private static void PlanarTo24bSse41(Vector128 input0, Vector128 inp // Process G. // Same as before, just shifted to the left by one and including the right padding. 
- ChannelMixing( + ChannelMixingVector128( input2, input3, Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255), // PlanarTo24Shuffle3 @@ -641,7 +642,7 @@ private static void PlanarTo24bSse41(Vector128 input0, Vector128 inp out Vector128 g5); // Process B. - ChannelMixing( + ChannelMixingVector128( input4, input5, Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255), // PlanarTo24Shuffle6 @@ -655,24 +656,24 @@ private static void PlanarTo24bSse41(Vector128 input0, Vector128 inp out Vector128 b5); // OR the different channels. - Vector128 rg0 = Sse2.Or(r0, g0); - Vector128 rg1 = Sse2.Or(r1, g1); - Vector128 rg2 = Sse2.Or(r2, g2); - Vector128 rg3 = Sse2.Or(r3, g3); - Vector128 rg4 = Sse2.Or(r4, g4); - Vector128 rg5 = Sse2.Or(r5, g5); + Vector128 rg0 = r0 | g0; + Vector128 rg1 = r1 | g1; + Vector128 rg2 = r2 | g2; + Vector128 rg3 = r3 | g3; + Vector128 rg4 = r4 | g4; + Vector128 rg5 = r5 | g5; ref byte outputRef = ref MemoryMarshal.GetReference(rgb); - Unsafe.As>(ref outputRef) = Sse2.Or(rg0, b0); - Unsafe.As>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1); - Unsafe.As>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2); - Unsafe.As>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3); - Unsafe.As>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4); - Unsafe.As>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5); + Unsafe.As>(ref outputRef) = rg0 | b0; + Unsafe.As>(ref Unsafe.Add(ref outputRef, 16)) = rg1 | b1; + Unsafe.As>(ref Unsafe.Add(ref outputRef, 32)) = rg2 | b2; + Unsafe.As>(ref Unsafe.Add(ref outputRef, 48)) = rg3 | b3; + Unsafe.As>(ref Unsafe.Add(ref outputRef, 64)) = rg4 | b4; + Unsafe.As>(ref Unsafe.Add(ref outputRef, 80)) = rg5 | b5; } // Shuffles the input buffer as A0 0 0 A1 0 0 A2 - private static void ChannelMixing( + private static void ChannelMixingVector128( Vector128 input0, Vector128 input1, Vector128 shuffle0, @@ -685,53 +686,53 @@ private static void 
ChannelMixing( out Vector128 output4, out Vector128 output5) { - output0 = Ssse3.Shuffle(input0, shuffle0); - output1 = Ssse3.Shuffle(input0, shuffle1); - output2 = Ssse3.Shuffle(input0, shuffle2); - output3 = Ssse3.Shuffle(input1, shuffle0); - output4 = Ssse3.Shuffle(input1, shuffle1); - output5 = Ssse3.Shuffle(input1, shuffle2); + output0 = Vector128_.ShuffleNative(input0, shuffle0); + output1 = Vector128_.ShuffleNative(input0, shuffle1); + output2 = Vector128_.ShuffleNative(input0, shuffle2); + output3 = Vector128_.ShuffleNative(input1, shuffle0); + output4 = Vector128_.ShuffleNative(input1, shuffle1); + output5 = Vector128_.ShuffleNative(input1, shuffle2); } // Convert 32 samples of YUV444 to B/G/R - private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128 r, out Vector128 g, out Vector128 b) + private static void ConvertYuv444ToBgrVector128(ref byte y, ref byte u, ref byte v, out Vector128 r, out Vector128 g, out Vector128 b) { // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. Vector128 y0 = Unsafe.As>(ref y); Vector128 u0 = Unsafe.As>(ref u); Vector128 v0 = Unsafe.As>(ref v); - y0 = Sse2.UnpackLow(Vector128.Zero, y0); - u0 = Sse2.UnpackLow(Vector128.Zero, u0); - v0 = Sse2.UnpackLow(Vector128.Zero, v0); + y0 = Vector128_.UnpackLow(Vector128.Zero, y0); + u0 = Vector128_.UnpackLow(Vector128.Zero, u0); + v0 = Vector128_.UnpackLow(Vector128.Zero, v0); // These constants are 14b fixed-point version of ITU-R BT.601 constants. 
// R = (19077 * y + 26149 * v - 14234) >> 6 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 // B = (19077 * y + 33050 * u - 17685) >> 6 - var k19077 = Vector128.Create((ushort)19077); - var k26149 = Vector128.Create((ushort)26149); - var k14234 = Vector128.Create((ushort)14234); + Vector128 k19077 = Vector128.Create((ushort)19077); + Vector128 k26149 = Vector128.Create((ushort)26149); + Vector128 k14234 = Vector128.Create((ushort)14234); - Vector128 y1 = Sse2.MultiplyHigh(y0.AsUInt16(), k19077); - Vector128 r0 = Sse2.MultiplyHigh(v0.AsUInt16(), k26149); - Vector128 g0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419)); - Vector128 g1 = Sse2.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320)); + Vector128 y1 = Vector128_.MultiplyHigh(y0.AsUInt16(), k19077); + Vector128 r0 = Vector128_.MultiplyHigh(v0.AsUInt16(), k26149); + Vector128 g0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419)); + Vector128 g1 = Vector128_.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320)); - Vector128 r1 = Sse2.Subtract(y1.AsUInt16(), k14234); - Vector128 r2 = Sse2.Add(r1, r0); + Vector128 r1 = y1.AsUInt16() - k14234; + Vector128 r2 = r1 + r0; - Vector128 g2 = Sse2.Add(y1.AsUInt16(), Vector128.Create((ushort)8708)); - Vector128 g3 = Sse2.Add(g0, g1); - Vector128 g4 = Sse2.Subtract(g2, g3); + Vector128 g2 = y1.AsUInt16() + Vector128.Create((ushort)8708); + Vector128 g3 = g0 + g1; + Vector128 g4 = g2 - g3; - Vector128 b0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16()); - Vector128 b1 = Sse2.AddSaturate(b0, y1); - Vector128 b2 = Sse2.SubtractSaturate(b1, Vector128.Create((ushort)17685)); + Vector128 b0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16()); + Vector128 b1 = Vector128_.AddSaturate(b0, y1); + Vector128 b2 = Vector128_.SubtractSaturate(b1, 
Vector128.Create((ushort)17685)); // Use logical shift for B2, which can be larger than 32767. - r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815] - g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710] - b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238] + r = Vector128.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815] + g = Vector128.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710] + b = Vector128.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238] } [MethodImpl(InliningOptions.ShortMethod)] From 1a91ec9d86059950eb4b1b91413eea0768f2d008 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 3 Jun 2025 23:22:08 +1000 Subject: [PATCH 14/20] Port common utils and alpha decoder --- .../Common/Helpers/Vector128Utilities.cs | 64 ++++++++++++++ .../Common/Helpers/Vector256Utilities.cs | 43 ++++++++++ src/ImageSharp/Formats/Webp/AlphaDecoder.cs | 7 +- .../Formats/Webp/WebpCommonUtils.cs | 86 +++++++++---------- 4 files changed, 153 insertions(+), 47 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 760296c9d3..4492b297ce 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -1300,4 +1300,68 @@ public static Vector128 SubtractSaturate(Vector128 left, Vector128 // Narrow back to signed bytes return Vector128.Narrow(diffLo, diffHi); } + + /// + /// Create mask from the most significant bit of each 8-bit element in , and store the result. + /// + /// + /// The vector containing packed 8-bit integers from which to create the mask. + /// + /// + /// A 16-bit integer mask where each bit corresponds to the most significant bit of each 8-bit element + /// in . 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int MoveMask(Vector128 value) + { + if (Sse2.IsSupported) + { + return Sse2.MoveMask(value); + } + + if (AdvSimd.IsSupported) + { + // https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon + Vector128 powers = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); + Vector128 masked = value & powers; + + Vector128 sum8 = AdvSimd.AddPairwiseWidening(masked); + Vector128 sum16 = AdvSimd.AddPairwiseWidening(sum8); + Vector128 sum32 = AdvSimd.AddPairwiseWidening(sum16); + + // Extract lower 8 bits of each 64-bit lane + byte lo = sum32.AsByte().GetElement(0); + byte hi = sum32.AsByte().GetElement(8); + + return (hi << 8) | lo; + } + + { + // Step 1: isolate MSBs + Vector128 msbMask = Vector128.Create((byte)0x80); + Vector128 masked = value & msbMask; + + // Step 2: shift each byte so MSB lands in bit position [0..15] + // i.e. convert: 0x80 → 1 << i + Vector128 bitShifts = Vector128.Create((ushort)1, 2, 4, 8, 16, 32, 64, 128); + Vector128 bitShiftsHigh = Vector128.Create(256, 512, 1024, 2048, 4096, 8192, 16384, 32768); + + // Step 3: widen to ushort + (Vector128 lo, Vector128 hi) = Vector128.Widen(masked); + + // Step 4: compare > 0 to get 0xFFFF where MSB was set + lo = Vector128.ConditionalSelect(Vector128.Equals(lo, Vector128.Zero), Vector128.Zero, bitShifts); + hi = Vector128.ConditionalSelect(Vector128.Equals(hi, Vector128.Zero), Vector128.Zero, bitShiftsHigh); + + // Step 5: bitwise OR the two halves + Vector128 maskVector = lo | hi; + + // Step 6: horizontal OR reduction via shuffles + maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)4, 5, 6, 7, 0, 1, 2, 3)); + maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)2, 3, 0, 1, 6, 7, 4, 5)); + maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)1, 0, 3, 2, 5, 4, 7, 6)); + + return maskVector.ToScalar(); + } + } } diff --git 
a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 71dfadc399..e61dcb6bf9 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -231,6 +231,27 @@ public static Vector256 PackSignedSaturate(Vector256 left, Vector256 return Vector256.Narrow(lefClamped, rightClamped); } + /// + /// Packs signed 16-bit integers to signed 8-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 PackSignedSaturate(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.PackSignedSaturate(left, right); + } + + Vector256 min = Vector256.Create((short)sbyte.MinValue); + Vector256 max = Vector256.Create((short)sbyte.MaxValue); + Vector256 lefClamped = Clamp(left, min, max); + Vector256 rightClamped = Clamp(right, min, max); + return Vector256.Narrow(lefClamped, rightClamped); + } + /// /// Restricts a vector between a minimum and a maximum value. /// @@ -466,6 +487,28 @@ public static Vector256 SubtractSaturate(Vector256 left, Vector256 + /// Create mask from the most significant bit of each 8-bit element in , and store the result. + /// + /// + /// The vector containing packed 8-bit integers from which to create the mask. + /// + /// + /// A 16-bit integer mask where each bit corresponds to the most significant bit of each 8-bit element + /// in . 
+ /// + public static int MoveMask(Vector256 value) + { + if (Avx2.IsSupported) + { + return Avx2.MoveMask(value); + } + + int loMask = Vector128_.MoveMask(value.GetLower()); + int hiMask = Vector128_.MoveMask(value.GetUpper()); + return loMask | (hiMask << 16); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Webp/AlphaDecoder.cs b/src/ImageSharp/Formats/Webp/AlphaDecoder.cs index 43dab1ffc4..c7ce12fc7b 100644 --- a/src/ImageSharp/Formats/Webp/AlphaDecoder.cs +++ b/src/ImageSharp/Formats/Webp/AlphaDecoder.cs @@ -6,7 +6,6 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Formats.Webp.BitReader; @@ -314,7 +313,7 @@ private static void ColorIndexInverseTransformAlpha( private static void HorizontalUnfilter(Span prev, Span input, Span dst, int width) { - if ((Sse2.IsSupported || AdvSimd.IsSupported) && width >= 9) + if (Vector128.IsHardwareAccelerated && width >= 9) { dst[0] = (byte)(input[0] + (prev.IsEmpty ? 
0 : prev[0])); nuint i; @@ -362,7 +361,7 @@ private static void VerticalUnfilter(Span prev, Span input, Span prev, Span input, Span a0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, i)); Vector256 b0 = Unsafe.As>(ref Unsafe.Add(ref prevRef, i)); - Vector256 c0 = Avx2.Add(a0.AsByte(), b0.AsByte()); + Vector256 c0 = a0.AsByte() + b0.AsByte(); ref byte outputRef = ref Unsafe.Add(ref dstRef, i); Unsafe.As>(ref outputRef) = c0; } diff --git a/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs b/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs index 1ca409f9a4..b08fe15f51 100644 --- a/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs +++ b/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs @@ -3,7 +3,7 @@ using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.PixelFormats; namespace SixLabors.ImageSharp.Formats.Webp; @@ -20,7 +20,7 @@ internal static class WebpCommonUtils /// Returns true if alpha has non-0xff values. 
public static unsafe bool CheckNonOpaque(ReadOnlySpan row) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { ReadOnlySpan rowBytes = MemoryMarshal.AsBytes(row); int i = 0; @@ -32,19 +32,19 @@ public static unsafe bool CheckNonOpaque(ReadOnlySpan row) for (; i + 128 <= length; i += 128) { - Vector256 a0 = Avx.LoadVector256(src + i).AsByte(); - Vector256 a1 = Avx.LoadVector256(src + i + 32).AsByte(); - Vector256 a2 = Avx.LoadVector256(src + i + 64).AsByte(); - Vector256 a3 = Avx.LoadVector256(src + i + 96).AsByte(); - Vector256 b0 = Avx2.And(a0, alphaMaskVector256).AsInt32(); - Vector256 b1 = Avx2.And(a1, alphaMaskVector256).AsInt32(); - Vector256 b2 = Avx2.And(a2, alphaMaskVector256).AsInt32(); - Vector256 b3 = Avx2.And(a3, alphaMaskVector256).AsInt32(); - Vector256 c0 = Avx2.PackSignedSaturate(b0, b1).AsInt16(); - Vector256 c1 = Avx2.PackSignedSaturate(b2, b3).AsInt16(); - Vector256 d = Avx2.PackSignedSaturate(c0, c1).AsByte(); - Vector256 bits = Avx2.CompareEqual(d, all0x80Vector256); - int mask = Avx2.MoveMask(bits); + Vector256 a0 = Vector256.Load(src + i).AsByte(); + Vector256 a1 = Vector256.Load(src + i + 32).AsByte(); + Vector256 a2 = Vector256.Load(src + i + 64).AsByte(); + Vector256 a3 = Vector256.Load(src + i + 96).AsByte(); + Vector256 b0 = (a0 & alphaMaskVector256).AsInt32(); + Vector256 b1 = (a1 & alphaMaskVector256).AsInt32(); + Vector256 b2 = (a2 & alphaMaskVector256).AsInt32(); + Vector256 b3 = (a3 & alphaMaskVector256).AsInt32(); + Vector256 c0 = Vector256_.PackSignedSaturate(b0, b1).AsInt16(); + Vector256 c1 = Vector256_.PackSignedSaturate(b2, b3).AsInt16(); + Vector256 d = Vector256_.PackSignedSaturate(c0, c1).AsByte(); + Vector256 bits = Vector256.Equals(d, all0x80Vector256); + int mask = Vector256_.MoveMask(bits); if (mask != -1) { return true; @@ -53,7 +53,7 @@ public static unsafe bool CheckNonOpaque(ReadOnlySpan row) for (; i + 64 <= length; i += 64) { - if (IsNoneOpaque64Bytes(src, i)) + if (IsNoneOpaque64BytesVector128(src, 
i)) { return true; } @@ -61,7 +61,7 @@ public static unsafe bool CheckNonOpaque(ReadOnlySpan row) for (; i + 32 <= length; i += 32) { - if (IsNoneOpaque32Bytes(src, i)) + if (IsNonOpaque32BytesVector128(src, i)) { return true; } @@ -76,7 +76,7 @@ public static unsafe bool CheckNonOpaque(ReadOnlySpan row) } } } - else if (Sse2.IsSupported) + else if (Vector128.IsHardwareAccelerated) { ReadOnlySpan rowBytes = MemoryMarshal.AsBytes(row); int i = 0; @@ -85,7 +85,7 @@ public static unsafe bool CheckNonOpaque(ReadOnlySpan row) { for (; i + 64 <= length; i += 64) { - if (IsNoneOpaque64Bytes(src, i)) + if (IsNoneOpaque64BytesVector128(src, i)) { return true; } @@ -93,7 +93,7 @@ public static unsafe bool CheckNonOpaque(ReadOnlySpan row) for (; i + 32 <= length; i += 32) { - if (IsNoneOpaque32Bytes(src, i)) + if (IsNonOpaque32BytesVector128(src, i)) { return true; } @@ -122,38 +122,38 @@ public static unsafe bool CheckNonOpaque(ReadOnlySpan row) return false; } - private static unsafe bool IsNoneOpaque64Bytes(byte* src, int i) + private static unsafe bool IsNoneOpaque64BytesVector128(byte* src, int i) { Vector128 alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255); - Vector128 a0 = Sse2.LoadVector128(src + i).AsByte(); - Vector128 a1 = Sse2.LoadVector128(src + i + 16).AsByte(); - Vector128 a2 = Sse2.LoadVector128(src + i + 32).AsByte(); - Vector128 a3 = Sse2.LoadVector128(src + i + 48).AsByte(); - Vector128 b0 = Sse2.And(a0, alphaMask).AsInt32(); - Vector128 b1 = Sse2.And(a1, alphaMask).AsInt32(); - Vector128 b2 = Sse2.And(a2, alphaMask).AsInt32(); - Vector128 b3 = Sse2.And(a3, alphaMask).AsInt32(); - Vector128 c0 = Sse2.PackSignedSaturate(b0, b1).AsInt16(); - Vector128 c1 = Sse2.PackSignedSaturate(b2, b3).AsInt16(); - Vector128 d = Sse2.PackSignedSaturate(c0, c1).AsByte(); - Vector128 bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte()); - int mask = Sse2.MoveMask(bits); + Vector128 a0 = Vector128.Load(src + i).AsByte(); + 
Vector128 a1 = Vector128.Load(src + i + 16).AsByte(); + Vector128 a2 = Vector128.Load(src + i + 32).AsByte(); + Vector128 a3 = Vector128.Load(src + i + 48).AsByte(); + Vector128 b0 = (a0 & alphaMask).AsInt32(); + Vector128 b1 = (a1 & alphaMask).AsInt32(); + Vector128 b2 = (a2 & alphaMask).AsInt32(); + Vector128 b3 = (a3 & alphaMask).AsInt32(); + Vector128 c0 = Vector128_.PackSignedSaturate(b0, b1).AsInt16(); + Vector128 c1 = Vector128_.PackSignedSaturate(b2, b3).AsInt16(); + Vector128 d = Vector128_.PackSignedSaturate(c0, c1).AsByte(); + Vector128 bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte()); + int mask = Vector128_.MoveMask(bits); return mask != 0xFFFF; } - private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i) + private static unsafe bool IsNonOpaque32BytesVector128(byte* src, int i) { Vector128 alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255); - Vector128 a0 = Sse2.LoadVector128(src + i).AsByte(); - Vector128 a1 = Sse2.LoadVector128(src + i + 16).AsByte(); - Vector128 b0 = Sse2.And(a0, alphaMask).AsInt32(); - Vector128 b1 = Sse2.And(a1, alphaMask).AsInt32(); - Vector128 c = Sse2.PackSignedSaturate(b0, b1).AsInt16(); - Vector128 d = Sse2.PackSignedSaturate(c, c).AsByte(); - Vector128 bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte()); - int mask = Sse2.MoveMask(bits); + Vector128 a0 = Vector128.Load(src + i).AsByte(); + Vector128 a1 = Vector128.Load(src + i + 16).AsByte(); + Vector128 b0 = (a0 & alphaMask).AsInt32(); + Vector128 b1 = (a1 & alphaMask).AsInt32(); + Vector128 c = Vector128_.PackSignedSaturate(b0, b1).AsInt16(); + Vector128 d = Vector128_.PackSignedSaturate(c, c).AsByte(); + Vector128 bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte()); + int mask = Vector128_.MoveMask(bits); return mask != 0xFFFF; } } From 6b5392bc3d73bc8da1d9ad750eed636d62cdeddb Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 3 Jun 2025 23:51:57 +1000 Subject: [PATCH 
15/20] Remove restrictions from vector utilities --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 18 ++++----- .../Common/Helpers/Vector128Utilities.cs | 4 +- .../Common/Helpers/Vector256Utilities.cs | 37 ++----------------- .../Common/Helpers/Vector512Utilities.cs | 37 ++----------------- .../Webp/Lossless/ColorSpaceTransformUtils.cs | 2 +- 5 files changed, 21 insertions(+), 77 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 0f399d2de0..6fef043169 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -66,9 +66,9 @@ public static void Shuffle4Reduce( ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) || - (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) || - Vector128.IsHardwareAccelerated) + if (Vector512.IsHardwareAccelerated || + Vector256.IsHardwareAccelerated || + Vector128.IsHardwareAccelerated) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -112,8 +112,8 @@ public static void Shuffle4Reduce( ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) || - (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) || + if (Vector512.IsHardwareAccelerated || + Vector256.IsHardwareAccelerated || Vector128.IsHardwareAccelerated) { int remainder = 0; @@ -249,7 +249,7 @@ private static void Shuffle4( Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) + if (Vector512.IsHardwareAccelerated) { ref Vector512 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector512 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -277,7 +277,7 @@ private 
static void Shuffle4( } } } - else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) + else if (Vector256.IsHardwareAccelerated) { ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector256 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -341,7 +341,7 @@ private static void Shuffle4( Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) + if (Vector512.IsHardwareAccelerated) { Span temp = stackalloc byte[Vector512.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -373,7 +373,7 @@ private static void Shuffle4( } } } - else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) + else if (Vector256.IsHardwareAccelerated) { Span temp = stackalloc byte[Vector256.Count]; Shuffle.MMShuffleSpan(ref temp, control); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 4492b297ce..1676f69d1b 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -1323,7 +1323,9 @@ public static int MoveMask(Vector128 value) { // https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon Vector128 powers = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); - Vector128 masked = value & powers; + Vector128 msbMask = Vector128.Create((byte)0x80); + Vector128 normalized = AdvSimd.CompareEqual(value & msbMask, msbMask); // 0xFF or 0x00 + Vector128 masked = normalized & powers; Vector128 sum8 = AdvSimd.AddPairwiseWidening(masked); Vector128 sum16 = AdvSimd.AddPairwiseWidening(sum8); diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index e61dcb6bf9..e1c40107fe 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ 
b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. -using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; @@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers; internal static class Vector256_ #pragma warning restore SA1649 // File name should match first type name { - /// - /// Gets a value indicating whether shuffle byte operations are supported. - /// - public static bool SupportsShuffleNativeFloat - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx.IsSupported; - } - - /// - /// Gets a value indicating whether shuffle byte operations are supported. - /// - public static bool SupportsShuffleNativeByte - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx2.IsSupported; - } - /// /// Creates a new vector by selecting values from an input vector using a set of indices. /// @@ -47,15 +28,7 @@ public static bool SupportsShuffleNativeByte /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 ShuffleNative(Vector256 vector, [ConstantExpected] byte control) - { - if (Avx.IsSupported) - { - return Avx.Shuffle(vector, vector, control); - } - - ThrowUnreachableException(); - return default; - } + => Avx.Shuffle(vector, vector, control); /// /// Creates a new vector by selecting values from an input vector using a set of indices. 
@@ -73,8 +46,9 @@ public static Vector256 ShuffleNative(Vector256 vector, Vector256 @@ -508,7 +482,4 @@ public static int MoveMask(Vector256 value) int hiMask = Vector128_.MoveMask(value.GetUpper()); return loMask | (hiMask << 16); } - - [DoesNotReturn] - private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 63de5dc10a..ded47f48ee 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. -using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; @@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers; internal static class Vector512_ #pragma warning restore SA1649 // File name should match first type name { - /// - /// Gets a value indicating whether shuffle float operations are supported. - /// - public static bool SupportsShuffleNativeFloat - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx512F.IsSupported; - } - - /// - /// Gets a value indicating whether shuffle byte operations are supported. - /// - public static bool SupportsShuffleNativeByte - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx512BW.IsSupported; - } - /// /// Creates a new vector by selecting values from an input vector using the control. /// @@ -47,15 +28,7 @@ public static bool SupportsShuffleNativeByte /// The . 
[MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 ShuffleNative(Vector512 vector, [ConstantExpected] byte control) - { - if (Avx512F.IsSupported) - { - return Avx512F.Shuffle(vector, vector, control); - } - - ThrowUnreachableException(); - return default; - } + => Avx512F.Shuffle(vector, vector, control); /// /// Creates a new vector by selecting values from an input vector using a set of indices. @@ -73,8 +46,9 @@ public static Vector512 ShuffleNative(Vector512 vector, Vector512 @@ -175,7 +149,4 @@ public static Vector512 MultiplyAdd( [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 Clamp(Vector512 value, Vector512 min, Vector512 max) => Vector512.Min(Vector512.Max(value, min), max); - - [DoesNotReturn] - private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs index 5c6fb56043..a0930c75b0 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs @@ -12,7 +12,7 @@ internal static class ColorSpaceTransformUtils { public static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) { - if (Vector256_.SupportsShuffleNativeByte && tileWidth >= 16) + if (Vector256.IsHardwareAccelerated && tileWidth >= 16) { const int span = 16; Span values = stackalloc ushort[span]; From 1a63729cb9d17725913922184f8ba1884144bbfe Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 4 Jun 2025 09:56:22 +1000 Subject: [PATCH 16/20] Add Arm64 movemask --- .../Common/Helpers/Vector128Utilities.cs | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 
1676f69d1b..2228dae49a 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -1319,9 +1319,29 @@ public static int MoveMask(Vector128 value) return Sse2.MoveMask(value); } + // AdvSimd versions ported from Stack Overflow answer: + // https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon + if (AdvSimd.Arm64.IsSupported) + { + // Shift values to align each MSB to its corresponding bit in the output + Vector128 shift = Vector128.Create(-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0); + + // Mask to isolate MSBs + Vector128 msbMask = Vector128.Create((byte)0x80); + Vector128 masked = value & msbMask; + + // Shift each MSB into the correct bit position + Vector128 shifted = AdvSimd.ShiftLogical(masked.AsSByte(), shift).AsByte(); + + // Sum lanes: lower 8 go into bits 0–7, upper 8 go into bits 8–15 + byte lo = AdvSimd.Arm64.AddAcross(shifted.GetLower()).ToScalar(); + byte hi = AdvSimd.Arm64.AddAcross(shifted.GetUpper()).ToScalar(); + + return lo + (hi << 8); + } + if (AdvSimd.IsSupported) { - // https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon Vector128 powers = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); Vector128 msbMask = Vector128.Create((byte)0x80); Vector128 normalized = AdvSimd.CompareEqual(value & msbMask, msbMask); // 0xFF or 0x00 From e553807429ff282e919d478704165055c9a19465 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 4 Jun 2025 12:10:45 +1000 Subject: [PATCH 17/20] Port LosslessUtils V128 --- .../Common/Helpers/Vector128Utilities.cs | 63 ++++++ .../Common/Helpers/Vector256Utilities.cs | 4 +- .../Common/Helpers/Vector512Utilities.cs | 65 +----- .../Formats/Webp/Lossless/LosslessUtils.cs | 189 +++++++----------- 4 files changed, 144 insertions(+), 177 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs 
b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 2228dae49a..a3b8e0156e 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -126,6 +126,33 @@ public static Vector128 ShuffleHigh(Vector128 value, [ConstantExpe return Vector128.Create(value.GetLower(), Vector64.Shuffle(value.GetUpper(), indices)); } + /// + /// Shuffle 16-bit integers in the low 64 bits of using the control in . + /// Store the results in the low 64 bits of the destination, with the high 64 bits being copied from . + /// + /// The input vector containing packed 16-bit integers to shuffle. + /// The shuffle control byte. + /// + /// A vector containing the shuffled 16-bit integers in the low 64 bits, with the high 64 bits copied from . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ShuffleLow(Vector128 value, [ConstantExpected] byte control) + { + if (Sse2.IsSupported) + { + return Sse2.ShuffleLow(value, control); + } + + // Don't use InverseMMShuffle here as we want to avoid the cast. + Vector64 indices = Vector64.Create( + (short)(control & 0x3), + (short)((control >> 2) & 0x3), + (short)((control >> 4) & 0x3), + (short)((control >> 6) & 0x3)); + + return Vector128.Create(Vector64.Shuffle(value.GetLower(), indices), value.GetUpper()); + } + /// /// Creates a new vector by selecting values from an input vector using a set of indices. /// @@ -198,6 +225,42 @@ public static Vector128 ShiftLeftBytesInVector(Vector128 value, [Con return Vector128.Shuffle(value, Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) - Vector128.Create(numBytes)); } + /// + /// Shift packed 16-bit integers in left by while + /// shifting in zeros, and store the results + /// + /// The vector containing packed 16-bit integers to shift. + /// The number of bits to shift left. + /// + /// A vector containing the packed 16-bit integers shifted left by , with zeros shifted in. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ShiftLeftLogical(Vector128 value, [ConstantExpected] byte count) + { + if (Sse2.IsSupported) + { + return Sse2.ShiftLeftLogical(value, count); + } + + // Zero lanes where count >= 16 to match SSE2 + if (count >= 16) + { + return Vector128.Zero; + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.ShiftLogical(value, Vector128.Create((short)count)); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.ShiftLeft(value, count); + } + + return Vector128.ShiftLeft(value, count); + } + /// /// Right aligns elements of two source 128-bit values depending on bits in a mask. /// diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index e1c40107fe..4769df2b0b 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -46,9 +46,7 @@ public static Vector256 ShuffleNative(Vector256 vector, Vector256 diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index ded47f48ee..03ee4626cd 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -46,9 +46,7 @@ public static Vector512 ShuffleNative(Vector512 vector, Vector512 @@ -59,25 +57,7 @@ public static Vector512 ShuffleNative(Vector512 vector, Vector512The . 
[MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 ConvertToInt32RoundToEven(Vector512 vector) - { - if (Avx512F.IsSupported) - { - return Avx512F.ConvertToVector512Int32(vector); - } - - if (Avx.IsSupported) - { - Vector256 lower = Avx.ConvertToVector256Int32(vector.GetLower()); - Vector256 upper = Avx.ConvertToVector256Int32(vector.GetUpper()); - return Vector512.Create(lower, upper); - } - - Vector512 sign = vector & Vector512.Create(-0.0f); - Vector512 val_2p23_f32 = sign | Vector512.Create(8388608.0f); - - val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32; - return Vector512.ConvertToInt32(val_2p23_f32 | sign); - } + => Avx512F.ConvertToVector512Int32(vector); /// /// Rounds all values in to the nearest integer @@ -86,28 +66,11 @@ public static Vector512 ConvertToInt32RoundToEven(Vector512 vector) /// The vector [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 RoundToNearestInteger(Vector512 vector) - { - if (Avx512F.IsSupported) - { - // imm8 = 0b1000: - // imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers) - // imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions) - return Avx512F.RoundScale(vector, 0b0000_1000); - } - if (Avx.IsSupported) - { - Vector256 lower = Avx.RoundToNearestInteger(vector.GetLower()); - Vector256 upper = Avx.RoundToNearestInteger(vector.GetUpper()); - return Vector512.Create(lower, upper); - } - - Vector512 sign = vector & Vector512.Create(-0F); - Vector512 val_2p23_f32 = sign | Vector512.Create(8388608F); - - val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32; - return val_2p23_f32 | sign; - } + // imm8 = 0b1000: + // imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers) + // imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions) + => Avx512F.RoundScale(vector, 0b0000_1000); /// /// Performs a multiplication and an 
addition of the . @@ -122,21 +85,7 @@ public static Vector512 MultiplyAdd( Vector512 va, Vector512 vm0, Vector512 vm1) - { - if (Avx512F.IsSupported) - { - return Avx512F.FusedMultiplyAdd(vm0, vm1, va); - } - - if (Fma.IsSupported) - { - Vector256 lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower()); - Vector256 upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper()); - return Vector512.Create(lower, upper); - } - - return va + (vm0 * vm1); - } + => Avx512F.FusedMultiplyAdd(vm0, vm1, va); /// /// Restricts a vector between a minimum and a maximum value. diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 5287f0b753..b96525b426 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -6,6 +6,7 @@ using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Formats.Webp.Lossless; @@ -94,7 +95,7 @@ public static int PrefixEncode(int distance, ref int extraBits, ref int extraBit /// The pixel data to apply the transformation. 
public static void AddGreenToBlueAndRed(Span pixelData) { - if (Avx2.IsSupported && pixelData.Length >= 8) + if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8) { Vector256 addGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); nuint numPixels = (uint)pixelData.Length; @@ -103,8 +104,8 @@ public static void AddGreenToBlueAndRed(Span pixelData) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); - Vector256 in0g0g = Avx2.Shuffle(input, addGreenToBlueAndRedMaskAvx2); - Vector256 output = Avx2.Add(input, in0g0g); + Vector256 in0g0g = Vector256_.ShuffleNative(input, addGreenToBlueAndRedMaskAvx2); + Vector256 output = input + in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 8; } @@ -115,39 +116,17 @@ public static void AddGreenToBlueAndRed(Span pixelData) AddGreenToBlueAndRedScalar(pixelData[(int)i..]); } } - else if (Ssse3.IsSupported && pixelData.Length >= 4) - { - Vector128 addGreenToBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); - nuint numPixels = (uint)pixelData.Length; - nuint i = 0; - do - { - ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); - Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 in0g0g = Ssse3.Shuffle(input, addGreenToBlueAndRedMaskSsse3); - Vector128 output = Sse2.Add(input, in0g0g); - Unsafe.As>(ref pos) = output.AsUInt32(); - i += 4; - } - while (i <= numPixels - 4); - - if (i != numPixels) - { - AddGreenToBlueAndRedScalar(pixelData[(int)i..]); - } - } - else if (Sse2.IsSupported && pixelData.Length >= 4) + else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4) { + Vector128 addGreenToBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); nuint numPixels = 
(uint)pixelData.Length; nuint i = 0; do { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g - Vector128 b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g - Vector128 output = Sse2.Add(input.AsByte(), c.AsByte()); + Vector128 in0g0g = Vector128_.ShuffleNative(input, addGreenToBlueAndRedMask); + Vector128 output = input + in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 4; } @@ -180,7 +159,7 @@ private static void AddGreenToBlueAndRedScalar(Span pixelData) public static void SubtractGreenFromBlueAndRed(Span pixelData) { - if (Avx2.IsSupported && pixelData.Length >= 8) + if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8) { Vector256 subtractGreenFromBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); nuint numPixels = (uint)pixelData.Length; @@ -189,8 +168,8 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); - Vector256 in0g0g = Avx2.Shuffle(input, subtractGreenFromBlueAndRedMaskAvx2); - Vector256 output = Avx2.Subtract(input, in0g0g); + Vector256 in0g0g = Vector256_.ShuffleNative(input, subtractGreenFromBlueAndRedMaskAvx2); + Vector256 output = input - in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 8; } @@ -201,39 +180,17 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]); } } - else if (Ssse3.IsSupported && pixelData.Length >= 4) - { - Vector128 subtractGreenFromBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); - nuint numPixels = 
(uint)pixelData.Length; - nuint i = 0; - do - { - ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); - Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 in0g0g = Ssse3.Shuffle(input, subtractGreenFromBlueAndRedMaskSsse3); - Vector128 output = Sse2.Subtract(input, in0g0g); - Unsafe.As>(ref pos) = output.AsUInt32(); - i += 4; - } - while (i <= numPixels - 4); - - if (i != numPixels) - { - SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]); - } - } - else if (Sse2.IsSupported && pixelData.Length >= 4) + else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4) { + Vector128 subtractGreenFromBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); nuint numPixels = (uint)pixelData.Length; nuint i = 0; do { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g - Vector128 b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g - Vector128 output = Sse2.Subtract(input.AsByte(), c.AsByte()); + Vector128 in0g0g = Vector128_.ShuffleNative(input, subtractGreenFromBlueAndRedMask); + Vector128 output = input - in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 4; } @@ -412,7 +369,7 @@ public static void TransformColor(Vp8LMultipliers m, Span pixelData, int n TransformColorScalar(m, pixelData[(int)idx..], numPixels - (int)idx); } } - else if (Sse2.IsSupported && numPixels >= 4) + else if (Vector128.IsHardwareAccelerated && numPixels >= 4) { Vector128 transformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); Vector128 transformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); @@ -423,16 +380,16 @@ public static void TransformColor(Vp8LMultipliers m, Span 
pixelData, int n { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector128 input = Unsafe.As>(ref pos); - Vector128 a = Sse2.And(input.AsByte(), transformColorAlphaGreenMask); - Vector128 b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector128 e = Sse2.ShiftLeftLogical(input.AsInt16(), 8); - Vector128 f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); - Vector128 g = Sse2.ShiftRightLogical(f.AsInt32(), 16); - Vector128 h = Sse2.Add(g.AsByte(), d.AsByte()); - Vector128 i = Sse2.And(h, transformColorRedBlueMask); - Vector128 output = Sse2.Subtract(input.AsByte(), i); + Vector128 a = input.AsByte() & transformColorAlphaGreenMask; + Vector128 b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector128 e = Vector128_.ShiftLeftLogical(input.AsInt16(), 8); + Vector128 f = Vector128_.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); + Vector128 g = Vector128.ShiftRightLogical(f.AsInt32(), 16); + Vector128 h = g.AsByte() + d.AsByte(); + Vector128 i = h & transformColorRedBlueMask; + Vector128 output = input.AsByte() - i; Unsafe.As>(ref pos) = output.AsUInt32(); idx += 4; } @@ -503,7 +460,7 @@ public static void TransformColorInverse(Vp8LMultipliers m, Span pixelData TransformColorInverseScalar(m, pixelData[(int)idx..]); } } - else if (Sse2.IsSupported && pixelData.Length >= 4) + else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4) { Vector128 transformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); @@ -514,17 +471,17 @@ public static void 
TransformColorInverse(Vp8LMultipliers m, Span pixelData { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector128 input = Unsafe.As>(ref pos); - Vector128 a = Sse2.And(input.AsByte(), transformColorInverseAlphaGreenMask); - Vector128 b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector128 e = Sse2.Add(input.AsByte(), d.AsByte()); - Vector128 f = Sse2.ShiftLeftLogical(e.AsInt16(), 8); - Vector128 g = Sse2.MultiplyHigh(f, multsb2.AsInt16()); - Vector128 h = Sse2.ShiftRightLogical(g.AsInt32(), 8); - Vector128 i = Sse2.Add(h.AsByte(), f.AsByte()); - Vector128 j = Sse2.ShiftRightLogical(i.AsInt16(), 8); - Vector128 output = Sse2.Or(j.AsByte(), a); + Vector128 a = input.AsByte() & transformColorInverseAlphaGreenMask; + Vector128 b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector128 e = input.AsByte() + d.AsByte(); + Vector128 f = Vector128_.ShiftLeftLogical(e.AsInt16(), 8); + Vector128 g = Vector128_.MultiplyHigh(f, multsb2.AsInt16()); + Vector128 h = Vector128.ShiftRightLogical(g.AsInt32(), 8); + Vector128 i = h.AsByte() + f.AsByte(); + Vector128 j = Vector128.ShiftRightLogical(i.AsInt16(), 8); + Vector128 output = j.AsByte() | a; Unsafe.As>(ref pos) = output.AsUInt32(); } @@ -1401,15 +1358,15 @@ public static uint AddPixels(uint a, uint b) private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { - Vector128 c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128.Zero); - Vector128 c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128.Zero); - 
Vector128 c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128.Zero); - Vector128 v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16()); - Vector128 v2 = Sse2.Subtract(v1, c2Vec.AsInt16()); - Vector128 b = Sse2.PackUnsignedSaturate(v2, v2); - return Sse2.ConvertToUInt32(b.AsUInt32()); + Vector128 c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128.Zero); + Vector128 c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128.Zero); + Vector128 c2Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128.Zero); + Vector128 v1 = c0Vec.AsInt16() + c1Vec.AsInt16(); + Vector128 v2 = v1 - c2Vec.AsInt16(); + Vector128 b = Vector128_.PackUnsignedSaturate(v2, v2); + return b.AsUInt32().ToScalar(); } { @@ -1432,20 +1389,20 @@ private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2) private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { - Vector128 c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128.Zero); - Vector128 c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128.Zero); - Vector128 b0 = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128.Zero); - Vector128 avg = Sse2.Add(c1Vec.AsInt16(), c0Vec.AsInt16()); - Vector128 a0 = Sse2.ShiftRightLogical(avg, 1); - Vector128 a1 = Sse2.Subtract(a0, b0.AsInt16()); - Vector128 bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16()); - Vector128 a2 = Sse2.Subtract(a1, bgta); - Vector128 a3 = Sse2.ShiftRightArithmetic(a2, 1); - Vector128 a4 = Sse2.Add(a0, a3).AsInt16(); - Vector128 a5 = Sse2.PackUnsignedSaturate(a4, a4); - return Sse2.ConvertToUInt32(a5.AsUInt32()); + Vector128 c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128.Zero); + Vector128 c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128.Zero); + Vector128 b0 = 
Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128.Zero); + Vector128 avg = c1Vec.AsInt16() + c0Vec.AsInt16(); + Vector128 a0 = Vector128.ShiftRightLogical(avg, 1); + Vector128 a1 = a0 - b0.AsInt16(); + Vector128 bgta = Vector128.GreaterThan(b0.AsInt16(), a0.AsInt16()); + Vector128 a2 = a1 - bgta; + Vector128 a3 = Vector128.ShiftRightArithmetic(a2, 1); + Vector128 a4 = (a0 + a3).AsInt16(); + Vector128 a5 = Vector128_.PackUnsignedSaturate(a4, a4); + return a5.AsUInt32().ToScalar(); } { @@ -1475,23 +1432,23 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2) private static uint Select(uint a, uint b, uint c, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { fixed (short* ptr = &MemoryMarshal.GetReference(scratch)) { - Vector128 a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte(); - Vector128 b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte(); - Vector128 c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte(); - Vector128 ac0 = Sse2.SubtractSaturate(a0, c0); - Vector128 ca0 = Sse2.SubtractSaturate(c0, a0); - Vector128 bc0 = Sse2.SubtractSaturate(b0, c0); - Vector128 cb0 = Sse2.SubtractSaturate(c0, b0); - Vector128 ac = Sse2.Or(ac0, ca0); - Vector128 bc = Sse2.Or(bc0, cb0); - Vector128 pa = Sse2.UnpackLow(ac, Vector128.Zero); // |a - c| - Vector128 pb = Sse2.UnpackLow(bc, Vector128.Zero); // |b - c| - Vector128 diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16()); - Sse2.Store((ushort*)ptr, diff); + Vector128 a0 = Vector128.CreateScalar(a).AsByte(); + Vector128 b0 = Vector128.CreateScalar(b).AsByte(); + Vector128 c0 = Vector128.CreateScalar(c).AsByte(); + Vector128 ac0 = Vector128_.SubtractSaturate(a0, c0); + Vector128 ca0 = Vector128_.SubtractSaturate(c0, a0); + Vector128 bc0 = Vector128_.SubtractSaturate(b0, c0); + Vector128 cb0 = Vector128_.SubtractSaturate(c0, b0); + Vector128 ac = ac0 | ca0; + Vector128 bc = bc0 | cb0; + Vector128 pa = Vector128_.UnpackLow(ac, Vector128.Zero); // |a - c| + 
Vector128 pb = Vector128_.UnpackLow(bc, Vector128.Zero); // |b - c| + Vector128 diff = pb.AsUInt16() - pa.AsUInt16(); + diff.Store((ushort*)ptr); int paMinusPb = ptr[3] + ptr[2] + ptr[1] + ptr[0]; return (paMinusPb <= 0) ? a : b; } From 0c0748e1c77fc68a47b9b68b98e1923d3e7f3413 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 4 Jun 2025 12:24:53 +1000 Subject: [PATCH 18/20] Update LosslessUtils.cs --- src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index b96525b426..8cc9fd05b9 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -97,14 +97,14 @@ public static void AddGreenToBlueAndRed(Span pixelData) { if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8) { - Vector256 addGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); + Vector256 addGreenToBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); nuint numPixels = (uint)pixelData.Length; nuint i = 0; do { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); - Vector256 in0g0g = Vector256_.ShuffleNative(input, addGreenToBlueAndRedMaskAvx2); + Vector256 in0g0g = Vector256_.ShuffleNative(input, addGreenToBlueAndRedMask); Vector256 output = input + in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 8; @@ -161,14 +161,14 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) { if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8) { - Vector256 subtractGreenFromBlueAndRedMaskAvx2 = 
Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); + Vector256 subtractGreenFromBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); nuint numPixels = (uint)pixelData.Length; nuint i = 0; do { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); - Vector256 in0g0g = Vector256_.ShuffleNative(input, subtractGreenFromBlueAndRedMaskAvx2); + Vector256 in0g0g = Vector256_.ShuffleNative(input, subtractGreenFromBlueAndRedMask); Vector256 output = input - in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 8; From 362707343fe242abed264bba9dd6abc0095a6242 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 6 Jun 2025 14:20:28 +1000 Subject: [PATCH 19/20] Update based on feedback --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 15 +- .../Common/Helpers/Vector128Utilities.cs | 276 ++++++------------ .../Common/Helpers/Vector256Utilities.cs | 29 +- .../Webp/Lossless/ColorSpaceTransformUtils.cs | 7 +- .../Formats/Webp/Lossless/LosslessUtils.cs | 7 +- .../Formats/Webp/WebpCommonUtils.cs | 8 +- 6 files changed, 125 insertions(+), 217 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 6fef043169..ff5ea5de33 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -375,6 +375,11 @@ private static void Shuffle4( } else if (Vector256.IsHardwareAccelerated) { + // ShufflePerLane performs per-128-bit-lane shuffling using Avx2.Shuffle (vpshufb). + // MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte, + // so the shuffle will not zero elements. 
Because vpshufb uses only the low 4 bits (b[i] & 0x0F) + // for indexing within each lane, and ignores the upper bits unless bit 7 is set, + // this usage is guaranteed to remain within-lane and non-zeroing. Span temp = stackalloc byte[Vector256.Count]; Shuffle.MMShuffleSpan(ref temp, control); Vector256 mask = Unsafe.As>(ref MemoryMarshal.GetReference(temp)); @@ -391,17 +396,17 @@ private static void Shuffle4( ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector256_.ShuffleNative(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector256_.ShufflePerLane(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector256_.ShufflePerLane(Unsafe.Add(ref sourceBase, i), mask); } } } diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index a3b8e0156e..7eac4f58c4 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -47,8 +47,10 @@ public static Vector128 Average(Vector128 left, Vector128 righ return AdvSimd.FusedAddRoundedHalving(left, right); } - // Portable fallback: (a + b + 1) >> 1 - return (left + right + Vector128.Create((byte)1)) >> 1; + // Account for potential 
9th bit to ensure correct rounded result. + return Vector128.Narrow( + (Vector128.WidenLower(left) + Vector128.WidenLower(right) + Vector128.One) >> 1, + (Vector128.WidenUpper(left) + Vector128.WidenUpper(right) + Vector128.One) >> 1); } /// @@ -117,13 +119,17 @@ public static Vector128 ShuffleHigh(Vector128 value, [ConstantExpe } // Don't use InverseMMShuffle here as we want to avoid the cast. - Vector64 indices = Vector64.Create( - (short)(control & 0x3), - (short)((control >> 2) & 0x3), - (short)((control >> 4) & 0x3), - (short)((control >> 6) & 0x3)); - - return Vector128.Create(value.GetLower(), Vector64.Shuffle(value.GetUpper(), indices)); + Vector128 indices = Vector128.Create( + 0, + 1, + 2, + 3, + (short)((control & 0x3) + 4), + (short)(((control >> 2) & 0x3) + 4), + (short)(((control >> 4) & 0x3) + 4), + (short)(((control >> 6) & 0x3) + 4)); + + return Vector128.Shuffle(value, indices); } /// @@ -144,13 +150,17 @@ public static Vector128 ShuffleLow(Vector128 value, [ConstantExpec } // Don't use InverseMMShuffle here as we want to avoid the cast. 
- Vector64 indices = Vector64.Create( - (short)(control & 0x3), - (short)((control >> 2) & 0x3), - (short)((control >> 4) & 0x3), - (short)((control >> 6) & 0x3)); - - return Vector128.Create(Vector64.Shuffle(value.GetLower(), indices), value.GetUpper()); + Vector128 indices = Vector128.Create( + (short)(control & 0x3), + (short)((control >> 2) & 0x3), + (short)((control >> 4) & 0x3), + (short)((control >> 6) & 0x3), + 4, + 5, + 6, + 7); + + return Vector128.Shuffle(value, indices); } /// @@ -237,28 +247,13 @@ public static Vector128 ShiftLeftBytesInVector(Vector128 value, [Con [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 ShiftLeftLogical(Vector128 value, [ConstantExpected] byte count) { - if (Sse2.IsSupported) - { - return Sse2.ShiftLeftLogical(value, count); - } - // Zero lanes where count >= 16 to match SSE2 if (count >= 16) { return Vector128.Zero; } - if (AdvSimd.IsSupported) - { - return AdvSimd.ShiftLogical(value, Vector128.Create((short)count)); - } - - if (PackedSimd.IsSupported) - { - return PackedSimd.ShiftLeft(value, count); - } - - return Vector128.ShiftLeft(value, count); + return value << count; } /// @@ -536,6 +531,11 @@ public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector12 Vector128 prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower()); Vector128 prodHi = AdvSimd.MultiplyWideningLower(left.GetUpper(), right.GetUpper()); + if (AdvSimd.Arm64.IsSupported) + { + return AdvSimd.Arm64.AddPairwise(prodLo, prodHi); + } + Vector128 v0 = AdvSimd.AddPairwiseWidening(prodLo); Vector128 v1 = AdvSimd.AddPairwiseWidening(prodHi); @@ -587,50 +587,26 @@ public static Vector128 HorizontalAdd(Vector128 left, Vector128 even = Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8); - Vector128 odd = Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8); - Vector128 v0 = Vector128.Shuffle(right, even); - Vector128 v1 = Vector128.Shuffle(right, odd); - Vector128 v2 = Vector128.Shuffle(left, even); - Vector128 v3 = 
Vector128.Shuffle(left, odd); - - return v0 + v1 + v2 + v3; - } - - /// - /// Multiply the packed 16-bit integers in and , producing - /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result. - /// - /// - /// The first vector containing packed 16-bit integers to multiply. - /// - /// - /// The second vector containing packed 16-bit integers to multiply. - /// - /// - /// A vector containing the low 16 bits of the products of the packed 16-bit integers - /// from and . - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 MultiplyLow(Vector128 left, Vector128 right) - { - if (Sse2.IsSupported) + if (AdvSimd.IsSupported) { - return Sse2.MultiplyLow(left, right); - } + Vector128 v0 = AdvSimd.AddPairwiseWidening(left); + Vector128 v1 = AdvSimd.AddPairwiseWidening(right); - // Widen each half of the short vectors into two int vectors - (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); - (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + return Vector128.Narrow(v0, v1); + } - // Elementwise multiply: each int lane now holds the full 32-bit product - Vector128 prodLo = leftLo * rightLo; - Vector128 prodHi = leftHi * rightHi; + { + // Extract the low and high parts of the products shuffling them to form a result we can add together. + // Use out-of-bounds to zero out the unused lanes. 
+ Vector128 even = Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8); + Vector128 odd = Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8); + Vector128 v0 = Vector128.Shuffle(right, even); + Vector128 v1 = Vector128.Shuffle(right, odd); + Vector128 v2 = Vector128.Shuffle(left, even); + Vector128 v3 = Vector128.Shuffle(left, odd); - // Narrow the two int vectors back into one short vector - return Vector128.Narrow(prodLo, prodHi); + return v0 + v1 + v2 + v3; + } } /// @@ -655,20 +631,33 @@ public static Vector128 MultiplyHigh(Vector128 left, Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); - (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + if (AdvSimd.IsSupported) + { + Vector128 prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower()); + Vector128 prodHi = AdvSimd.MultiplyWideningUpper(left, right); + + prodLo >>= 16; + prodHi >>= 16; + + return Vector128.Narrow(prodLo, prodHi); + } + + { + // Widen each half of the short vectors into two int vectors + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); - // Elementwise multiply: each int lane now holds the full 32-bit product - Vector128 prodLo = leftLo * rightLo; - Vector128 prodHi = leftHi * rightHi; + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; - // Arithmetic shift right by 16 bits to extract the high word - prodLo >>= 16; - prodHi >>= 16; + // Arithmetic shift right by 16 bits to extract the high word + prodLo >>= 16; + prodHi >>= 16; - // Narrow the two int vectors back into one short vector - return Vector128.Narrow(prodLo, prodHi); + // Narrow the two int vectors back into one short vector + return Vector128.Narrow(prodLo, prodHi); + } } /// @@ -693,20 +682,33 @@ public static Vector128 MultiplyHigh(Vector128 left, Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); - (Vector128 rightLo, 
Vector128 rightHi) = Vector128.Widen(right); + if (AdvSimd.IsSupported) + { + Vector128 prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower()); + Vector128 prodHi = AdvSimd.MultiplyWideningUpper(left, right); - // Elementwise multiply: each int lane now holds the full 32-bit product - Vector128 prodLo = leftLo * rightLo; - Vector128 prodHi = leftHi * rightHi; + prodLo >>= 16; + prodHi >>= 16; - // Arithmetic shift right by 16 bits to extract the high word - prodLo >>= 16; - prodHi >>= 16; + return Vector128.Narrow(prodLo, prodHi); + } + + { + // Widen each half of the short vectors into two uint vectors + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); - // Narrow the two int vectors back into one short vector - return Vector128.Narrow(prodLo, prodHi); + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; + + // Arithmetic shift right by 16 bits to extract the high word + prodLo >>= 16; + prodHi >>= 16; + + // Narrow the two int vectors back into one short vector + return Vector128.Narrow(prodLo, prodHi); + } } /// @@ -1363,90 +1365,4 @@ public static Vector128 SubtractSaturate(Vector128 left, Vector128 // Narrow back to signed bytes return Vector128.Narrow(diffLo, diffHi); } - - /// - /// Create mask from the most significant bit of each 8-bit element in , and store the result. - /// - /// - /// The vector containing packed 8-bit integers from which to create the mask. - /// - /// - /// A 16-bit integer mask where each bit corresponds to the most significant bit of each 8-bit element - /// in . 
- /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int MoveMask(Vector128 value) - { - if (Sse2.IsSupported) - { - return Sse2.MoveMask(value); - } - - // AdvSimd versions ported from Stack Overflow answer: - // https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon - if (AdvSimd.Arm64.IsSupported) - { - // Shift values to align each MSB to its corresponding bit in the output - Vector128 shift = Vector128.Create(-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0); - - // Mask to isolate MSBs - Vector128 msbMask = Vector128.Create((byte)0x80); - Vector128 masked = value & msbMask; - - // Shift each MSB into the correct bit position - Vector128 shifted = AdvSimd.ShiftLogical(masked.AsSByte(), shift).AsByte(); - - // Sum lanes: lower 8 go into bits 0–7, upper 8 go into bits 8–15 - byte lo = AdvSimd.Arm64.AddAcross(shifted.GetLower()).ToScalar(); - byte hi = AdvSimd.Arm64.AddAcross(shifted.GetUpper()).ToScalar(); - - return lo + (hi << 8); - } - - if (AdvSimd.IsSupported) - { - Vector128 powers = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); - Vector128 msbMask = Vector128.Create((byte)0x80); - Vector128 normalized = AdvSimd.CompareEqual(value & msbMask, msbMask); // 0xFF or 0x00 - Vector128 masked = normalized & powers; - - Vector128 sum8 = AdvSimd.AddPairwiseWidening(masked); - Vector128 sum16 = AdvSimd.AddPairwiseWidening(sum8); - Vector128 sum32 = AdvSimd.AddPairwiseWidening(sum16); - - // Extract lower 8 bits of each 64-bit lane - byte lo = sum32.AsByte().GetElement(0); - byte hi = sum32.AsByte().GetElement(8); - - return (hi << 8) | lo; - } - - { - // Step 1: isolate MSBs - Vector128 msbMask = Vector128.Create((byte)0x80); - Vector128 masked = value & msbMask; - - // Step 2: shift each byte so MSB lands in bit position [0..15] - // i.e. 
convert: 0x80 → 1 << i - Vector128 bitShifts = Vector128.Create((ushort)1, 2, 4, 8, 16, 32, 64, 128); - Vector128 bitShiftsHigh = Vector128.Create(256, 512, 1024, 2048, 4096, 8192, 16384, 32768); - - // Step 3: widen to ushort - (Vector128 lo, Vector128 hi) = Vector128.Widen(masked); - - // Step 4: compare > 0 to get 0xFFFF where MSB was set - lo = Vector128.ConditionalSelect(Vector128.Equals(lo, Vector128.Zero), Vector128.Zero, bitShifts); - hi = Vector128.ConditionalSelect(Vector128.Equals(hi, Vector128.Zero), Vector128.Zero, bitShiftsHigh); - - // Step 5: bitwise OR the two halves - Vector128 maskVector = lo | hi; - - // Step 6: horizontal OR reduction via shuffles - maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)4, 5, 6, 7, 0, 1, 2, 3)); - maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)2, 3, 0, 1, 6, 7, 4, 5)); - maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)1, 0, 3, 2, 5, 4, 7, 6)); - - return maskVector.ToScalar(); - } - } } diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 4769df2b0b..14ac13dd8d 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -39,14 +39,17 @@ public static Vector256 ShuffleNative(Vector256 vector, [ConstantE /// /// The . 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 ShuffleNative(Vector256 vector, Vector256 indices) + public static Vector256 ShufflePerLane(Vector256 vector, Vector256 indices) { if (Avx2.IsSupported) { return Avx2.Shuffle(vector, indices); } - return Vector256.Shuffle(vector, indices); + Vector128 indicesLo = indices.GetLower(); + Vector128 lower = Vector128_.ShuffleNative(vector.GetLower(), indicesLo); + Vector128 upper = Vector128_.ShuffleNative(vector.GetUpper(), indicesLo); + return Vector256.Create(lower, upper); } /// @@ -458,26 +461,4 @@ public static Vector256 SubtractSaturate(Vector256 left, Vector256 - /// Create mask from the most significant bit of each 8-bit element in , and store the result. - /// - /// - /// The vector containing packed 8-bit integers from which to create the mask. - /// - /// - /// A 16-bit integer mask where each bit corresponds to the most significant bit of each 8-bit element - /// in . - /// - public static int MoveMask(Vector256 value) - { - if (Avx2.IsSupported) - { - return Avx2.MoveMask(value); - } - - int loMask = Vector128_.MoveMask(value.GetLower()); - int hiMask = Vector128_.MoveMask(value.GetUpper()); - return loMask | (hiMask << 16); - } } diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs index a0930c75b0..c701d56d3f 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs @@ -16,6 +16,9 @@ public static void CollectColorBlueTransforms(Span bgra, int stride, int t { const int span = 16; Span values = stackalloc ushort[span]; + + // These shuffle masks are safe for use with Avx2.Shuffle because all indices are within their respective 128-bit lanes (0–15 for the low mask, 16–31 for the high mask), + // and all disabled lanes are set to 0xFF to zero those bytes per the vpshufb specification. 
This guarantees lane-local shuffling with no cross-lane violations. Vector256 collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255); Vector256 collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30); Vector256 collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); @@ -33,8 +36,8 @@ public static void CollectColorBlueTransforms(Span bgra, int stride, int t nuint input1Idx = x + (span / 2); Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector256 r0 = Vector256_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask256); - Vector256 r1 = Vector256_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask256); + Vector256 r0 = Vector256_.ShufflePerLane(input0, collectColorBlueTransformsShuffleLowMask256); + Vector256 r1 = Vector256_.ShufflePerLane(input1, collectColorBlueTransformsShuffleHighMask256); Vector256 r = r0 | r1; Vector256 gb0 = input0 & collectColorBlueTransformsGreenBlueMask256; Vector256 gb1 = input1 & collectColorBlueTransformsGreenBlueMask256; diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 8cc9fd05b9..e573097e53 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -97,6 +97,9 @@ public static void AddGreenToBlueAndRed(Span pixelData) { if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8) { + // The `255` values 
disable the write for alpha (A), since 0x80 is set in the control byte (high bit set). + // Each byte index is within its respective 128-bit lane (0–15 and 16–31), so this is safe for per-lane shuffle. + // The index bytes do not have their high bits set, and vpshufb uses only the low 4 bits of each control byte, so every index addresses a byte within its own 128-bit lane, satisfying AVX2 lane rules. + Vector256 addGreenToBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); nuint numPixels = (uint)pixelData.Length; nuint i = 0; @@ -104,7 +107,7 @@ public static void AddGreenToBlueAndRed(Span pixelData) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); - Vector256 in0g0g = Vector256_.ShuffleNative(input, addGreenToBlueAndRedMask); + Vector256 in0g0g = Vector256_.ShufflePerLane(input, addGreenToBlueAndRedMask); Vector256 output = input + in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 8; @@ -168,7 +171,7 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); - Vector256 in0g0g = Vector256_.ShuffleNative(input, subtractGreenFromBlueAndRedMask); + Vector256 in0g0g = Vector256_.ShufflePerLane(input, subtractGreenFromBlueAndRedMask); Vector256 output = input - in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 8; diff --git a/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs b/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs index b08fe15f51..acfa26b4ff 100644 --- a/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs +++ b/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs @@ -44,8 +44,8 @@ public static unsafe bool CheckNonOpaque(ReadOnlySpan row) Vector256 c1 = Vector256_.PackSignedSaturate(b2, b3).AsInt16(); Vector256 d = Vector256_.PackSignedSaturate(c0, c1).AsByte(); Vector256 bits = Vector256.Equals(d, all0x80Vector256); - 
int mask = Vector256_.MoveMask(bits); - if (mask != -1) + uint mask = bits.ExtractMostSignificantBits(); + if (mask != 0xFFFF_FFFF) { return true; } @@ -138,7 +138,7 @@ private static unsafe bool IsNoneOpaque64BytesVector128(byte* src, int i) Vector128 c1 = Vector128_.PackSignedSaturate(b2, b3).AsInt16(); Vector128 d = Vector128_.PackSignedSaturate(c0, c1).AsByte(); Vector128 bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte()); - int mask = Vector128_.MoveMask(bits); + uint mask = bits.ExtractMostSignificantBits(); return mask != 0xFFFF; } @@ -153,7 +153,7 @@ private static unsafe bool IsNonOpaque32BytesVector128(byte* src, int i) Vector128 c = Vector128_.PackSignedSaturate(b0, b1).AsInt16(); Vector128 d = Vector128_.PackSignedSaturate(c, c).AsByte(); Vector128 bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte()); - int mask = Vector128_.MoveMask(bits); + uint mask = bits.ExtractMostSignificantBits(); return mask != 0xFFFF; } } From 8355353985776f06d8199b05102cc28b00f4a72f Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 10 Jun 2025 10:48:04 +1000 Subject: [PATCH 20/20] Respond to additional feedback --- .../Common/Helpers/Vector128Utilities.cs | 14 ++++---------- src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 16 ++++++++-------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 7eac4f58c4..a5d377eb90 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -86,11 +86,6 @@ public static Vector128 ShuffleNative(Vector128 vector, [ConstantE [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 ShuffleNative(Vector128 vector, [ConstantExpected] byte control) { - if (Sse2.IsSupported) - { - return Sse2.Shuffle(vector, control); - } - // Don't use InverseMMShuffle here as we want to avoid the cast. 
Vector128 indices = Vector128.Create( control & 0x3, @@ -529,17 +524,16 @@ public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector12 if (AdvSimd.IsSupported) { Vector128 prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower()); - Vector128 prodHi = AdvSimd.MultiplyWideningLower(left.GetUpper(), right.GetUpper()); + Vector128 prodHi = AdvSimd.MultiplyWideningUpper(left, right); if (AdvSimd.Arm64.IsSupported) { return AdvSimd.Arm64.AddPairwise(prodLo, prodHi); } - Vector128 v0 = AdvSimd.AddPairwiseWidening(prodLo); - Vector128 v1 = AdvSimd.AddPairwiseWidening(prodHi); - - return Vector128.Narrow(v0, v1); + Vector64 v0 = AdvSimd.AddPairwise(prodLo.GetLower(), prodLo.GetUpper()); + Vector64 v1 = AdvSimd.AddPairwise(prodHi.GetLower(), prodHi.GetUpper()); + return Vector128.Create(v0, v1); } { diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 72420a0947..7fe71588c4 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -221,10 +221,10 @@ public static void ITransformOne(Span reference, Span input, Span ref0 = Vector128.CreateScalar(Unsafe.As(ref referenceRef)).AsByte(); - Vector128 ref1 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); - Vector128 ref2 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); - Vector128 ref3 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); + Vector128 ref0 = Vector128.CreateScalar(Unsafe.ReadUnaligned(ref referenceRef)).AsByte(); + Vector128 ref1 = Vector128.CreateScalar(Unsafe.ReadUnaligned(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); + Vector128 ref2 = Vector128.CreateScalar(Unsafe.ReadUnaligned(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); + Vector128 ref3 = 
Vector128.CreateScalar(Unsafe.ReadUnaligned(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); // Convert to 16b. ref0 = Vector128_.UnpackLow(ref0, Vector128.Zero); @@ -253,10 +253,10 @@ public static void ITransformOne(Span reference, Span input, Span(ref outputRef) = output0; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; + Unsafe.WriteUnaligned(ref outputRef, output0); + Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps), output1); + Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2), output2); + Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3), output3); } else {