From 6e8def1cc87808965cd0fc5a6f141161cc02de27 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 11 Nov 2021 00:11:44 +0100
Subject: [PATCH 01/12] Add sse2 version of inverse transform

---
 .../Formats/Webp/Lossy/LossyUtils.cs          |  60 ++++--
 .../Formats/Webp/Lossy/Vp8Encoding.cs         | 204 +++++++++++++++++-
 2 files changed, 239 insertions(+), 25 deletions(-)
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 3064ccc030..cfac273c49 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -661,28 +661,7 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
             // a20 a21 a22 a23   b20 b21 b22 b23
             // a30 a31 a32 a33   b30 b31 b32 b33
             // Transpose the two 4x4.
-            Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
-            Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
-            Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
-            Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
-
-            // a00 a10 a01 a11   a02 a12 a03 a13
-            // a20 a30 a21 a31   a22 a32 a23 a33
-            // b00 b10 b01 b11   b02 b12 b03 b13
-            // b20 b30 b21 b31   b22 b32 b23 b33
-            Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
-            Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
-            Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
-            Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
-
-            // a00 a10 a20 a30 a01 a11 a21 a31
-            // b00 b10 b20 b30 b01 b11 b21 b31
-            // a02 a12 a22 a32 a03 a13 a23 a33
-            // b02 b12 a22 b32 b03 b13 b23 b33
-            Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
-            Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
-            Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
-            Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
+            Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128<long> output0, out Vector128<long> output1, out Vector128<long> output2, out Vector128<long> output3);
 
             // a00 a10 a20 a30   b00 b10 b20 b30
             // a01 a11 a21 a31   b01 b11 b21 b31
@@ -728,6 +707,43 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
             Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
             return sum[3] + sum[2] + sum[1] + sum[0];
         }
+
+        // Transpose two 4x4 16b matrices horizontally stored in registers.
+        public static void Vp8Transpose_2_4x4_16b(Vector128<short> b0, Vector128<short> b1, Vector128<short> b2, Vector128<short> b3, out Vector128<long> output0, out Vector128<long> output1, out Vector128<long> output2, out Vector128<long> output3)
+        {
+            // Transpose the two 4x4.
+            // a00 a01 a02 a03   b00 b01 b02 b03
+            // a10 a11 a12 a13   b10 b11 b12 b13
+            // a20 a21 a22 a23   b20 b21 b22 b23
+            // a30 a31 a32 a33   b30 b31 b32 b33
+            Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
+            Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
+            Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
+            Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
+
+            // a00 a10 a01 a11   a02 a12 a03 a13
+            // a20 a30 a21 a31   a22 a32 a23 a33
+            // b00 b10 b01 b11   b02 b12 b03 b13
+            // b20 b30 b21 b31   b22 b32 b23 b33
+            Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
+            Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
+            Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
+            Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
+
+            // a00 a10 a20 a30 a01 a11 a21 a31
+            // b00 b10 b20 b30 b01 b11 b21 b31
+            // a02 a12 a22 a32 a03 a13 a23 a33
+            // b02 b12 b22 b32 b03 b13 b23 b33
+            output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
+            output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
+            output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
+            output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
+
+            // a00 a10 a20 a30   b00 b10 b20 b30
+            // a01 a11 a21 a31   b01 b11 b21 b31
+            // a02 a12 a22 a32   b02 b12 b22 b32
+            // a03 a13 a23 a33   b03 b13 b23 b33
+        }
 #endif
 
         public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scratch)
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index 0567a0f27d..cb149bec7f 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -4,6 +4,11 @@
 using System;
 using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
@@ -60,6 +65,12 @@ internal static class Vp8Encoding
 
         public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 };
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        public static readonly Vector128<short> K1 = Vector128.Create((short)20091).AsInt16();
+
+        public static readonly Vector128<short> K2 = Vector128.Create((short)-30068).AsInt16();
+#endif
+
         static Vp8Encoding()
         {
             for (int i = -255; i <= 255 + 255; i++)
@@ -68,12 +79,199 @@ static Vp8Encoding()
             }
         }
 
+        // Transforms (Paragraph 14.4)
+        // Does one or two inverse transforms.
         public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, bool doTwo, Span<int> scratch)
         {
-            ITransformOne(reference, input, dst, scratch);
-            if (doTwo)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
             {
-                ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
+                // This implementation makes use of 16-bit fixed point versions of two
+                // multiply constants:
+                //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+                //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+                //
+                // To be able to use signed 16-bit integers, we use the following trick to
+                // have constants within range:
+                // - Associated constants are obtained by subtracting the 16-bit fixed point
+                //   version of one:
+                //      k = K - (1 << 16)  =>  K = k + (1 << 16)
+                //      K1 = 85627  =>  k1 =  20091
+                //      K2 = 35468  =>  k2 = -30068
+                // - The multiplication of a variable by a constant becomes the sum of the
+                //   variable and the multiplication of that variable by the associated
+                //   constant:
+                //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
+
+                // Load and concatenate the transform coefficients (we'll do two inverse
+                // transforms in parallel). In the case of only one inverse transform, the
+                // second half of the vectors is just zero-filled padding that we'll never
+                // use nor store.
+                ref short inputRef = ref MemoryMarshal.GetReference(input);
+                var in0 = Vector128.Create(Unsafe.As<short, long>(ref inputRef), 0);
+                var in1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 4)), 0);
+                var in2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 8)), 0);
+                var in3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 12)), 0);
+
+                // a00 a10 a20 a30   x x x x
+                // a01 a11 a21 a31   x x x x
+                // a02 a12 a22 a32   x x x x
+                // a03 a13 a23 a33   x x x x
+                if (doTwo)
+                {
+                    var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0);
+                    var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0);
+                    var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
+                    var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);
+
+                    in0 = Sse2.UnpackLow(in0, inb0);
+                    in1 = Sse2.UnpackLow(in1, inb1);
+                    in2 = Sse2.UnpackLow(in2, inb2);
+                    in3 = Sse2.UnpackLow(in3, inb3);
+
+                    // a00 a10 a20 a30   b00 b10 b20 b30
+                    // a01 a11 a21 a31   b01 b11 b21 b31
+                    // a02 a12 a22 a32   b02 b12 b22 b32
+                    // a03 a13 a23 a33   b03 b13 b23 b33
+                }
+
+                // Vertical pass and subsequent transpose.
+                // First pass, c and d calculations are longer because of the "trick" multiplications.
+                Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
+                Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
+
+                // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+                Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16());
+                Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16());
+                Vector128<long> c3 = Sse2.Subtract(in1, in3);
+                Vector128<short> c4 = Sse2.Subtract(c1, c2);
+                Vector128<short> c = Sse2.Add(c3.AsInt16(), c4.AsInt16());
+
+                // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+                Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16());
+                Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16());
+                Vector128<long> d3 = Sse2.Add(in1, in3);
+                Vector128<short> d4 = Sse2.Add(d1, d2);
+                Vector128<short> d = Sse2.Add(d3.AsInt16(), d4.AsInt16());
+
+                // Second pass.
+                Vector128<short> tmp0 = Sse2.Add(a, d);
+                Vector128<short> tmp1 = Sse2.Add(b, c);
+                Vector128<short> tmp2 = Sse2.Subtract(b, c);
+                Vector128<short> tmp3 = Sse2.Subtract(a, d);
+
+                // Transpose the two 4x4.
+                LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
+
+                // Horizontal pass and subsequent transpose.
+                // First pass, c and d calculations are longer because of the "trick" multiplications.
+                var four = Vector128.Create((short)4);
+                Vector128<short> dc = Sse2.Add(t0.AsInt16(), four);
+                a = Sse2.Add(dc, t2.AsInt16());
+                b = Sse2.Subtract(dc, t2.AsInt16());
+
+                // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+                c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
+                c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
+                c3 = Sse2.Subtract(t1, t3);
+                c4 = Sse2.Subtract(c1, c2);
+                c = Sse2.Add(c3.AsInt16(), c4);
+
+                // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+                d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
+                d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
+                d3 = Sse2.Add(t1, t3);
+                d4 = Sse2.Add(d1, d2);
+                d = Sse2.Add(d3.AsInt16(), d4);
+
+                // Second pass.
+                tmp0 = Sse2.Add(a, d);
+                tmp1 = Sse2.Add(b, c);
+                tmp2 = Sse2.Subtract(b, c);
+                tmp3 = Sse2.Subtract(a, d);
+                Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
+                Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
+                Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
+                Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+
+                // Transpose the two 4x4.
+                LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
+
+                // Add inverse transform to 'ref' and store.
+                // Load the reference(s).
+                Vector128<byte> ref0 = Vector128<byte>.Zero;
+                Vector128<byte> ref1 = Vector128<byte>.Zero;
+                Vector128<byte> ref2 = Vector128<byte>.Zero;
+                Vector128<byte> ref3 = Vector128<byte>.Zero;
+                ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
+                if (doTwo)
+                {
+                    // Load eight bytes/pixels per line.
+                    ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
+                    ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
+                    ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
+                    ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
+                }
+                else
+                {
+                    // Load four bytes/pixels per line.
+                    ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
+                    ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
+                    ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
+                    ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
+                }
+
+                // Convert to 16b.
+                ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
+                ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
+                ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
+                ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
+
+                // Add the inverse transform(s).
+                Vector128<ushort> ref0InvAdded = Sse2.Add(ref0.AsUInt16(), t0.AsUInt16());
+                Vector128<ushort> ref1InvAdded = Sse2.Add(ref1.AsUInt16(), t1.AsUInt16());
+                Vector128<ushort> ref2InvAdded = Sse2.Add(ref2.AsUInt16(), t2.AsUInt16());
+                Vector128<ushort> ref3InvAdded = Sse2.Add(ref3.AsUInt16(), t3.AsUInt16());
+
+                // Unsigned saturate to 8b.
+                ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16());
+                ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded.AsInt16(), ref1InvAdded.AsInt16());
+                ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded.AsInt16(), ref2InvAdded.AsInt16());
+                ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded.AsInt16(), ref3InvAdded.AsInt16());
+
+                // Store the results.
+                if (doTwo)
+                {
+                    // Store eight bytes/pixels per line.
+                    ref byte outputRef = ref MemoryMarshal.GetReference(dst);
+                    Unsafe.As<byte, Vector128<byte>>(ref outputRef) = ref0;
+                    Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1;
+                    Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2;
+                    Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3;
+                }
+                else
+                {
+                    // Store four bytes/pixels per line.
+                    int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
+                    int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
+                    int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
+                    int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
+
+                    ref byte outputRef = ref MemoryMarshal.GetReference(dst);
+                    Unsafe.As<byte, int>(ref outputRef) = output0;
+                    Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
+                    Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
+                    Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
+                }
+            }
+            else
+#endif
+            {
+                ITransformOne(reference, input, dst, scratch);
+                if (doTwo)
+                {
+                    ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
+                }
             }
         }
 

From 835ecead49cd0e98b223d4e4cb9b32d11190b8b2 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 11 Nov 2021 13:58:02 +0100
Subject: [PATCH 02/12] Store only eight bytes per line

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs         | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index c0f81b49f4..55fa2593c9 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -15,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
     /// <summary>
     /// Methods for encoding a VP8 frame.
     /// </summary>
-    internal static class Vp8Encoding
+    internal static unsafe class Vp8Encoding
     {
         private const int KC1 = 20091 + (1 << 16);
 
@@ -69,6 +69,8 @@ internal static class Vp8Encoding
         public static readonly Vector128<short> K1 = Vector128.Create((short)20091).AsInt16();
 
         public static readonly Vector128<short> K2 = Vector128.Create((short)-30068).AsInt16();
+
+        public static readonly Vector128<short> Four = Vector128.Create((short)4);
 #endif
 
         static Vp8Encoding()
@@ -85,6 +87,7 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
+            //if (false)
             {
                 // This implementation makes use of 16-bit fixed point versions of two
                 // multiply constants:
@@ -165,8 +168,7 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
 
                 // Horizontal pass and subsequent transpose.
                 // First pass, c and d calculations are longer because of the "trick" multiplications.
-                var four = Vector128.Create((short)4);
-                Vector128<short> dc = Sse2.Add(t0.AsInt16(), four);
+                Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
                 a = Sse2.Add(dc, t2.AsInt16());
                 b = Sse2.Subtract(dc, t2.AsInt16());
 
@@ -243,11 +245,14 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 if (doTwo)
                 {
                     // Store eight bytes/pixels per line.
-                    ref byte outputRef = ref MemoryMarshal.GetReference(dst);
-                    Unsafe.As<byte, Vector128<byte>>(ref outputRef) = ref0;
-                    Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1;
-                    Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2;
-                    Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3;
+                    // TODO: avoid pinning, if possible.
+                    fixed (byte* dstPtr = dst)
+                    {
+                        Sse2.StoreScalar((long*)dstPtr, ref0.AsInt64());
+                        Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref0.AsInt64());
+                        Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref0.AsInt64());
+                        Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref0.AsInt64());
+                    }
                 }
                 else
                 {

From 5968de8f779c21d46facd7b088ec8ae05ecb4a7b Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 11 Nov 2021 13:58:37 +0100
Subject: [PATCH 03/12] Add sse tests for inverse transform

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs         |  1 -
 .../Formats/WebP/Vp8EncodingTests.cs          | 57 +++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index 55fa2593c9..8f8cf7643a 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -87,7 +87,6 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
-            //if (false)
             {
                 // This implementation makes use of 16-bit fixed point versions of two
                 // multiply constants:
diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
new file mode 100644
index 0000000000..cd5a24d8cf
--- /dev/null
+++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
@@ -0,0 +1,57 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System.Linq;
+using SixLabors.ImageSharp.Formats.Webp.Lossy;
+using SixLabors.ImageSharp.Tests.TestUtilities;
+using Xunit;
+
+namespace SixLabors.ImageSharp.Tests.Formats.WebP
+{
+    [Trait("Format", "Webp")]
+    public class Vp8EncodingTests
+    {
+        private static void RunInverseTransformTest()
+        {
+            // arrange
+            byte[] reference =
+            {
+                128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129,
+                129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128,
+                128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129,
+                129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+                129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128,
+                128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129,
+                129, 129, 129, 129, 129, 129, 129, 129
+            };
+            short[] input = { 177, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 177, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+            byte[] dst = new byte[128];
+            byte[] expected =
+            {
+                150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+            };
+            int[] scratch = new int[16];
+
+            // act
+            Vp8Encoding.ITransform(reference, input, dst, true, scratch);
+
+            // assert
+            Assert.True(dst.SequenceEqual(expected));
+        }
+
+        [Fact]
+        public void InverseTransform_Works() => RunInverseTransformTest();
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void InverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void InverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.DisableHWIntrinsic);
+#endif
+    }
+}

From 5c0b598ece1dd8ca63664c93c01591310a98a16c Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 11 Nov 2021 15:03:37 +0100
Subject: [PATCH 04/12] Fix copy paste mistake

---
 src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index 8f8cf7643a..dab466b9a4 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -248,9 +248,9 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                     fixed (byte* dstPtr = dst)
                     {
                         Sse2.StoreScalar((long*)dstPtr, ref0.AsInt64());
-                        Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref0.AsInt64());
-                        Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref0.AsInt64());
-                        Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref0.AsInt64());
+                        Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref1.AsInt64());
+                        Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref2.AsInt64());
+                        Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref3.AsInt64());
                     }
                 }
                 else

From 6039d2a8719a263d9a71a4cffb8b1325d6384947 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 11 Nov 2021 16:57:40 +0100
Subject: [PATCH 05/12] Better test case

---
 .../Formats/WebP/Vp8EncodingTests.cs                 | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
index cd5a24d8cf..0534963897 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
@@ -24,15 +24,15 @@ private static void RunInverseTransformTest()
                 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129,
                 129, 129, 129, 129, 129, 129, 129, 129
             };
-            short[] input = { 177, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 177, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+            short[] input = { 1, 216, -48, 0, 96, -24, -48, 24, 0, -24, 24, 0, 0, 0, 0, 0, 38, -240, -72, -24, 0, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
             byte[] dst = new byte[128];
             byte[] expected =
             {
-                150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0,
-                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+                161, 160, 149, 105, 78, 127, 156, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 160, 160, 133, 85, 81, 129, 155, 167, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 156, 147, 109, 76, 85, 130, 153, 163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 152, 128, 87, 83, 88, 132, 152, 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
             };
             int[] scratch = new int[16];
 

From abcbc4c48d6bce5543a45003742f98ccd0b7ef9d Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 11 Nov 2021 17:01:56 +0100
Subject: [PATCH 06/12] Fix issue: vectors need to be short type

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs           | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index dab466b9a4..6ec191baaa 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -3,6 +3,7 @@
 
 using System;
 using System.Buffers.Binary;
+using System.Linq;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
@@ -145,14 +146,14 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
                 Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16());
                 Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16());
-                Vector128<long> c3 = Sse2.Subtract(in1, in3);
+                Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
                 Vector128<short> c4 = Sse2.Subtract(c1, c2);
                 Vector128<short> c = Sse2.Add(c3.AsInt16(), c4.AsInt16());
 
                 // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
                 Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16());
                 Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16());
-                Vector128<long> d3 = Sse2.Add(in1, in3);
+                Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
                 Vector128<short> d4 = Sse2.Add(d1, d2);
                 Vector128<short> d = Sse2.Add(d3.AsInt16(), d4.AsInt16());
 
@@ -174,14 +175,14 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
                 c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
                 c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
-                c3 = Sse2.Subtract(t1, t3);
+                c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
                 c4 = Sse2.Subtract(c1, c2);
                 c = Sse2.Add(c3.AsInt16(), c4);
 
                 // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
                 d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
                 d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
-                d3 = Sse2.Add(t1, t3);
+                d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
                 d4 = Sse2.Add(d1, d2);
                 d = Sse2.Add(d3.AsInt16(), d4);
 
@@ -229,10 +230,10 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
 
                 // Add the inverse transform(s).
-                Vector128<ushort> ref0InvAdded = Sse2.Add(ref0.AsUInt16(), t0.AsUInt16());
-                Vector128<ushort> ref1InvAdded = Sse2.Add(ref1.AsUInt16(), t1.AsUInt16());
-                Vector128<ushort> ref2InvAdded = Sse2.Add(ref2.AsUInt16(), t2.AsUInt16());
-                Vector128<ushort> ref3InvAdded = Sse2.Add(ref3.AsUInt16(), t3.AsUInt16());
+                Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
+                Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
+                Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
+                Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
 
                 // Unsigned saturate to 8b.
                 ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16());

From 18ecb065a313601b5d81329f99677bc1357ce8d2 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 11 Nov 2021 17:13:23 +0100
Subject: [PATCH 07/12] Add tests for executing only one transform

---
 .../Formats/WebP/Vp8EncodingTests.cs          | 49 +++++++++++++++++--
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
index 0534963897..c4f8601b14 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
@@ -11,7 +11,39 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
     [Trait("Format", "Webp")]
     public class Vp8EncodingTests
     {
-        private static void RunInverseTransformTest()
+        private static void RunOneInverseTransformTest()
+        {
+            // arrange
+            byte[] reference =
+            {
+                128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129,
+                129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128,
+                128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129,
+                129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+                129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128,
+                128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129,
+                129, 129, 129, 129, 129, 129, 129, 129
+            };
+            short[] input = { 1, 216, -48, 0, 96, -24, -48, 24, 0, -24, 24, 0, 0, 0, 0, 0, 38, -240, -72, -24, 0, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+            byte[] dst = new byte[128];
+            byte[] expected =
+            {
+                161, 160, 149, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 160, 160, 133, 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 156, 147, 109, 76, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 152, 128, 87, 83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0
+            };
+            int[] scratch = new int[16];
+
+            // act
+            Vp8Encoding.ITransform(reference, input, dst, false, scratch);
+
+            // assert
+            Assert.True(dst.SequenceEqual(expected));
+        }
+
+        private static void RunTwoInverseTransformTest()
         {
             // arrange
             byte[] reference =
@@ -44,14 +76,23 @@ private static void RunInverseTransformTest()
         }
 
         [Fact]
-        public void InverseTransform_Works() => RunInverseTransformTest();
+        public void OneInverseTransform_Works() => RunOneInverseTransformTest();
+
+        [Fact]
+        public void TwoInverseTransform_Works() => RunTwoInverseTransformTest();
 
 #if SUPPORTS_RUNTIME_INTRINSICS
         [Fact]
-        public void InverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.AllowAll);
+        public void OneInverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void OneInverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.DisableHWIntrinsic);
+
+        [Fact]
+        public void TwoInverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTwoInverseTransformTest, HwIntrinsics.AllowAll);
 
         [Fact]
-        public void InverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.DisableHWIntrinsic);
+        public void TwoInverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTwoInverseTransformTest, HwIntrinsics.DisableHWIntrinsic);
 #endif
     }
 }

From 6e548b5e5bace5fa4c58529d616ff14438fc89bf Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 11 Nov 2021 17:25:00 +0100
Subject: [PATCH 08/12] Remove unnecessary casts

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs         | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index 6ec191baaa..70500566f0 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -144,18 +144,18 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
 
                 // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
-                Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16());
-                Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16());
+                Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
+                Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
                 Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
                 Vector128<short> c4 = Sse2.Subtract(c1, c2);
-                Vector128<short> c = Sse2.Add(c3.AsInt16(), c4.AsInt16());
+                Vector128<short> c = Sse2.Add(c3, c4);
 
                 // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
-                Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16());
-                Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16());
+                Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
+                Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
                 Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
                 Vector128<short> d4 = Sse2.Add(d1, d2);
-                Vector128<short> d = Sse2.Add(d3.AsInt16(), d4.AsInt16());
+                Vector128<short> d = Sse2.Add(d3, d4);
 
                 // Second pass.
                 Vector128<short> tmp0 = Sse2.Add(a, d);
@@ -177,14 +177,14 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
                 c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
                 c4 = Sse2.Subtract(c1, c2);
-                c = Sse2.Add(c3.AsInt16(), c4);
+                c = Sse2.Add(c3, c4);
 
                 // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
                 d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
                 d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
                 d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
                 d4 = Sse2.Add(d1, d2);
-                d = Sse2.Add(d3.AsInt16(), d4);
+                d = Sse2.Add(d3, d4);
 
                 // Second pass.
                 tmp0 = Sse2.Add(a, d);
@@ -236,10 +236,10 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
 
                 // Unsigned saturate to 8b.
-                ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16());
-                ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded.AsInt16(), ref1InvAdded.AsInt16());
-                ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded.AsInt16(), ref2InvAdded.AsInt16());
-                ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded.AsInt16(), ref3InvAdded.AsInt16());
+                ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
+                ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
+                ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
+                ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
 
                 // Unsigned saturate to 8b.
                 if (doTwo)

From a201e8a1427976addf71974adfffc742cf8ab888 Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Thu, 11 Nov 2021 18:09:38 +0100
Subject: [PATCH 09/12] Avoid pinning

---
 src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index 70500566f0..34a3a5f177 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -242,17 +242,14 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
 
                 // Unsigned saturate to 8b.
+                ref byte outputRef = ref MemoryMarshal.GetReference(dst);
                 if (doTwo)
                 {
                     // Store eight bytes/pixels per line.
-                    // TODO: avoid pinning, if possible.
-                    fixed (byte* dstPtr = dst)
-                    {
-                        Sse2.StoreScalar((long*)dstPtr, ref0.AsInt64());
-                        Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref1.AsInt64());
-                        Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref2.AsInt64());
-                        Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref3.AsInt64());
-                    }
+                    Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower();
+                    Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower();
+                    Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower();
+                    Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower();
                 }
                 else
                 {
@@ -262,7 +259,6 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                     int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
                     int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
 
-                    ref byte outputRef = ref MemoryMarshal.GetReference(dst);
                     Unsafe.As<byte, int>(ref outputRef) = output0;
                     Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
                     Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;

From b7059ae23a72f62ae760f453b21d64fbd63057b0 Mon Sep 17 00:00:00 2001
From: Brian Popow <38701097+brianpopow@users.noreply.github.com>
Date: Fri, 12 Nov 2021 12:58:58 +0100
Subject: [PATCH 10/12] Add [MethodImpl(InliningOptions.ShortMethod)]

Co-authored-by: Anton Firszov <antonfir@gmail.com>
---
 src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index c80fd5817a..b8986f66ff 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -750,6 +750,7 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
         }
 
         // Transpose two 4x4 16b matrices horizontally stored in registers.
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static void Vp8Transpose_2_4x4_16b(Vector128<short> b0, Vector128<short> b1, Vector128<short> b2, Vector128<short> b3, out Vector128<long> output0, out Vector128<long> output1, out Vector128<long> output2, out Vector128<long> output3)
         {
             // Transpose the two 4x4.

From 544319e9ea8689e6f257c03e7990136bbfaad53e Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 12 Nov 2021 13:18:41 +0100
Subject: [PATCH 11/12] ITransform now always does two transforms

---
 src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs |   6 +-
 .../Formats/Webp/Lossy/Vp8Encoding.cs         | 277 ++++++++++++------
 .../Formats/WebP/Vp8EncodingTests.cs          |   4 +-
 3 files changed, 192 insertions(+), 95 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 38ed80590d..2fcea8ceea 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -329,7 +329,7 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M
             LossyUtils.TransformWht(dcTmp, tmp, scratch);
             for (n = 0; n < 16; n += 2)
             {
-                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true, scratch);
+                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), scratch);
             }
 
             return nz;
@@ -342,7 +342,7 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<
             Span<int> scratch = it.Scratch3.AsSpan(0, 16);
             Vp8Encoding.FTransform(src, reference, tmp, scratch);
             int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
-            Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
+            Vp8Encoding.ITransformOne(reference, tmp, yuvOut, scratch);
 
             return nz;
         }
@@ -375,7 +375,7 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc
 
             for (n = 0; n < 8; n += 2)
             {
-                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true, scratch);
+                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), scratch);
             }
 
             return nz << 16;
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index 34a3a5f177..bcecdcd757 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -3,7 +3,6 @@
 
 using System;
 using System.Buffers.Binary;
-using System.Linq;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
@@ -16,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
     /// <summary>
     /// Methods for encoding a VP8 frame.
     /// </summary>
-    internal static unsafe class Vp8Encoding
+    internal static class Vp8Encoding
     {
         private const int KC1 = 20091 + (1 << 16);
 
@@ -83,8 +82,8 @@ static Vp8Encoding()
         }
 
         // Transforms (Paragraph 14.4)
-        // Does one or two inverse transforms.
-        public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, bool doTwo, Span<int> scratch)
+        // Does two inverse transforms.
+        public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
@@ -120,23 +119,20 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 // a01 a11 a21 a31   x x x x
                 // a02 a12 a22 a32   x x x x
                 // a03 a13 a23 a33   x x x x
-                if (doTwo)
-                {
-                    var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0);
-                    var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0);
-                    var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
-                    var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);
-
-                    in0 = Sse2.UnpackLow(in0, inb0);
-                    in1 = Sse2.UnpackLow(in1, inb1);
-                    in2 = Sse2.UnpackLow(in2, inb2);
-                    in3 = Sse2.UnpackLow(in3, inb3);
-
-                    // a00 a10 a20 a30   b00 b10 b20 b30
-                    // a01 a11 a21 a31   b01 b11 b21 b31
-                    // a02 a12 a22 a32   b02 b12 b22 b32
-                    // a03 a13 a23 a33   b03 b13 b23 b33
-                }
+                var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0);
+                var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0);
+                var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
+                var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);
+
+                in0 = Sse2.UnpackLow(in0, inb0);
+                in1 = Sse2.UnpackLow(in1, inb1);
+                in2 = Sse2.UnpackLow(in2, inb2);
+                in3 = Sse2.UnpackLow(in3, inb3);
+
+                // a00 a10 a20 a30   b00 b10 b20 b30
+                // a01 a11 a21 a31   b01 b11 b21 b31
+                // a02 a12 a22 a32   b02 b12 b22 b32
+                // a03 a13 a23 a33   b03 b13 b23 b33
 
                 // Vertical pass and subsequent transpose.
                 // First pass, c and d calculations are longer because of the "trick" multiplications.
@@ -206,22 +202,12 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 Vector128<byte> ref2 = Vector128<byte>.Zero;
                 Vector128<byte> ref3 = Vector128<byte>.Zero;
                 ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
-                if (doTwo)
-                {
-                    // Load eight bytes/pixels per line.
-                    ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
-                    ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
-                    ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
-                    ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
-                }
-                else
-                {
-                    // Load four bytes/pixels per line.
-                    ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
-                    ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
-                    ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
-                    ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
-                }
+
+                // Load eight bytes/pixels per line.
+                ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
+                ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
+                ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
+                ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
 
                 // Convert to 16b.
                 ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
@@ -243,72 +229,183 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
 
                 // Unsigned saturate to 8b.
                 ref byte outputRef = ref MemoryMarshal.GetReference(dst);
-                if (doTwo)
-                {
-                    // Store eight bytes/pixels per line.
-                    Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower();
-                    Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower();
-                    Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower();
-                    Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower();
-                }
-                else
-                {
-                    // Store four bytes/pixels per line.
-                    int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
-                    int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
-                    int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
-                    int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
-
-                    Unsafe.As<byte, int>(ref outputRef) = output0;
-                    Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
-                    Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
-                    Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
-                }
+
+                // Store eight bytes/pixels per line.
+                Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower();
+                Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower();
+                Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower();
+                Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower();
             }
             else
 #endif
             {
                 ITransformOne(reference, input, dst, scratch);
-                if (doTwo)
-                {
-                    ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
-                }
+                ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
             }
         }
 
         public static void ITransformOne(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
         {
-            int i;
-            Span<int> tmp = scratch.Slice(0, 16);
-            for (i = 0; i < 4; i++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
             {
-                // vertical pass.
-                int a = input[0] + input[8];
-                int b = input[0] - input[8];
-                int c = Mul(input[4], KC2) - Mul(input[12], KC1);
-                int d = Mul(input[4], KC1) + Mul(input[12], KC2);
-                tmp[0] = a + d;
-                tmp[1] = b + c;
-                tmp[2] = b - c;
-                tmp[3] = a - d;
-                tmp = tmp.Slice(4);
-                input = input.Slice(1);
-            }
+                // Load the transform coefficients of the single 4x4 block into the lower
+                // halves of the vectors. The upper halves are zeroed by Vector128.Create;
+                // they run through the same arithmetic, but their results are never
+                // stored.
+                ref short inputRef = ref MemoryMarshal.GetReference(input);
+                var in0 = Vector128.Create(Unsafe.As<short, long>(ref inputRef), 0);
+                var in1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 4)), 0);
+                var in2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 8)), 0);
+                var in3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 12)), 0);
 
-            tmp = scratch;
-            for (i = 0; i < 4; i++)
+                // a00 a10 a20 a30   x x x x
+                // a01 a11 a21 a31   x x x x
+                // a02 a12 a22 a32   x x x x
+                // a03 a13 a23 a33   x x x x
+
+                // Vertical pass and subsequent transpose.
+                // First pass, c and d calculations are longer because of the "trick" multiplications.
+                Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
+                Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
+
+                // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+                Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
+                Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
+                Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
+                Vector128<short> c4 = Sse2.Subtract(c1, c2);
+                Vector128<short> c = Sse2.Add(c3, c4);
+
+                // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+                Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
+                Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
+                Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
+                Vector128<short> d4 = Sse2.Add(d1, d2);
+                Vector128<short> d = Sse2.Add(d3, d4);
+
+                // Second pass.
+                Vector128<short> tmp0 = Sse2.Add(a, d);
+                Vector128<short> tmp1 = Sse2.Add(b, c);
+                Vector128<short> tmp2 = Sse2.Subtract(b, c);
+                Vector128<short> tmp3 = Sse2.Subtract(a, d);
+
+                // Transpose the two 4x4.
+                LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
+
+                // Horizontal pass and subsequent transpose.
+                // First pass, c and d calculations are longer because of the "trick" multiplications.
+                Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
+                a = Sse2.Add(dc, t2.AsInt16());
+                b = Sse2.Subtract(dc, t2.AsInt16());
+
+                // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+                c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
+                c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
+                c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
+                c4 = Sse2.Subtract(c1, c2);
+                c = Sse2.Add(c3, c4);
+
+                // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+                d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
+                d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
+                d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
+                d4 = Sse2.Add(d1, d2);
+                d = Sse2.Add(d3, d4);
+
+                // Second pass.
+                tmp0 = Sse2.Add(a, d);
+                tmp1 = Sse2.Add(b, c);
+                tmp2 = Sse2.Subtract(b, c);
+                tmp3 = Sse2.Subtract(a, d);
+                Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
+                Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
+                Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
+                Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+
+                // Transpose the two 4x4.
+                LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
+
+                // Add inverse transform to 'ref' and store.
+                // Load the reference(s).
+                Vector128<byte> ref0 = Vector128<byte>.Zero;
+                Vector128<byte> ref1 = Vector128<byte>.Zero;
+                Vector128<byte> ref2 = Vector128<byte>.Zero;
+                Vector128<byte> ref3 = Vector128<byte>.Zero;
+                ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
+
+                // Load four bytes/pixels per line.
+                ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
+                ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
+                ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
+                ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
+
+                // Convert to 16b.
+                ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
+                ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
+                ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
+                ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
+
+                // Add the inverse transform(s).
+                Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
+                Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
+                Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
+                Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
+
+                // Unsigned saturate to 8b.
+                ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
+                ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
+                ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
+                ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
+
+                // Get a reference to the first byte of the destination.
+                ref byte outputRef = ref MemoryMarshal.GetReference(dst);
+
+                // Store four bytes/pixels per line.
+                int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
+                int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
+                int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
+                int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
+
+                Unsafe.As<byte, int>(ref outputRef) = output0;
+                Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
+                Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
+                Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
+            }
+            else
+#endif
             {
-                // horizontal pass.
-                int dc = tmp[0] + 4;
-                int a = dc + tmp[8];
-                int b = dc - tmp[8];
-                int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1);
-                int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2);
-                Store(dst, reference, 0, i, a + d);
-                Store(dst, reference, 1, i, b + c);
-                Store(dst, reference, 2, i, b - c);
-                Store(dst, reference, 3, i, a - d);
-                tmp = tmp.Slice(1);
+                int i;
+                Span<int> tmp = scratch.Slice(0, 16);
+                for (i = 0; i < 4; i++)
+                {
+                    // vertical pass.
+                    int a = input[0] + input[8];
+                    int b = input[0] - input[8];
+                    int c = Mul(input[4], KC2) - Mul(input[12], KC1);
+                    int d = Mul(input[4], KC1) + Mul(input[12], KC2);
+                    tmp[0] = a + d;
+                    tmp[1] = b + c;
+                    tmp[2] = b - c;
+                    tmp[3] = a - d;
+                    tmp = tmp.Slice(4);
+                    input = input.Slice(1);
+                }
+
+                tmp = scratch;
+                for (i = 0; i < 4; i++)
+                {
+                    // horizontal pass.
+                    int dc = tmp[0] + 4;
+                    int a = dc + tmp[8];
+                    int b = dc - tmp[8];
+                    int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1);
+                    int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2);
+                    Store(dst, reference, 0, i, a + d);
+                    Store(dst, reference, 1, i, b + c);
+                    Store(dst, reference, 2, i, b - c);
+                    Store(dst, reference, 3, i, a - d);
+                    tmp = tmp.Slice(1);
+                }
             }
         }
 
diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
index c4f8601b14..17c9beb9b7 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
@@ -37,7 +37,7 @@ private static void RunOneInverseTransformTest()
             int[] scratch = new int[16];
 
             // act
-            Vp8Encoding.ITransform(reference, input, dst, false, scratch);
+            Vp8Encoding.ITransformOne(reference, input, dst, scratch);
 
             // assert
             Assert.True(dst.SequenceEqual(expected));
@@ -69,7 +69,7 @@ private static void RunTwoInverseTransformTest()
             int[] scratch = new int[16];
 
             // act
-            Vp8Encoding.ITransform(reference, input, dst, true, scratch);
+            Vp8Encoding.ITransform(reference, input, dst, scratch);
 
             // assert
             Assert.True(dst.SequenceEqual(expected));

From 5074ee6204f7c33875ee40988f1dc9bb20211a3b Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 12 Nov 2021 13:33:30 +0100
Subject: [PATCH 12/12] Refactor: extract horizontal and vertical pass into
 methods

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs         | 161 +++++++-----------
 1 file changed, 63 insertions(+), 98 deletions(-)

diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index bcecdcd757..aa4ab5767b 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -136,61 +136,14 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
 
                 // Vertical pass and subsequent transpose.
                 // First pass, c and d calculations are longer because of the "trick" multiplications.
-                Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
-                Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
-
-                // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
-                Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
-                Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
-                Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
-                Vector128<short> c4 = Sse2.Subtract(c1, c2);
-                Vector128<short> c = Sse2.Add(c3, c4);
-
-                // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
-                Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
-                Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
-                Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
-                Vector128<short> d4 = Sse2.Add(d1, d2);
-                Vector128<short> d = Sse2.Add(d3, d4);
-
-                // Second pass.
-                Vector128<short> tmp0 = Sse2.Add(a, d);
-                Vector128<short> tmp1 = Sse2.Add(b, c);
-                Vector128<short> tmp2 = Sse2.Subtract(b, c);
-                Vector128<short> tmp3 = Sse2.Subtract(a, d);
+                InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
 
                 // Transpose the two 4x4.
                 LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
 
                 // Horizontal pass and subsequent transpose.
                 // First pass, c and d calculations are longer because of the "trick" multiplications.
-                Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
-                a = Sse2.Add(dc, t2.AsInt16());
-                b = Sse2.Subtract(dc, t2.AsInt16());
-
-                // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
-                c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
-                c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
-                c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
-                c4 = Sse2.Subtract(c1, c2);
-                c = Sse2.Add(c3, c4);
-
-                // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
-                d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
-                d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
-                d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
-                d4 = Sse2.Add(d1, d2);
-                d = Sse2.Add(d3, d4);
-
-                // Second pass.
-                tmp0 = Sse2.Add(a, d);
-                tmp1 = Sse2.Add(b, c);
-                tmp2 = Sse2.Subtract(b, c);
-                tmp3 = Sse2.Subtract(a, d);
-                Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
-                Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
-                Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
-                Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+                InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
 
                 // Transpose the two 4x4.
                 LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
@@ -266,61 +219,14 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
 
                 // Vertical pass and subsequent transpose.
                 // First pass, c and d calculations are longer because of the "trick" multiplications.
-                Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
-                Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
-
-                // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
-                Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
-                Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
-                Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
-                Vector128<short> c4 = Sse2.Subtract(c1, c2);
-                Vector128<short> c = Sse2.Add(c3, c4);
-
-                // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
-                Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
-                Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
-                Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
-                Vector128<short> d4 = Sse2.Add(d1, d2);
-                Vector128<short> d = Sse2.Add(d3, d4);
-
-                // Second pass.
-                Vector128<short> tmp0 = Sse2.Add(a, d);
-                Vector128<short> tmp1 = Sse2.Add(b, c);
-                Vector128<short> tmp2 = Sse2.Subtract(b, c);
-                Vector128<short> tmp3 = Sse2.Subtract(a, d);
+                InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
 
                 // Transpose the two 4x4.
                 LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
 
                 // Horizontal pass and subsequent transpose.
                 // First pass, c and d calculations are longer because of the "trick" multiplications.
-                Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
-                a = Sse2.Add(dc, t2.AsInt16());
-                b = Sse2.Subtract(dc, t2.AsInt16());
-
-                // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
-                c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
-                c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
-                c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
-                c4 = Sse2.Subtract(c1, c2);
-                c = Sse2.Add(c3, c4);
-
-                // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
-                d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
-                d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
-                d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
-                d4 = Sse2.Add(d1, d2);
-                d = Sse2.Add(d3, d4);
-
-                // Second pass.
-                tmp0 = Sse2.Add(a, d);
-                tmp1 = Sse2.Add(b, c);
-                tmp2 = Sse2.Subtract(b, c);
-                tmp3 = Sse2.Subtract(a, d);
-                Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
-                Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
-                Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
-                Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+                InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
 
                 // Transpose the two 4x4.
                 LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
@@ -409,6 +315,65 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
             }
         }
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static void InverseTransformVerticalPass(Vector128<long> in0, Vector128<long> in2, Vector128<long> in1, Vector128<long> in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3)
+        {
+            Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); // Vertical pass of the inverse transform on 16-bit lanes. Note the parameter order: in0, in2, in1, in3.
+            Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); // a/b: sum and difference of the in0/in2 pair.
+
+            // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3, where the lower-case products are the MultiplyHigh calls below.
+            Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); // MultiplyHigh keeps the upper 16 bits of each signed 16x16 product.
+            Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); // NOTE(review): K1/K2 are defined outside this hunk; presumably they hold the 16-bit k1/k2 forms - confirm.
+            Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); // The "+ in1 - in3" correction term from the identity above.
+            Vector128<short> c4 = Sse2.Subtract(c1, c2);
+            Vector128<short> c = Sse2.Add(c3, c4);
+
+            // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3, built the same way as c but with the constants swapped.
+            Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
+            Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
+            Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); // The "+ in1 + in3" correction term.
+            Vector128<short> d4 = Sse2.Add(d1, d2);
+            Vector128<short> d = Sse2.Add(d3, d4);
+
+            // Second pass: combine the a/d and b/c pairs into the four outputs. No rounding or shift is applied in this pass.
+            tmp0 = Sse2.Add(a, d);
+            tmp1 = Sse2.Add(b, c);
+            tmp2 = Sse2.Subtract(b, c);
+            tmp3 = Sse2.Subtract(a, d);
+        }
+
+        private static void InverseTransformHorizontalPass(Vector128<long> t0, Vector128<long> t2, Vector128<long> t1, Vector128<long> t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3)
+        {
+            Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four); // Horizontal pass on 16-bit lanes. Four is added ahead of the >> 3 below; presumably a vector of 4s acting as rounding bias - confirm.
+            Vector128<short> a = Sse2.Add(dc, t2.AsInt16()); // Note the parameter order: t0, t2, t1, t3.
+            Vector128<short> b = Sse2.Subtract(dc, t2.AsInt16()); // a/b: sum and difference of the biased t0 and t2.
+
+            // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3, where the lower-case products are the MultiplyHigh calls below.
+            Vector128<short> c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); // MultiplyHigh keeps the upper 16 bits of each signed 16x16 product.
+            Vector128<short> c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); // NOTE(review): K1/K2 are defined outside this hunk; presumably they hold the 16-bit k1/k2 forms - confirm.
+            Vector128<short> c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); // The "+ T1 - T3" correction term from the identity above.
+            Vector128<short> c4 = Sse2.Subtract(c1, c2);
+            Vector128<short> c = Sse2.Add(c3, c4);
+
+            // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3, built the same way as c but with the constants swapped.
+            Vector128<short> d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
+            Vector128<short> d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
+            Vector128<short> d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); // The "+ T1 + T3" correction term.
+            Vector128<short> d4 = Sse2.Add(d1, d2);
+            Vector128<short> d = Sse2.Add(d3, d4);
+
+            // Second pass: combine the a/d and b/c pairs, then arithmetic-shift each result right by 3 bits.
+            Vector128<short> tmp0 = Sse2.Add(a, d);
+            Vector128<short> tmp1 = Sse2.Add(b, c);
+            Vector128<short> tmp2 = Sse2.Subtract(b, c);
+            Vector128<short> tmp3 = Sse2.Subtract(a, d);
+            shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); // Arithmetic shift keeps the sign of negative lanes.
+            shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
+            shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
+            shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+        }
+#endif
+
         public static void FTransform2(Span<byte> src, Span<byte> reference, Span<short> output, Span<short> output2, Span<int> scratch)
         {
             FTransform(src, reference, output, scratch);