From 6e8def1cc87808965cd0fc5a6f141161cc02de27 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 00:11:44 +0100 Subject: [PATCH 01/12] Add sse2 version of inverse transform --- .../Formats/Webp/Lossy/LossyUtils.cs | 60 ++++-- .../Formats/Webp/Lossy/Vp8Encoding.cs | 204 +++++++++++++++++- 2 files changed, 239 insertions(+), 25 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 3064ccc030..cfac273c49 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -661,28 +661,7 @@ public static int TTransformSse41(Span inputA, Span inputB, Span transpose00 = Sse2.UnpackLow(b0, b1); - Vector128 transpose01 = Sse2.UnpackLow(b2, b3); - Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); - Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); - - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); - Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); - - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - Vector128 output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); - Vector128 output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); - Vector128 output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); - Vector128 output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -728,6 +707,43 @@ public static int TTransformSse41(Span inputA, Span inputB, Span>(ref outputRef) = result.AsInt32(); return sum[3] + sum[2] + sum[1] + sum[0]; } + + // Transpose two 4x4 16b matrices horizontally stored in registers. + public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) + { + // Transpose the two 4x4. + // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + Vector128 transpose00 = Sse2.UnpackLow(b0, b1); + Vector128 transpose01 = Sse2.UnpackLow(b2, b3); + Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); + Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); + + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); + Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); + + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 a22 b32 b03 b13 b23 b33 + output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); + output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); + output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); + output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } #endif public static void TransformTwo(Span src, Span dst, Span scratch) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 0567a0f27d..cb149bec7f 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -4,6 +4,11 @@ using System; using System.Buffers.Binary; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Formats.Webp.Lossy { @@ -60,6 +65,12 @@ internal static class Vp8Encoding public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 }; +#if SUPPORTS_RUNTIME_INTRINSICS + public static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16(); + + public static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16(); +#endif + static Vp8Encoding() { for (int i = -255; i <= 255 + 255; i++) @@ -68,12 +79,199 @@ static Vp8Encoding() } } + // Transforms (Paragraph 14.4) + // Does one or two inverse transforms. public static void ITransform(Span reference, Span input, Span dst, bool doTwo, Span scratch) { - ITransformOne(reference, input, dst, scratch); - if (doTwo) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); + // This implementation makes use of 16-bit fixed point versions of two + // multiply constants: + // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 + // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 + // + // To be able to use signed 16-bit integers, we use the following trick to + // have constants within range: + // - Associated constants are obtained by subtracting the 16-bit fixed point + // version of one: + // k = K - (1 << 16) => K = k + (1 << 16) + // K1 = 85267 => k1 = 20091 + // K2 = 35468 => k2 = -30068 + // - The multiplication of a variable by a constant become the sum of the + // variable and the multiplication of that variable by the associated + // constant: + // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x + + // Load and concatenate the transform coefficients (we'll do two inverse + // transforms in parallel). In the case of only one inverse transform, the + // second half of the vectors will just contain random value we'll never + // use nor store. + ref short inputRef = ref MemoryMarshal.GetReference(input); + var in0 = Vector128.Create(Unsafe.As(ref inputRef), 0); + var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 4)), 0); + var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 8)), 0); + var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 12)), 0); + + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + if (doTwo) + { + var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 16)), 0); + var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 20)), 0); + var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); + var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); + + in0 = Sse2.UnpackLow(in0, inb0); + in1 = Sse2.UnpackLow(in1, inb1); + in2 = Sse2.UnpackLow(in2, inb2); + in3 = Sse2.UnpackLow(in3, inb3); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Vertical pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); + Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16()); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16()); + Vector128 c3 = Sse2.Subtract(in1, in3); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3.AsInt16(), c4.AsInt16()); + + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16()); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16()); + Vector128 d3 = Sse2.Add(in1, in3); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3.AsInt16(), d4.AsInt16()); + + // Second pass. + Vector128 tmp0 = Sse2.Add(a, d); + Vector128 tmp1 = Sse2.Add(b, c); + Vector128 tmp2 = Sse2.Subtract(b, c); + Vector128 tmp3 = Sse2.Subtract(a, d); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + + // Horizontal pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + var four = Vector128.Create((short)4); + Vector128 dc = Sse2.Add(t0.AsInt16(), four); + a = Sse2.Add(dc, t2.AsInt16()); + b = Sse2.Subtract(dc, t2.AsInt16()); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); + c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); + c3 = Sse2.Subtract(t1, t3); + c4 = Sse2.Subtract(c1, c2); + c = Sse2.Add(c3.AsInt16(), c4); + + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); + d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); + d3 = Sse2.Add(t1, t3); + d4 = Sse2.Add(d1, d2); + d = Sse2.Add(d3.AsInt16(), d4); + + // Second pass. + tmp0 = Sse2.Add(a, d); + tmp1 = Sse2.Add(b, c); + tmp2 = Sse2.Subtract(b, c); + tmp3 = Sse2.Subtract(a, d); + Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + + // Add inverse transform to 'ref' and store. + // Load the reference(s). + Vector128 ref0 = Vector128.Zero; + Vector128 ref1 = Vector128.Zero; + Vector128 ref2 = Vector128.Zero; + Vector128 ref3 = Vector128.Zero; + ref byte referenceRef = ref MemoryMarshal.GetReference(reference); + if (doTwo) + { + // Load eight bytes/pixels per line. + ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); + ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); + ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); + ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); + } + else + { + // Load four bytes/pixels per line. + ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); + ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); + ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); + ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); + } + + // Convert to 16b. + ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); + ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); + ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); + ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + + // Add the inverse transform(s). + Vector128 ref0InvAdded = Sse2.Add(ref0.AsUInt16(), t0.AsUInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsUInt16(), t1.AsUInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsUInt16(), t2.AsUInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsUInt16(), t3.AsUInt16()); + + // Unsigned saturate to 8b. + ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16()); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded.AsInt16(), ref1InvAdded.AsInt16()); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded.AsInt16(), ref2InvAdded.AsInt16()); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded.AsInt16(), ref3InvAdded.AsInt16()); + + // Unsigned saturate to 8b. + if (doTwo) + { + // Store eight bytes/pixels per line. + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + Unsafe.As>(ref outputRef) = ref0; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3; + } + else + { + // Store four bytes/pixels per line. + int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); + int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); + int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); + int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); + + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + Unsafe.As(ref outputRef) = output0; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; + } + } + else +#endif + { + ITransformOne(reference, input, dst, scratch); + if (doTwo) + { + ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); + } } } From 835ecead49cd0e98b223d4e4cb9b32d11190b8b2 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 13:58:02 +0100 Subject: [PATCH 02/12] Store only eight bytes per line --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index c0f81b49f4..55fa2593c9 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -15,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy /// /// Methods for encoding a VP8 frame. /// - internal static class Vp8Encoding + internal static unsafe class Vp8Encoding { private const int KC1 = 20091 + (1 << 16); @@ -69,6 +69,8 @@ internal static class Vp8Encoding public static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16(); public static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16(); + + public static readonly Vector128 Four = Vector128.Create((short)4); #endif static Vp8Encoding() @@ -85,6 +87,7 @@ public static void ITransform(Span reference, Span input, Span reference, Span input, Span dc = Sse2.Add(t0.AsInt16(), four); + Vector128 dc = Sse2.Add(t0.AsInt16(), Four); a = Sse2.Add(dc, t2.AsInt16()); b = Sse2.Subtract(dc, t2.AsInt16()); @@ -243,11 +245,14 @@ public static void ITransform(Span reference, Span input, Span>(ref outputRef) = ref0; - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1; - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2; - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3; + // TODO: avoid pinning, if possible. + fixed (byte* dstPtr = dst) + { + Sse2.StoreScalar((long*)dstPtr, ref0.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref0.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref0.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref0.AsInt64()); + } } else { From 5968de8f779c21d46facd7b088ec8ae05ecb4a7b Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 13:58:37 +0100 Subject: [PATCH 03/12] Add sse tests for inverse transform --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 1 - .../Formats/WebP/Vp8EncodingTests.cs | 57 +++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 55fa2593c9..8f8cf7643a 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -87,7 +87,6 @@ public static void ITransform(Span reference, Span input, Span RunInverseTransformTest(); + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void InverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.AllowAll); + + [Fact] + public void InverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); +#endif + } +} From 5c0b598ece1dd8ca63664c93c01591310a98a16c Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 15:03:37 +0100 Subject: [PATCH 04/12] Fix copy paste mistake --- src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 8f8cf7643a..dab466b9a4 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -248,9 +248,9 @@ public static void ITransform(Span reference, Span input, Span Date: Thu, 11 Nov 2021 16:57:40 +0100 Subject: [PATCH 05/12] Better test case --- .../Formats/WebP/Vp8EncodingTests.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs index cd5a24d8cf..0534963897 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -24,15 +24,15 @@ private static void RunInverseTransformTest() 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129 }; - short[] input = { 177, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 177, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + short[] input = { 1, 216, -48, 0, 96, -24, -48, 24, 0, -24, 24, 0, 0, 0, 0, 0, 38, -240, -72, -24, 0, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; byte[] dst = new byte[128]; byte[] expected = { - 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 161, 160, 149, 105, 78, 127, 156, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 160, 160, 133, 85, 81, 129, 155, 167, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 156, 147, 109, 76, 85, 130, 153, 163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 152, 128, 87, 83, 88, 132, 152, 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int[] scratch = new int[16]; From abcbc4c48d6bce5543a45003742f98ccd0b7ef9d Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 17:01:56 +0100 Subject: [PATCH 06/12] Fix issue: vectors need to be short type --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index dab466b9a4..6ec191baaa 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -3,6 +3,7 @@ using System; using System.Buffers.Binary; +using System.Linq; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS @@ -145,14 +146,14 @@ public static void ITransform(Span reference, Span input, Span c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16()); Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16()); - Vector128 c3 = Sse2.Subtract(in1, in3); + Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); Vector128 c4 = Sse2.Subtract(c1, c2); Vector128 c = Sse2.Add(c3.AsInt16(), c4.AsInt16()); // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16()); Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16()); - Vector128 d3 = Sse2.Add(in1, in3); + Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); Vector128 d4 = Sse2.Add(d1, d2); Vector128 d = Sse2.Add(d3.AsInt16(), d4.AsInt16()); @@ -174,14 +175,14 @@ public static void ITransform(Span reference, Span input, Span reference, Span input, Span.Zero); // Add the inverse transform(s). - Vector128 ref0InvAdded = Sse2.Add(ref0.AsUInt16(), t0.AsUInt16()); - Vector128 ref1InvAdded = Sse2.Add(ref1.AsUInt16(), t1.AsUInt16()); - Vector128 ref2InvAdded = Sse2.Add(ref2.AsUInt16(), t2.AsUInt16()); - Vector128 ref3InvAdded = Sse2.Add(ref3.AsUInt16(), t3.AsUInt16()); + Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); // Unsigned saturate to 8b. ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16()); From 18ecb065a313601b5d81329f99677bc1357ce8d2 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 17:13:23 +0100 Subject: [PATCH 07/12] Add tests for executing only one transform --- .../Formats/WebP/Vp8EncodingTests.cs | 49 +++++++++++++++++-- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs index 0534963897..c4f8601b14 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -11,7 +11,39 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP [Trait("Format", "Webp")] public class Vp8EncodingTests { - private static void RunInverseTransformTest() + private static void RunOneInverseTransformTest() + { + // arrange + byte[] reference = + { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129 + }; + short[] input = { 1, 216, -48, 0, 96, -24, -48, 24, 0, -24, 24, 0, 0, 0, 0, 0, 38, -240, -72, -24, 0, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + byte[] dst = new byte[128]; + byte[] expected = + { + 161, 160, 149, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 160, 160, 133, 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 156, 147, 109, 76, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 152, 128, 87, 83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 + }; + int[] scratch = new int[16]; + + // act + Vp8Encoding.ITransform(reference, input, dst, false, scratch); + + // assert + Assert.True(dst.SequenceEqual(expected)); + } + + private static void RunTwoInverseTransformTest() { // arrange byte[] reference = @@ -44,14 +76,23 @@ private static void RunInverseTransformTest() } [Fact] - public void InverseTransform_Works() => RunInverseTransformTest(); + public void OneInverseTransform_Works() => RunOneInverseTransformTest(); + + [Fact] + public void TwoInverseTransform_Works() => RunTwoInverseTransformTest(); #if SUPPORTS_RUNTIME_INTRINSICS [Fact] - public void InverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.AllowAll); + public void OneInverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.AllowAll); + + [Fact] + public void OneInverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); + + [Fact] + public void TwoInverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTwoInverseTransformTest, HwIntrinsics.AllowAll); [Fact] - public void InverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); + public void TwoInverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTwoInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); #endif } } From 6e548b5e5bace5fa4c58529d616ff14438fc89bf Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 17:25:00 +0100 Subject: [PATCH 08/12] Remove unnecessary casts --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 6ec191baaa..70500566f0 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -144,18 +144,18 @@ public static void ITransform(Span reference, Span input, Span b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16()); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16()); + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3.AsInt16(), c4.AsInt16()); + Vector128 c = Sse2.Add(c3, c4); // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16()); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16()); + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3.AsInt16(), d4.AsInt16()); + Vector128 d = Sse2.Add(d3, d4); // Second pass. Vector128 tmp0 = Sse2.Add(a, d); @@ -177,14 +177,14 @@ public static void ITransform(Span reference, Span input, Span reference, Span input, Span ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); // Unsigned saturate to 8b. - ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16()); - ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded.AsInt16(), ref1InvAdded.AsInt16()); - ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded.AsInt16(), ref2InvAdded.AsInt16()); - ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded.AsInt16(), ref3InvAdded.AsInt16()); + ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); // Unsigned saturate to 8b. if (doTwo) From a201e8a1427976addf71974adfffc742cf8ab888 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 18:09:38 +0100 Subject: [PATCH 09/12] Avoid pinning --- src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 70500566f0..34a3a5f177 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -242,17 +242,14 @@ public static void ITransform(Span reference, Span input, Span>(ref outputRef) = ref0.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower(); } else { @@ -262,7 +259,6 @@ public static void ITransform(Span reference, Span input, Span(ref outputRef) = output0; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; From b7059ae23a72f62ae760f453b21d64fbd63057b0 Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Fri, 12 Nov 2021 12:58:58 +0100 Subject: [PATCH 10/12] Add [MethodImpl(InliningOptions.ShortMethod)] Co-authored-by: Anton Firszov --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index c80fd5817a..b8986f66ff 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -750,6 +750,7 @@ public static int TTransformSse41(Span inputA, Span inputB, Span b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) { // Transpose the two 4x4. From 544319e9ea8689e6f257c03e7990136bbfaad53e Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 12 Nov 2021 13:18:41 +0100 Subject: [PATCH 11/12] ITransform now always does two transforms --- src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 6 +- .../Formats/Webp/Lossy/Vp8Encoding.cs | 277 ++++++++++++------ .../Formats/WebP/Vp8EncodingTests.cs | 4 +- 3 files changed, 192 insertions(+), 95 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs index 38ed80590d..2fcea8ceea 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs @@ -329,7 +329,7 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M LossyUtils.TransformWht(dcTmp, tmp, scratch); for (n = 0; n < 16; n += 2) { - Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true, scratch); + Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), scratch); } return nz; @@ -342,7 +342,7 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span< Span scratch = it.Scratch3.AsSpan(0, 16); Vp8Encoding.FTransform(src, reference, tmp, scratch); int nz = QuantizeBlock(tmp, levels, ref dqm.Y1); - Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch); + Vp8Encoding.ITransformOne(reference, tmp, yuvOut, scratch); return nz; } @@ -375,7 +375,7 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc for (n = 0; n < 8; n += 2) { - Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true, scratch); + Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), scratch); } return nz << 16; diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 34a3a5f177..bcecdcd757 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -3,7 +3,6 @@ using System; using System.Buffers.Binary; -using System.Linq; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS @@ -16,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy /// /// Methods for encoding a VP8 frame. /// - internal static unsafe class Vp8Encoding + internal static class Vp8Encoding { private const int KC1 = 20091 + (1 << 16); @@ -83,8 +82,8 @@ static Vp8Encoding() } // Transforms (Paragraph 14.4) - // Does one or two inverse transforms. - public static void ITransform(Span reference, Span input, Span dst, bool doTwo, Span scratch) + // Does two inverse transforms. + public static void ITransform(Span reference, Span input, Span dst, Span scratch) { #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) @@ -120,23 +119,20 @@ public static void ITransform(Span reference, Span input, Span(ref Unsafe.Add(ref inputRef, 16)), 0); - var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 20)), 0); - var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); - var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); - - in0 = Sse2.UnpackLow(in0, inb0); - in1 = Sse2.UnpackLow(in1, inb1); - in2 = Sse2.UnpackLow(in2, inb2); - in3 = Sse2.UnpackLow(in3, inb3); - - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 - } + var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 16)), 0); + var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 20)), 0); + var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); + var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); + + in0 = Sse2.UnpackLow(in0, inb0); + in1 = Sse2.UnpackLow(in1, inb1); + in2 = Sse2.UnpackLow(in2, inb2); + in3 = Sse2.UnpackLow(in3, inb3); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. @@ -206,22 +202,12 @@ public static void ITransform(Span reference, Span input, Span ref2 = Vector128.Zero; Vector128 ref3 = Vector128.Zero; ref byte referenceRef = ref MemoryMarshal.GetReference(reference); - if (doTwo) - { - // Load eight bytes/pixels per line. - ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); - ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); - ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); - ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); - } - else - { - // Load four bytes/pixels per line. - ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); - ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); - ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); - ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); - } + + // Load eight bytes/pixels per line. + ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); + ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); + ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); + ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); // Convert to 16b. ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); @@ -243,72 +229,183 @@ public static void ITransform(Span reference, Span input, Span>(ref outputRef) = ref0.GetLower(); - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower(); - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower(); - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower(); - } - else - { - // Store four bytes/pixels per line. - int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); - int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); - int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); - int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); - - Unsafe.As(ref outputRef) = output0; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; - } + + // Store eight bytes/pixels per line. + Unsafe.As>(ref outputRef) = ref0.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower(); } else #endif { ITransformOne(reference, input, dst, scratch); - if (doTwo) - { - ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); - } + ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); } } public static void ITransformOne(Span reference, Span input, Span dst, Span scratch) { - int i; - Span tmp = scratch.Slice(0, 16); - for (i = 0; i < 4; i++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - // vertical pass. - int a = input[0] + input[8]; - int b = input[0] - input[8]; - int c = Mul(input[4], KC2) - Mul(input[12], KC1); - int d = Mul(input[4], KC1) + Mul(input[12], KC2); - tmp[0] = a + d; - tmp[1] = b + c; - tmp[2] = b - c; - tmp[3] = a - d; - tmp = tmp.Slice(4); - input = input.Slice(1); - } + // Load and concatenate the transform coefficients (we'll do two inverse + // transforms in parallel). In the case of only one inverse transform, the + // second half of the vectors will just contain random value we'll never + // use nor store. + ref short inputRef = ref MemoryMarshal.GetReference(input); + var in0 = Vector128.Create(Unsafe.As(ref inputRef), 0); + var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 4)), 0); + var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 8)), 0); + var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 12)), 0); - tmp = scratch; - for (i = 0; i < 4; i++) + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + + // Vertical pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); + Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); + Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3, c4); + + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); + Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3, d4); + + // Second pass. + Vector128 tmp0 = Sse2.Add(a, d); + Vector128 tmp1 = Sse2.Add(b, c); + Vector128 tmp2 = Sse2.Subtract(b, c); + Vector128 tmp3 = Sse2.Subtract(a, d); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + + // Horizontal pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + Vector128 dc = Sse2.Add(t0.AsInt16(), Four); + a = Sse2.Add(dc, t2.AsInt16()); + b = Sse2.Subtract(dc, t2.AsInt16()); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); + c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); + c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); + c4 = Sse2.Subtract(c1, c2); + c = Sse2.Add(c3, c4); + + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); + d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); + d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); + d4 = Sse2.Add(d1, d2); + d = Sse2.Add(d3, d4); + + // Second pass. + tmp0 = Sse2.Add(a, d); + tmp1 = Sse2.Add(b, c); + tmp2 = Sse2.Subtract(b, c); + tmp3 = Sse2.Subtract(a, d); + Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + + // Add inverse transform to 'ref' and store. + // Load the reference(s). + Vector128 ref0 = Vector128.Zero; + Vector128 ref1 = Vector128.Zero; + Vector128 ref2 = Vector128.Zero; + Vector128 ref3 = Vector128.Zero; + ref byte referenceRef = ref MemoryMarshal.GetReference(reference); + + // Load four bytes/pixels per line. + ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); + ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); + ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); + ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); + + // Convert to 16b. + ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); + ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); + ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); + ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + + // Add the inverse transform(s). + Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); + + // Unsigned saturate to 8b. + ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); + + // Unsigned saturate to 8b. + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + + // Store four bytes/pixels per line. + int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); + int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); + int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); + int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); + + Unsafe.As(ref outputRef) = output0; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; + } + else +#endif { - // horizontal pass. - int dc = tmp[0] + 4; - int a = dc + tmp[8]; - int b = dc - tmp[8]; - int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1); - int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2); - Store(dst, reference, 0, i, a + d); - Store(dst, reference, 1, i, b + c); - Store(dst, reference, 2, i, b - c); - Store(dst, reference, 3, i, a - d); - tmp = tmp.Slice(1); + int i; + Span tmp = scratch.Slice(0, 16); + for (i = 0; i < 4; i++) + { + // vertical pass. + int a = input[0] + input[8]; + int b = input[0] - input[8]; + int c = Mul(input[4], KC2) - Mul(input[12], KC1); + int d = Mul(input[4], KC1) + Mul(input[12], KC2); + tmp[0] = a + d; + tmp[1] = b + c; + tmp[2] = b - c; + tmp[3] = a - d; + tmp = tmp.Slice(4); + input = input.Slice(1); + } + + tmp = scratch; + for (i = 0; i < 4; i++) + { + // horizontal pass. + int dc = tmp[0] + 4; + int a = dc + tmp[8]; + int b = dc - tmp[8]; + int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1); + int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2); + Store(dst, reference, 0, i, a + d); + Store(dst, reference, 1, i, b + c); + Store(dst, reference, 2, i, b - c); + Store(dst, reference, 3, i, a - d); + tmp = tmp.Slice(1); + } } } diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs index c4f8601b14..17c9beb9b7 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -37,7 +37,7 @@ private static void RunOneInverseTransformTest() int[] scratch = new int[16]; // act - Vp8Encoding.ITransform(reference, input, dst, false, scratch); + Vp8Encoding.ITransformOne(reference, input, dst, scratch); // assert Assert.True(dst.SequenceEqual(expected)); @@ -69,7 +69,7 @@ private static void RunTwoInverseTransformTest() int[] scratch = new int[16]; // act - Vp8Encoding.ITransform(reference, input, dst, true, scratch); + Vp8Encoding.ITransform(reference, input, dst, scratch); // assert Assert.True(dst.SequenceEqual(expected)); From 5074ee6204f7c33875ee40988f1dc9bb20211a3b Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 12 Nov 2021 13:33:30 +0100 Subject: [PATCH 12/12] Refactor: extract horizontal and vertical pass into methods --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 161 +++++++----------- 1 file changed, 63 insertions(+), 98 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index bcecdcd757..aa4ab5767b 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -136,61 +136,14 @@ public static void ITransform(Span reference, Span input, Span a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); - - // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3, c4); - - // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); - - // Second pass. - Vector128 tmp0 = Sse2.Add(a, d); - Vector128 tmp1 = Sse2.Add(b, c); - Vector128 tmp2 = Sse2.Subtract(b, c); - Vector128 tmp3 = Sse2.Subtract(a, d); + InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 dc = Sse2.Add(t0.AsInt16(), Four); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); - - // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); - - // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); - - // Second pass. - tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); @@ -266,61 +219,14 @@ public static void ITransformOne(Span reference, Span input, Span a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); - - // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3, c4); - - // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); - - // Second pass. - Vector128 tmp0 = Sse2.Add(a, d); - Vector128 tmp1 = Sse2.Add(b, c); - Vector128 tmp2 = Sse2.Subtract(b, c); - Vector128 tmp3 = Sse2.Subtract(a, d); + InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 dc = Sse2.Add(t0.AsInt16(), Four); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); - - // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); - - // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); - - // Second pass. - tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); @@ -409,6 +315,65 @@ public static void ITransformOne(Span reference, Span input, Span in0, Vector128 in2, Vector128 in1, Vector128 in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3) + { + Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); + Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); + Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3, c4); + + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); + Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3, d4); + + // Second pass. + tmp0 = Sse2.Add(a, d); + tmp1 = Sse2.Add(b, c); + tmp2 = Sse2.Subtract(b, c); + tmp3 = Sse2.Subtract(a, d); + } + + private static void InverseTransformHorizontalPass(Vector128 t0, Vector128 t2, Vector128 t1, Vector128 t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3) + { + Vector128 dc = Sse2.Add(t0.AsInt16(), Four); + Vector128 a = Sse2.Add(dc, t2.AsInt16()); + Vector128 b = Sse2.Subtract(dc, t2.AsInt16()); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + Vector128 c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); + Vector128 c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3, c4); + + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + Vector128 d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); + Vector128 d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3, d4); + + // Second pass. + Vector128 tmp0 = Sse2.Add(a, d); + Vector128 tmp1 = Sse2.Add(b, c); + Vector128 tmp2 = Sse2.Subtract(b, c); + Vector128 tmp3 = Sse2.Subtract(a, d); + shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); + shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); + shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); + shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + } +#endif + public static void FTransform2(Span src, Span reference, Span output, Span output2, Span scratch) { FTransform(src, reference, output, scratch);