From f8fae7f2437b1e275b2b6dbdbbaff081c089855c Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Tue, 24 Mar 2026 16:05:19 -0700 Subject: [PATCH 1/5] rewrite BlendVariableMask when mask is created from vector --- src/coreclr/jit/rationalize.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index e32a1b13ef6b52..fd7449b6d61163 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -638,7 +638,8 @@ void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStac return; } - GenTree* op2 = node->Op(2); + GenTree* op2 = node->Op(2); + GenTree*& op3 = node->Op(3); // We're in the post-order visit and are traversing in execution order, so // everything between op2 and node will have already been rewritten to LIR @@ -655,20 +656,25 @@ void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStac if (op2->isEmbeddedMaskingCompatible(m_compiler, tgtMaskSize, tgtSimdBaseType)) { - // We are going to utilize the embedded mask, so we don't need to rewrite. However, - // we want to fixup the simdBaseType here since it simplifies lowering and allows - // both embedded broadcast and the mask to be live simultaneously. + // Make sure we had a mask to begin with. We don't want to create a mask + // solely for the purpose of embedding it. - if (tgtSimdBaseType != TYP_UNDEF) + if (!op3->OperIsHWIntrinsic() || + (op3->AsHWIntrinsic()->GetHWIntrinsicId() != NI_AVX512_ConvertVectorToMask)) { - op2->AsHWIntrinsic()->SetSimdBaseType(tgtSimdBaseType); + // We are going to utilize the embedded mask, so we don't need to rewrite. However, + // we want to fixup the simdBaseType here since it simplifies lowering and allows + // both embedded broadcast and the mask to be live simultaneously. + + if (tgtSimdBaseType != TYP_UNDEF) + { + op2->AsHWIntrinsic()->SetSimdBaseType(tgtSimdBaseType); + } + return; } - return; } } - GenTree*& op3 = node->Op(3); - if (!ShouldRewriteToNonMaskHWIntrinsic(op3)) { return; From f7c178863774ac8de9b8b464a7e818982f82384b Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 22 Apr 2026 19:03:13 -0700 Subject: [PATCH 2/5] fix 127260 --- src/coreclr/jit/rationalize.cpp | 32 ++-- .../JitBlue/Runtime_127260/Runtime_127260.cs | 149 ++++++++++++++++++ .../JIT/Regression/Regression_ro_2.csproj | 1 + 3 files changed, 167 insertions(+), 15 deletions(-) create mode 100644 src/tests/JIT/Regression/JitBlue/Runtime_127260/Runtime_127260.cs diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index fd7449b6d61163..2baf5eba536f4d 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -649,33 +649,35 @@ void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStac // variant SideEffectSet scratchSideEffects; - if (scratchSideEffects.IsLirInvariantInRange(m_compiler, op2, node)) + // If the mask was originally a vector, we don't want to create a mask solely for + // the purpose of embedding it. vpmov*2m is relatively costly compared to blendvp*. + bool isVectorToMask = op3->OperIsConvertVectorToMask(); + bool isVectorBlendCompatible = true; + + if (isVectorToMask) + { + isVectorBlendCompatible = varTypeIsFloating(simdBaseType) || varTypeIsByte(simdBaseType); + } + else if (scratchSideEffects.IsLirInvariantInRange(m_compiler, op2, node)) { unsigned tgtMaskSize = simdSize / genTypeSize(simdBaseType); var_types tgtSimdBaseType = TYP_UNDEF; if (op2->isEmbeddedMaskingCompatible(m_compiler, tgtMaskSize, tgtSimdBaseType)) { - // Make sure we had a mask to begin with. We don't want to create a mask - // solely for the purpose of embedding it. + // We are going to utilize the embedded mask, so we don't need to rewrite. However, + // we want to fixup the simdBaseType here since it simplifies lowering and allows + // both embedded broadcast and the mask to be live simultaneously. - if (!op3->OperIsHWIntrinsic() || - (op3->AsHWIntrinsic()->GetHWIntrinsicId() != NI_AVX512_ConvertVectorToMask)) + if (tgtSimdBaseType != TYP_UNDEF) { - // We are going to utilize the embedded mask, so we don't need to rewrite. However, - // we want to fixup the simdBaseType here since it simplifies lowering and allows - // both embedded broadcast and the mask to be live simultaneously. - - if (tgtSimdBaseType != TYP_UNDEF) - { - op2->AsHWIntrinsic()->SetSimdBaseType(tgtSimdBaseType); - } - return; + op2->AsHWIntrinsic()->SetSimdBaseType(tgtSimdBaseType); } + return; } } - if (!ShouldRewriteToNonMaskHWIntrinsic(op3)) + if (!isVectorBlendCompatible || !ShouldRewriteToNonMaskHWIntrinsic(op3)) { return; } diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_127260/Runtime_127260.cs b/src/tests/JIT/Regression/JitBlue/Runtime_127260/Runtime_127260.cs new file mode 100644 index 00000000000000..793fe1e8e7be98 --- /dev/null +++ b/src/tests/JIT/Regression/JitBlue/Runtime_127260/Runtime_127260.cs @@ -0,0 +1,149 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using Xunit; + +public class Runtime_127260 +{ + [ConditionalFact(typeof(Sse41), nameof(Sse41.IsSupported))] + public static void TestBlendVariable() + { + Assert.Equal(Vector128.Zero, + BlendVariableSse41Single(Vector128.Create(-1.0f), Vector128.Zero, Vector128.Create(-0.0f))); + + Assert.Equal(Vector128.Zero, + BlendVariableSse41Double(Vector128.Create(-1.0), Vector128.Zero, Vector128.Create(-0.0))); + + Assert.Equal(Vector128.Zero, + BlendVariableSse41Int8(Vector128.Create(-1), Vector128.Zero, Vector128.Create(sbyte.MinValue))); + + Assert.Equal(Vector128.Create(0x00FF), + BlendVariableSse41Int16(Vector128.Create(-1), Vector128.Zero, Vector128.Create(short.MinValue))); + + Assert.Equal(Vector128.Create(0x00FFFFFF), + BlendVariableSse41Int32(Vector128.Create(-1), Vector128.Zero, Vector128.Create(int.MinValue))); + + Assert.Equal(Vector128.Create(0x00FFFFFF_FFFFFFFF), + BlendVariableSse41Int64(Vector128.Create(-1), Vector128.Zero, Vector128.Create(long.MinValue))); + } + + [ConditionalFact(typeof(Avx512BW.VL), nameof(Avx512BW.VL.IsSupported))] + public static void TestBlendVariableMask() + { + Assert.Equal(Vector128.Zero, + BlendVariableAvx512Single(Vector128.Create(-1.0f), Vector128.Zero, Vector128.Create(-0.0f))); + + Assert.Equal(Vector128.Zero, + BlendVariableAvx512Double(Vector128.Create(-1.0), Vector128.Zero, Vector128.Create(-0.0))); + + Assert.Equal(Vector128.Zero, + BlendVariableAvx512Int8(Vector128.Create(-1), Vector128.Zero, Vector128.Create(sbyte.MinValue))); + + Assert.Equal(Vector128.Zero, + BlendVariableAvx512Int16(Vector128.Create(-1), Vector128.Zero, Vector128.Create(short.MinValue))); + + Assert.Equal(Vector128.Zero, + BlendVariableAvx512Int32(Vector128.Create(-1), Vector128.Zero, Vector128.Create(int.MinValue))); + + Assert.Equal(Vector128.Zero, + BlendVariableAvx512Int64(Vector128.Create(-1), Vector128.Zero, Vector128.Create(long.MinValue))); + } + + [ConditionalFact(typeof(Avx512BW.VL), nameof(Avx512BW.VL.IsSupported))] + public static void TestContainableMask() + { + Assert.Equal(Vector128.Zero, + AddToNegativeSingle(Vector128.Create(-1.0f), Vector128.One)); + + Assert.Equal(Vector128.Zero, + AddToNegativeDouble(Vector128.Create(-1.0), Vector128.One)); + + Assert.Equal(Vector128.Zero, + AddToNegativeInt8(Vector128.Create(-1), Vector128.One)); + + Assert.Equal(Vector128.Zero, + AddToNegativeInt16(Vector128.Create(-1), Vector128.One)); + + Assert.Equal(Vector128.Zero, + AddToNegativeInt32(Vector128.Create(-1), Vector128.One)); + + Assert.Equal(Vector128.Zero, + AddToNegativeInt64(Vector128.Create(-1), Vector128.One)); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableSse41Single(Vector128 left, Vector128 right, Vector128 mask) + => Sse41.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableSse41Double(Vector128 left, Vector128 right, Vector128 mask) + => Sse41.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableSse41Int8(Vector128 left, Vector128 right, Vector128 mask) + => Sse41.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableSse41Int16(Vector128 left, Vector128 right, Vector128 mask) + => Sse41.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableSse41Int32(Vector128 left, Vector128 right, Vector128 mask) + => Sse41.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableSse41Int64(Vector128 left, Vector128 right, Vector128 mask) + => Sse41.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableAvx512Single(Vector128 left, Vector128 right, Vector128 mask) + => Avx512F.VL.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableAvx512Double(Vector128 left, Vector128 right, Vector128 mask) + => Avx512F.VL.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableAvx512Int8(Vector128 left, Vector128 right, Vector128 mask) + => Avx512BW.VL.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableAvx512Int16(Vector128 left, Vector128 right, Vector128 mask) + => Avx512BW.VL.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableAvx512Int32(Vector128 left, Vector128 right, Vector128 mask) + => Avx512F.VL.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 BlendVariableAvx512Int64(Vector128 left, Vector128 right, Vector128 mask) + => Avx512F.VL.BlendVariable(left, right, mask); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 AddToNegativeSingle(Vector128 left, Vector128 right) + => Sse41.BlendVariable(left, left + right, left); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 AddToNegativeDouble(Vector128 left, Vector128 right) + => Sse41.BlendVariable(left, left + right, left); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 AddToNegativeInt8(Vector128 left, Vector128 right) + => Sse41.BlendVariable(left, left + right, left); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 AddToNegativeInt16(Vector128 left, Vector128 right) + => Avx512BW.VL.BlendVariable(left, left + right, left); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 AddToNegativeInt32(Vector128 left, Vector128 right) + => Avx512F.VL.BlendVariable(left, left + right, left); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 AddToNegativeInt64(Vector128 left, Vector128 right) + => Avx512F.VL.BlendVariable(left, left + right, left); +} \ No newline at end of file diff --git a/src/tests/JIT/Regression/Regression_ro_2.csproj b/src/tests/JIT/Regression/Regression_ro_2.csproj index 5f898c2b921537..04778781d23124 100644 --- a/src/tests/JIT/Regression/Regression_ro_2.csproj +++ b/src/tests/JIT/Regression/Regression_ro_2.csproj @@ -95,6 +95,7 @@ + From 17130aba2032fedc7066a292a339018031965a30 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 23 Apr 2026 13:14:45 -0700 Subject: [PATCH 3/5] allow re-typing the blend --- src/coreclr/jit/rationalize.cpp | 38 ++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index 2baf5eba536f4d..25e715ebd8132a 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -656,7 +656,42 @@ void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStac if (isVectorToMask) { - isVectorBlendCompatible = varTypeIsFloating(simdBaseType) || varTypeIsByte(simdBaseType); + // The non-mask blend instructions only come in byte (pblendvb) or floating + // (blendvp[sd]) forms. We can use the byte variant as long as we have a + // per-element mask, or we can simply use the equivalent-sized floating type. + GenTree* maskVector = op3->AsHWIntrinsic()->Op(1); + + if (!maskVector->IsVectorPerElementMask(simdBaseType, simdSize)) + { + switch (simdBaseType) + { + case TYP_SHORT: + case TYP_USHORT: + { + isVectorBlendCompatible = false; + break; + } + + case TYP_INT: + case TYP_UINT: + { + simdBaseType = TYP_FLOAT; + break; + } + + case TYP_LONG: + case TYP_ULONG: + { + simdBaseType = TYP_DOUBLE; + break; + } + + default: + { + break; + } + } + } } else if (scratchSideEffects.IsLirInvariantInRange(m_compiler, op2, node)) { @@ -702,6 +737,7 @@ void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStac intrinsic = NI_X86Base_BlendVariable; } + node->SetSimdBaseType(simdBaseType); node->ChangeHWIntrinsicId(intrinsic); } From ae2c431f7ea61d76510d6496cf34222866c05214 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 23 Apr 2026 13:17:22 -0700 Subject: [PATCH 4/5] early return --- src/coreclr/jit/rationalize.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index 25e715ebd8132a..75152fd638941c 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -652,7 +652,6 @@ void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStac // If the mask was originally a vector, we don't want to create a mask solely for // the purpose of embedding it. vpmov*2m is relatively costly compared to blendvp*. bool isVectorToMask = op3->OperIsConvertVectorToMask(); - bool isVectorBlendCompatible = true; if (isVectorToMask) { @@ -668,8 +667,7 @@ void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStac case TYP_SHORT: case TYP_USHORT: { - isVectorBlendCompatible = false; - break; + return; } case TYP_INT: @@ -712,7 +710,7 @@ void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStac } } - if (!isVectorBlendCompatible || !ShouldRewriteToNonMaskHWIntrinsic(op3)) + if (!ShouldRewriteToNonMaskHWIntrinsic(op3)) { return; } From 902ef7bf8e1e9c7418b7bbf514ef78f321a8841e Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 23 Apr 2026 13:31:11 -0700 Subject: [PATCH 5/5] formatting --- src/coreclr/jit/rationalize.cpp | 4 +--- .../JIT/Regression/JitBlue/Runtime_127260/Runtime_127260.cs | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index 75152fd638941c..d0de421541dcbc 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -651,9 +651,7 @@ void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStac // If the mask was originally a vector, we don't want to create a mask solely for // the purpose of embedding it. vpmov*2m is relatively costly compared to blendvp*. - bool isVectorToMask = op3->OperIsConvertVectorToMask(); - - if (isVectorToMask) + if (op3->OperIsConvertVectorToMask()) { // The non-mask blend instructions only come in byte (pblendvb) or floating // (blendvp[sd]) forms. We can use the byte variant as long as we have a diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_127260/Runtime_127260.cs b/src/tests/JIT/Regression/JitBlue/Runtime_127260/Runtime_127260.cs index 793fe1e8e7be98..4edb8b913c7c4a 100644 --- a/src/tests/JIT/Regression/JitBlue/Runtime_127260/Runtime_127260.cs +++ b/src/tests/JIT/Regression/JitBlue/Runtime_127260/Runtime_127260.cs @@ -1,7 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86;