From 9ed07d70b141453994bc1c59c3751b0a22d5e57d Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 13 Apr 2026 19:46:57 -0700 Subject: [PATCH 1/7] Cleanup GenTree::IsVectorPerElementMask to ensure the masks are compatible with the expected element count --- src/coreclr/jit/gentree.cpp | 122 +++++++--- src/coreclr/jit/simd.h | 43 +++- src/coreclr/jit/valuenum.cpp | 416 +++++++++++++++++++++-------------- src/coreclr/jit/valuenum.h | 9 +- 4 files changed, 390 insertions(+), 200 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 9b5101f953ab65..139fad2b939000 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -31560,57 +31560,112 @@ bool GenTree::IsInvariant() const //------------------------------------------------------------------- // IsVectorPerElementMask: returns true if this node is a vector constant per-element mask -// (every element has either all bits set or none of them). +// (every element has either all bits set or none of them) for the +// given simd size and base type. // // Arguments: // simdBaseType - the base type of the constant being checked. // simdSize - the size of the SIMD type of the intrinsic. // // Returns: -// True if this node is a vector constant per-element mask. +// True if this node is a per-element mask compatible with simdBaseType and simdSize // bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) const { #ifdef FEATURE_SIMD - if (IsCnsVec()) - { - const GenTreeVecCon* vecCon = AsVecCon(); + // This should be kept in sync with ValueNumStore::IsVectorPerElementMask - int elementCount = vecCon->ElementCount(simdSize, simdBaseType); + unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); - switch (simdBaseType) + if (OperIsConst()) + { + if (IsCnsVec()) { - case TYP_BYTE: - case TYP_UBYTE: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u8[0], elementCount); - case TYP_SHORT: - case TYP_USHORT: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u16[0], elementCount); - case TYP_INT: - case TYP_UINT: - case TYP_FLOAT: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u32[0], elementCount); - case TYP_LONG: - case TYP_ULONG: - case TYP_DOUBLE: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u64[0], elementCount); - default: - unreached(); + const GenTreeVecCon* vecCon = AsVecCon(); + return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal, simdBaseType, elementCount); + } +#if defined(FEATURE_MASKED_HW_INTRINSICS) + else if (IsCnsMsk()) + { + // For constant MASK we need to ensure that only the + // lowest elementCount bits are set. If there are more + // then we are likely in a scenario where the mask would + // get treated incorrectly. + // + // Consider for example something that expects 2 bits but + // finds 3 or more set. This ends up somewhat undefined + // behavior as we will ignoring the upper bits and this + // could lead to incorrect results. 
+ + const GenTreeMskCon* mskCon = AsMskCon(); + uint64_t mask = mskCon->gtSimdMaskVal.GetRawBits(); + return (mask & simdmask_t::GetBitMask(elementCount)) == mask; } +#endif // FEATURE_MASKED_HW_INTRINSICS + + return false; } - else if (OperIsHWIntrinsic()) + + NamedIntrinsic intrinsicId = NI_Illegal; + unsigned intrinsicSimdSize = 0; + var_types intrinsicSimdBaseType = TYP_UNKNOWN; + + if (OperIsHWIntrinsic()) { - const GenTreeHWIntrinsic* intrinsic = AsHWIntrinsic(); - const NamedIntrinsic intrinsicId = intrinsic->GetHWIntrinsicId(); + const GenTreeHWIntrinsic* intrinsic = AsHWIntrinsic(); + + intrinsicId = intrinsic->GetHWIntrinsicId(); + intrinsicSimdSize = intrinsic->GetSimdSize(); + intrinsicSimdBaseType = intrinsic->GetSimdBaseType(); if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) { - // We directly return a per-element mask - return true; + if (varTypeIsMask(TypeGet())) + { + // When producing a MASK result, we need to ensure the + // element counts line up as otherwise we end up with + // a bitmask that may be interpreted incorrectly and + // lead to bad codegen. + // + // Consider for example that we have V128 and + // so we expect 4 bits. However we are producing a + // mask for a V128 and so give 2 bits instead. + // If we produce all true we get 0b11 but the consumer + // needs 0b1111 for it to be used correctly. So the + // upper two elements are incorrectly treated as false + // + // Inversely we would expect 2-bits like 0b11 and would + // get four bits like 0b1011 instead. So we'd end up + // treating both elements as a match when the upper ulong + // only had half of it match and so shouldn't be counted. + + return elementCount == GenTreeVecCon::ElementCount(intrinsicSimdSize, intrinsicSimdBaseType); + } + else + { + assert(varTypeIsSIMD(TypeGet())); + assert(simdSize == intrinsicSimdSize); + + // When producing a SIMD result, we need for it to + // have a base type that is the same size or larger + // as what we expect. + // + // Consider for example us expecting `byte` and the + // intrinsic here produces `ushort`. In that case we + // expect every byte to be either `0x00` or `0xFF` + // and the intrinsic produces either `0x0000` or `0xFFFF` + // and so it meets this need. + // + // However, the inverse is not safe as we would expect + // `0x0000` or `0xFFFF`, but the intrinsic could produce + // `0x00FF` or `0xFF00` which fails the expectation. + + return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); + } } bool isScalar = false; - genTreeOps oper = intrinsic->GetOperForHWIntrinsicId(&isScalar); + genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); switch (oper) { @@ -31622,6 +31677,11 @@ bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) case GT_XOR_NOT: { // We are a binary bitwise operation where both inputs are per-element masks + // + // While some cases like OR could combine in ways that produce a usable mask + // there isn't any way to statically determine this for non-constants and + // the constant cases should've already been folded. 
+ return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize) && intrinsic->Op(2)->IsVectorPerElementMask(simdBaseType, simdSize); } @@ -31641,10 +31701,6 @@ bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) return false; } - else if (IsCnsMsk()) - { - return true; - } #endif // FEATURE_SIMD return false; diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index bca1a46e2d7890..43999e89edcebf 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -5,7 +5,7 @@ #define _SIMD_H_ template -static bool ElementsAreSame(T* array, size_t size) +static bool ElementsAreSame(const T* array, size_t size) { for (size_t i = 1; i < size; i++) { @@ -16,7 +16,7 @@ static bool ElementsAreSame(T* array, size_t size) } template -static bool ElementsAreAllBitsSetOrZero(T* array, size_t size) +static bool ElementsAreAllBitsSetOrZero(const T* array, size_t size) { for (size_t i = 0; i < size; i++) { @@ -384,6 +384,45 @@ typedef simd64_t simd_t; typedef simd16_t simd_t; #endif +static bool ElementsAreAllBitsSetOrZero(const simd_t* simdVal, var_types simdBaseType, unsigned elementCount) +{ + assert(simdVal != nullptr); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u8[0], elementCount); + } + + case TYP_SHORT: + case TYP_USHORT: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u16[0], elementCount); + } + + case TYP_INT: + case TYP_UINT: + case TYP_FLOAT: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u32[0], elementCount); + } + + case TYP_LONG: + case TYP_ULONG: + case TYP_DOUBLE: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u64[0], elementCount); + } + + default: + { + unreached(); + } + } +} + inline bool IsUnaryBitwiseOperation(genTreeOps oper) { return (oper == GT_LZCNT) || (oper == GT_NOT); diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 5c2884cb29659e..181912a9d12582 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -2405,53 +2405,7 @@ bool ValueNumStore::VNIsVectorNaN(var_types simdType, var_types simdBaseType, Va { assert(varTypeIsSIMD(simdType)); - simd_t vector = {}; - - switch (simdType) - { - case TYP_SIMD8: - { - simd8_t tmp = GetConstantSimd8(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD12: - { - simd12_t tmp = GetConstantSimd12(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD16: - { - simd16_t tmp = GetConstantSimd16(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - simd32_t tmp = GetConstantSimd32(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD64: - { - simd64_t tmp = GetConstantSimd64(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } -#endif // TARGET_XARCH - - default: - { - unreached(); - } - } - + simd_t vector = GetConstantSimd(valVN); uint32_t elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); for (uint32_t i = 0; i < elementCount; i++) @@ -2471,53 +2425,7 @@ bool ValueNumStore::VNIsVectorNegativeZero(var_types simdType, var_types simdBas { assert(varTypeIsSIMD(simdType)); - simd_t vector = {}; - - switch (simdType) - { - case TYP_SIMD8: - { - simd8_t tmp = GetConstantSimd8(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD12: - { - simd12_t tmp = GetConstantSimd12(valVN); - memcpy(&vector, &tmp, 
genTypeSize(simdType)); - break; - } - - case TYP_SIMD16: - { - simd16_t tmp = GetConstantSimd16(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - simd32_t tmp = GetConstantSimd32(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD64: - { - simd64_t tmp = GetConstantSimd64(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } -#endif // TARGET_XARCH - - default: - { - unreached(); - } - } - + simd_t vector = GetConstantSimd(valVN); uint32_t elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); for (uint32_t i = 0; i < elementCount; i++) @@ -4013,6 +3921,63 @@ float ValueNumStore::GetConstantSingle(ValueNum argVN) } #if defined(FEATURE_SIMD) +// Given a simd constant value number return its value. +// +simd_t ValueNumStore::GetConstantSimd(ValueNum argVN) +{ + assert(IsVNConstant(argVN)); + + simd_t vector = {}; + var_types simdType = TypeOfVN(argVN); + + switch (simdType) + { + case TYP_SIMD8: + { + simd8_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + + case TYP_SIMD12: + { + simd12_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + + case TYP_SIMD16: + { + simd16_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + +#if defined(TARGET_XARCH) + case TYP_SIMD32: + { + simd32_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + + case TYP_SIMD64: + { + simd64_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } +#endif // TARGET_XARCH + + default: + { + unreached(); + } + } + + return vector; +} + // Given a simd8 constant value number return its value as a simd8. // simd8_t ValueNumStore::GetConstantSimd8(ValueNum argVN) @@ -8444,10 +8409,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( case GT_EQ: { - NamedIntrinsic argIntrinsicId; - unsigned argSimdSize; - CorInfoType argSimdBaseJitType; - if (varTypeIsFloating(baseType)) { // Handle `(x == NaN) == false` and `(NaN == x) == false` for floating-point types @@ -8458,27 +8419,24 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( return VNZeroForType(type); } } - else if (IsVNHWIntrinsicFunc(argVN, &argIntrinsicId, &argSimdSize, &argSimdBaseJitType)) + else if (IsVectorPerElementMask(argVN, baseType, simdSize)) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask - - if (!HWIntrinsicInfo::ReturnsPerElementMask(argIntrinsicId)) - { - break; - } - - if (genTypeSize(baseType) == genTypeSize(JitType2PreciseVarType(argSimdBaseJitType))) - { - break; - } - - // Handle `(Mask == AllBitsSet) == Mask` and `(AllBitsSet == Mask) == Mask` for integral types + // Handle `Equals(PerElementMask, AllBitsSet)` and `Equals(AllBitsSet, PerElementMask)` for + // integrals ValueNum allBitsVN = VNAllBitsForType(type, simdSize); if (cnsVN == allBitsVN) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with AllBitsSet. 
+ // + // In such a case: + // * `AllBitsSet == AllBitsSet` is true and so produces `AllBitsSet` + // * `AllBitsSet == Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask + return argVN; } } @@ -8630,10 +8588,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( { var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - NamedIntrinsic argIntrinsicId; - unsigned argSimdSize; - CorInfoType argSimdBaseJitType; - if (varTypeIsFloating(baseType)) { // Handle `(x != NaN) == true` and `(NaN != x) == true` for floating-point types @@ -8643,27 +8597,23 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( return VNAllBitsForType(type, elementCount); } } - else if (IsVNHWIntrinsicFunc(argVN, &argIntrinsicId, &argSimdSize, &argSimdBaseJitType)) + else if (IsVectorPerElementMask(argVN, baseType, simdSize)) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask - - if (!HWIntrinsicInfo::ReturnsPerElementMask(argIntrinsicId)) - { - break; - } - - if (genTypeSize(baseType) != genTypeSize(JitType2PreciseVarType(argSimdBaseJitType))) - { - break; - } - // Handle `(Mask != Zero) == Mask` and `(Zero != Mask) == Mask` for integral types ValueNum zeroVN = VNZeroForType(type); if (cnsVN == zeroVN) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with Zero. + // + // In such a case: + // * `AllBitsSet != Zero` is true and so produces `AllBitsSet` + // * `Zero != Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask + return argVN; } } @@ -9291,6 +9241,151 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary( return VNForFunc(type, func, arg0VN, arg1VN, arg2VN, resultTypeVN); } +//------------------------------------------------------------------- +// IsVectorPerElementMask: returns true if the ValueNum is a vector constant per-element mask +// (every element has either all bits set or none of them) for the +// given simd size and base type. +// +// Arguments: +// vn - the value number to check +// simdBaseType - the base type of the constant being checked. +// simdSize - the size of the SIMD type of the intrinsic. +// +// Returns: +// True if vn is a per-element mask compatible with simdBaseType and simdSize +// +bool ValueNumStore::IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, unsigned simdSize) +{ + // This should be kept in sync with GenTree::IsVectorPerElementMask + + unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); + + if (IsVNConstant(vn)) + { + var_types vnType = TypeOfVN(vn); + + if (varTypeIsSIMD(vnType)) + { + simd_t simdVal = GetConstantSimd(vn); + return ElementsAreAllBitsSetOrZero(&simdVal, simdBaseType, elementCount); + } +#if defined(FEATURE_MASKED_HW_INTRINSICS) + else if (varTypeIsMask(vnType)) + { + // For constant MASK we need to ensure that only the + // lowest elementCount bits are set. If there are more + // then we are likely in a scenario where the mask would + // get treated incorrectly. + // + // Consider for example something that expects 2 bits but + // finds 3 or more set. This ends up somewhat undefined + // behavior as we will ignoring the upper bits and this + // could lead to incorrect results. 
+ + simdmask_t simdMaskVal = GetConstantSimdMask(vn); + uint64_t mask = simdMaskVal.GetRawBits(); + return (mask & simdmask_t::GetBitMask(elementCount)) == mask; + } +#endif // FEATURE_MASKED_HW_INTRINSICS + + return false; + } + + VNFuncApp funcApp; + + NamedIntrinsic intrinsicId = NI_Illegal; + unsigned intrinsicSimdSize = 0; + var_types intrinsicSimdBaseType = TYP_UNKNOWN; + + if (IsVNHWIntrinsicFunc(vn, &funcApp, &intrinsicId, &intrinsicSimdSize, &intrinsicSimdBaseType)) + { + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) + { + if (varTypeIsMask(TypeOfVN(vn))) + { + // When producing a MASK result, we need to ensure the + // element counts line up as otherwise we end up with + // a bitmask that may be interpreted incorrectly and + // lead to bad codegen. + // + // Consider for example that we have V128 and + // so we expect 4 bits. However we are producing a + // mask for a V128 and so give 2 bits instead. + // If we produce all true we get 0b11 but the consumer + // needs 0b1111 for it to be used correctly. So the + // upper two elements are incorrectly treated as false + // + // Inversely we would expect 2-bits like 0b11 and would + // get four bits like 0b1011 instead. So we'd end up + // treating both elements as a match when the upper ulong + // only had half of it match and so shouldn't be counted. + + return elementCount == GenTreeVecCon::ElementCount(intrinsicSimdSize, intrinsicSimdBaseType); + } + else + { + assert(varTypeIsSIMD(TypeOfVN(vn))); + assert(simdSize == intrinsicSimdSize); + + // When producing a SIMD result, we need for it to + // have a base type that is the same size or larger + // as what we expect. + // + // Consider for example us expecting `byte` and the + // intrinsic here produces `ushort`. In that case we + // expect every byte to be either `0x00` or `0xFF` + // and the intrinsic produces either `0x0000` or `0xFFFF` + // and so it meets this need. + // + // However, the inverse is not safe as we would expect + // `0x0000` or `0xFFFF`, but the intrinsic could produce + // `0x00FF` or `0xFF00` which fails the expectation. + + return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); + } + } + + bool isScalar = false; + genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); + + switch (oper) + { + case GT_AND: + case GT_AND_NOT: + case GT_OR: + case GT_OR_NOT: + case GT_XOR: + case GT_XOR_NOT: + { + // We are a binary bitwise operation where both inputs are per-element masks + // + // While some cases like OR could combine in ways that produce a usable mask + // there isn't any way to statically determine this for non-constants and + // the constant cases should've already been folded. 
+ + return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize) && + IsVectorPerElementMask(funcApp.m_args[1], simdBaseType, simdSize); + } + + case GT_NOT: + { + // We are an unary bitwise operation where the input is a per-element mask + return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize); + } + + default: + { + assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); + break; + } + } + + return false; + } + + return false; +} + #endif // FEATURE_HW_INTRINSICS ValueNum ValueNumStore::EvalMathFuncUnary(var_types typ, NamedIntrinsic gtMathFN, ValueNum arg0VN) @@ -10024,56 +10119,55 @@ bool ValueNumStore::IsVNBinFunc(ValueNum vn, VNFunc func, ValueNum* op1, ValueNu //---------------------------------------------------------------------------------- // IsVNHWIntrinsicFunc: A specialized version of GetVNFunc that checks if the given ValueNum -// is a HWIntrinsic. If so, it returns the intrinsicId, simdSize, and simdBaseJitType. +// is a HWIntrinsic. If so, it returns the intrinsicId, simdSize, and simdBaseType. // // Arguments: // vn - The ValueNum to check. +// funcApp - The function application // intrinsicId - The intrinsic id. // simdSize - The simd size of the intrinsic. -// simdBaseJitType - The simd base jit type for the intrinsic. +// simdBaseType - The simd base type for the intrinsic. // // Return Value: // true if the given vn is a VNFunc for a HWIntrinsic // -bool ValueNumStore::IsVNHWIntrinsicFunc(ValueNum vn, - NamedIntrinsic* intrinsicId, - unsigned* simdSize, - CorInfoType* simdBaseJitType) +bool ValueNumStore::IsVNHWIntrinsicFunc( + ValueNum vn, VNFuncApp* funcApp, NamedIntrinsic* intrinsicId, unsigned* simdSize, var_types* simdBaseType) { + assert(funcApp != nullptr); assert(intrinsicId != nullptr); + assert(simdSize != nullptr); + assert(simdBaseType != nullptr); #if defined(FEATURE_HW_INTRINSICS) - VNFuncApp funcApp; - - if (!GetVNFunc(vn, &funcApp)) + if (!GetVNFunc(vn, funcApp)) { return false; } - VNFunc outerFunc = funcApp.m_func; + VNFunc funcId = funcApp->m_func; - if ((outerFunc < VNF_HWI_FIRST) || (outerFunc > VNF_HWI_LAST)) + if ((funcId < VNF_HWI_FIRST) || (funcId > VNF_HWI_LAST)) { return false; } - assert(funcApp.m_arity != 0); - if (GetVNFunc(funcApp.m_args[funcApp.m_arity - 1], &funcApp)) - { - return false; - } - - assert(funcApp.m_func == VNF_SimdType); - assert(funcApp.m_arity != 2); + assert(funcApp->m_arity != 0); + VNFuncApp simdType; - if (!IsVNConstant(funcApp.m_args[0]) || !IsVNConstant(funcApp.m_args[1])) + if (GetVNFunc(funcApp->m_args[funcApp->m_arity - 1], &simdType)) { return false; } - *intrinsicId = NamedIntrinsic((outerFunc - VNF_HWI_FIRST) + (NI_HW_INTRINSIC_START + 1)); - *simdSize = static_cast(GetConstantInt32(funcApp.m_args[0])); - *simdBaseJitType = static_cast(GetConstantInt32(funcApp.m_args[1])); + assert(simdType.m_func == VNF_SimdType); + assert(simdType.m_arity == 2); + assert(IsVNConstant(simdType.m_args[0])); + assert(IsVNConstant(simdType.m_args[1])); + + *intrinsicId = static_cast((funcId - VNF_HWI_FIRST) + (NI_HW_INTRINSIC_START + 1)); + *simdSize = static_cast(GetConstantInt32(funcApp->m_args[0])); + *simdBaseType = static_cast(GetConstantInt32(funcApp->m_args[1])); return true; #else @@ -10510,16 +10604,16 @@ void ValueNumStore::vnDumpMemOpaque(Compiler* comp, VNFuncApp* memOpaque) #ifdef FEATURE_SIMD void ValueNumStore::vnDumpSimdType(Compiler* comp, VNFuncApp* simdType) { - assert(simdType->m_func == VNF_SimdType); // Preconditions. 
+ assert(simdType->m_func == VNF_SimdType); + assert(simdType->m_arity == 2); assert(IsVNConstant(simdType->m_args[0])); assert(IsVNConstant(simdType->m_args[1])); - int simdSize = ConstantValue(simdType->m_args[0]); - CorInfoType baseJitType = (CorInfoType)ConstantValue(simdType->m_args[1]); + int simdSize = ConstantValue(simdType->m_args[0]); + var_types simdBaseType = static_cast(ConstantValue(simdType->m_args[1])); printf("%s(simd%d, %s)", VNFuncName(simdType->m_func), simdSize, - baseJitType == CORINFO_TYPE_UNDEF ? varTypeName(TYP_UNDEF) - : varTypeName(JitType2PreciseVarType(baseJitType))); + (simdBaseType == TYP_UNDEF) ? varTypeName(TYP_UNDEF) : varTypeName(simdBaseType)); } #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/valuenum.h b/src/coreclr/jit/valuenum.h index 8c75e17935a485..d2e22aadaf0475 100644 --- a/src/coreclr/jit/valuenum.h +++ b/src/coreclr/jit/valuenum.h @@ -387,6 +387,7 @@ class ValueNumStore float GetConstantSingle(ValueNum argVN); #if defined(FEATURE_SIMD) + simd_t GetConstantSimd(ValueNum argVN); simd8_t GetConstantSimd8(ValueNum argVN); simd12_t GetConstantSimd12(ValueNum argVN); simd16_t GetConstantSimd16(ValueNum argVN); @@ -1323,6 +1324,8 @@ class ValueNumStore ValueNum arg1VN, ValueNum arg2VN, ValueNum resultTypeVN); + + bool IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, unsigned simdSize); #endif // FEATURE_HW_INTRINSICS // Returns "true" iff "vn" represents a function application. @@ -1336,10 +1339,8 @@ class ValueNumStore bool IsVNBinFunc(ValueNum vn, VNFunc func, ValueNum* op1 = nullptr, ValueNum* op2 = nullptr); // Returns "true" iff "vn" is a function application for a HWIntrinsic - bool IsVNHWIntrinsicFunc(ValueNum vn, - NamedIntrinsic* intrinsicId, - unsigned* simdSize, - CorInfoType* simdBaseJitType); + bool IsVNHWIntrinsicFunc( + ValueNum vn, VNFuncApp* funcApp, NamedIntrinsic* intrinsicId, unsigned* simdSize, var_types* simdBaseType); // Returns "true" iff "vn" is a function application of the form "func(op, cns)" // the cns can be on the left side if the function is commutative. 
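Aside: the constant-mask compatibility check that patch 1 introduces (in both GenTree::IsVectorPerElementMask and ValueNumStore::IsVectorPerElementMask) reduces to "no bits may be set above the expected lane count". Below is a minimal standalone C++ sketch of that predicate; GetBitMask and IsCompatibleMask here are local stand-ins written for this illustration (GetBitMask mirrors what simdmask_t::GetBitMask is used for in the patch), not the JIT's actual helpers. It reproduces the 0b11-versus-0b1011 scenario called out in the comments:

    #include <cassert>
    #include <cstdint>

    // Local stand-in: the low `elementCount` bits set, everything above clear.
    static uint64_t GetBitMask(unsigned elementCount)
    {
        assert((elementCount >= 1) && (elementCount <= 64));
        return (elementCount == 64) ? ~0ull : ((1ull << elementCount) - 1);
    }

    // A constant mask is usable for `elementCount` lanes only if masking off
    // the upper bits is a no-op, i.e. no bit above the lane count is set.
    static bool IsCompatibleMask(uint64_t mask, unsigned elementCount)
    {
        return (mask & GetBitMask(elementCount)) == mask;
    }

    int main()
    {
        assert(IsCompatibleMask(0b0011, 2));  // 2 lanes, both true: ok
        assert(IsCompatibleMask(0b1011, 4));  // 4 lanes: ok
        assert(!IsCompatibleMask(0b1011, 2)); // bit 3 set above 2 lanes: rejected
        return 0;
    }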
From 7a6e97321b531faaf6fd36ffb10da04479a7c529 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 13 Apr 2026 20:09:28 -0700 Subject: [PATCH 2/7] Replace some ReturnsPerElementMask(id) checks with node->IsVectorPerElementMask(baseType, simdSize) --- src/coreclr/jit/gentree.cpp | 49 +++++++++++++++++++++++++------------ src/coreclr/jit/morph.cpp | 44 +++++++++++++++++++++------------ 2 files changed, 63 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 139fad2b939000..578e03f5fd34f5 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -33058,24 +33058,24 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } } - else if (otherNode->OperIsHWIntrinsic()) + else if (otherNode->IsVectorPerElementMask(simdBaseType, simdSize)) { - GenTreeHWIntrinsic* otherIntrinsic = otherNode->AsHWIntrinsic(); - NamedIntrinsic otherIntrinsicId = otherIntrinsic->GetHWIntrinsicId(); - - if (HWIntrinsicInfo::ReturnsPerElementMask(otherIntrinsicId) && - (genTypeSize(simdBaseType) == genTypeSize(otherIntrinsic->GetSimdBaseType()))) + // Handle `Equals(PerElementMask, AllBitsSet)` and `Equals(AllBitsSet, PerElementMask)` for + // integrals + if (cnsNode->IsVectorAllBitsSet()) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with AllBitsSet. + // + // In such a case: + // * `AllBitsSet == AllBitsSet` is true and so produces `AllBitsSet` + // * `AllBitsSet == Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask - // Handle `(Mask == AllBitsSet) == Mask` and `(AllBitsSet == Mask) == Mask` for integrals - if (cnsNode->IsVectorAllBitsSet()) - { - resultNode = otherNode; - break; - } + resultNode = otherNode; + break; } } break; @@ -33246,6 +33246,25 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } } + else if (otherNode->IsVectorPerElementMask(simdBaseType, simdSize)) + { + // Handle `~Equals(PerElementMask, Zero)` and `~Equals(Zero, PerElementMask)` for integrals + if (cnsNode->IsVectorZero()) + { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with Zero. 
+ // + // In such a case: + // * `AllBitsSet != Zero` is true and so produces `AllBitsSet` + // * `Zero != Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask + + resultNode = otherNode; + break; + } + } else if (otherNode->OperIsHWIntrinsic()) { GenTreeHWIntrinsic* otherIntrinsic = otherNode->AsHWIntrinsic(); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index fe092701846223..4f370aae0883d5 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -11503,44 +11503,58 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) std::swap(op1, op2); } - if (((oper == GT_EQ) || (oper == GT_NE)) && op1->OperIsHWIntrinsic() && op2->IsCnsVec()) + if ((oper == GT_EQ) || (oper == GT_NE)) { - GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); - NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId(); - var_types op1Type = op1Intrinsic->TypeGet(); - - if (HWIntrinsicInfo::ReturnsPerElementMask(op1IntrinsicId) && - (genTypeSize(simdBaseType) == genTypeSize(op1Intrinsic->GetSimdBaseType()))) + if (op2->IsCnsVec() && op1->IsVectorPerElementMask(simdBaseType, simdSize)) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask - bool reverseCond = false; if (oper == GT_EQ) { - // Handle `Mask == Zero` and `Zero == Mask` for integral types + // Handle `Equals(Mask, Zero)` for integral types if (op2->IsVectorZero()) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with Zero. + // + // In such a case: + // * `AllBitsSet == Zero` is false and so produces `Zero` + // * `Zero == Zero` is true and so produces `AllBitsSet` + // + // This means that we are just inverting the input and can reverse + // the condition + reverseCond = true; } } else if (oper == GT_NE) { - // Handle `Mask != AllBitsSet` and `AllBitsSet != Mask` for integral types + // Handle `~Equals(Mask, AllBitsSet)` for integral types if (op2->IsVectorAllBitsSet()) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with AllBitsSet. 
+ // + // In such a case: + // * `AllBitsSet != AllBitsSet` is false and so produces `Zero` + // * `Zero != AllBitsSet` is true and so produces `AllBitsSet` + // + // This means that we are just inverting the input and can reverse + // the condition + reverseCond = true; } } if (reverseCond) { - GenTree* newNode = nullptr; + GenTree* newNode = nullptr; + var_types op1Type = op1->TypeGet(); - if (op1Intrinsic->OperIsConvertVectorToMask()) + if (op1->OperIsConvertVectorToMask()) { + GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); + #if defined(TARGET_XARCH) op1 = op1Intrinsic->Op(1); #elif defined(TARGET_ARM64) From cdea630d3680001fadc6b7efe22b1ddfbf2f929d Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 13 Apr 2026 22:24:25 -0700 Subject: [PATCH 3/7] Respond to PR feedback --- src/coreclr/jit/gentree.cpp | 2 +- src/coreclr/jit/valuenum.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 578e03f5fd34f5..734847bb7436ff 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -31594,7 +31594,7 @@ bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) // // Consider for example something that expects 2 bits but // finds 3 or more set. This ends up somewhat undefined - // behavior as we will ignoring the upper bits and this + // behavior as we will ignore the upper bits and this // could lead to incorrect results. const GenTreeMskCon* mskCon = AsMskCon(); diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 181912a9d12582..532cd22ed9b7ce 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -9279,7 +9279,7 @@ bool ValueNumStore::IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, // // Consider for example something that expects 2 bits but // finds 3 or more set. This ends up somewhat undefined - // behavior as we will ignoring the upper bits and this + // behavior as we will ignore the upper bits and this // could lead to incorrect results. 
simdmask_t simdMaskVal = GetConstantSimdMask(vn); @@ -10155,7 +10155,7 @@ bool ValueNumStore::IsVNHWIntrinsicFunc( assert(funcApp->m_arity != 0); VNFuncApp simdType; - if (GetVNFunc(funcApp->m_args[funcApp->m_arity - 1], &simdType)) + if (!GetVNFunc(funcApp->m_args[funcApp->m_arity - 1], &simdType)) { return false; } @@ -10166,8 +10166,8 @@ bool ValueNumStore::IsVNHWIntrinsicFunc( assert(IsVNConstant(simdType.m_args[1])); *intrinsicId = static_cast((funcId - VNF_HWI_FIRST) + (NI_HW_INTRINSIC_START + 1)); - *simdSize = static_cast(GetConstantInt32(funcApp->m_args[0])); - *simdBaseType = static_cast(GetConstantInt32(funcApp->m_args[1])); + *simdSize = static_cast(GetConstantInt32(simdType.m_args[0])); + *simdBaseType = static_cast(GetConstantInt32(simdType.m_args[1])); return true; #else From d8acc108bc098805096dde9f7987b1d5697e384c Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 14 Apr 2026 07:10:45 -0700 Subject: [PATCH 4/7] Remove some dead TYP_MASK handling from IsVectorPerElementMask --- src/coreclr/jit/gentree.cpp | 166 ++++++++++++----------------------- src/coreclr/jit/valuenum.cpp | 161 ++++++++++++--------------------- 2 files changed, 114 insertions(+), 213 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 734847bb7436ff..5da82094d6440a 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -31575,131 +31575,81 @@ bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) #ifdef FEATURE_SIMD // This should be kept in sync with ValueNumStore::IsVectorPerElementMask - unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); + var_types simdType = TypeGet(); + unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); - if (OperIsConst()) - { - if (IsCnsVec()) - { - const GenTreeVecCon* vecCon = AsVecCon(); - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal, simdBaseType, elementCount); - } -#if defined(FEATURE_MASKED_HW_INTRINSICS) - else if (IsCnsMsk()) - { - // For constant MASK we need to ensure that only the - // lowest elementCount bits are set. If there are more - // then we are likely in a scenario where the mask would - // get treated incorrectly. - // - // Consider for example something that expects 2 bits but - // finds 3 or more set. This ends up somewhat undefined - // behavior as we will ignore the upper bits and this - // could lead to incorrect results. 
+ assert(varTypeIsSIMD(simdType)); + assert(genTypeSize(simdType) == simdSize); - const GenTreeMskCon* mskCon = AsMskCon(); - uint64_t mask = mskCon->gtSimdMaskVal.GetRawBits(); - return (mask & simdmask_t::GetBitMask(elementCount)) == mask; - } -#endif // FEATURE_MASKED_HW_INTRINSICS + if (IsCnsVec()) + { + const GenTreeVecCon* vecCon = AsVecCon(); + return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal, simdBaseType, elementCount); + } + if (!OperIsHWIntrinsic()) + { return false; } - NamedIntrinsic intrinsicId = NI_Illegal; - unsigned intrinsicSimdSize = 0; - var_types intrinsicSimdBaseType = TYP_UNKNOWN; + const GenTreeHWIntrinsic* intrinsic = AsHWIntrinsic(); - if (OperIsHWIntrinsic()) - { - const GenTreeHWIntrinsic* intrinsic = AsHWIntrinsic(); + NamedIntrinsic intrinsicId = intrinsic->GetHWIntrinsicId(); + unsigned intrinsicSimdSize = intrinsic->GetSimdSize(); + var_types intrinsicSimdBaseType = intrinsic->GetSimdBaseType(); - intrinsicId = intrinsic->GetHWIntrinsicId(); - intrinsicSimdSize = intrinsic->GetSimdSize(); - intrinsicSimdBaseType = intrinsic->GetSimdBaseType(); + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) + { + // When producing a SIMD result, we need for it to + // have a base type that is the same size or larger + // as what we expect. + // + // Consider for example us expecting `byte` and the + // intrinsic here produces `ushort`. In that case we + // expect every byte to be either `0x00` or `0xFF` + // and the intrinsic produces either `0x0000` or `0xFFFF` + // and so it meets this need. + // + // However, the inverse is not safe as we would expect + // `0x0000` or `0xFFFF`, but the intrinsic could produce + // `0x00FF` or `0xFF00` which fails the expectation. - if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) - { - if (varTypeIsMask(TypeGet())) - { - // When producing a MASK result, we need to ensure the - // element counts line up as otherwise we end up with - // a bitmask that may be interpreted incorrectly and - // lead to bad codegen. - // - // Consider for example that we have V128 and - // so we expect 4 bits. However we are producing a - // mask for a V128 and so give 2 bits instead. - // If we produce all true we get 0b11 but the consumer - // needs 0b1111 for it to be used correctly. So the - // upper two elements are incorrectly treated as false - // - // Inversely we would expect 2-bits like 0b11 and would - // get four bits like 0b1011 instead. So we'd end up - // treating both elements as a match when the upper ulong - // only had half of it match and so shouldn't be counted. + return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); + } - return elementCount == GenTreeVecCon::ElementCount(intrinsicSimdSize, intrinsicSimdBaseType); - } - else - { - assert(varTypeIsSIMD(TypeGet())); - assert(simdSize == intrinsicSimdSize); + bool isScalar = false; + genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); - // When producing a SIMD result, we need for it to - // have a base type that is the same size or larger - // as what we expect. - // - // Consider for example us expecting `byte` and the - // intrinsic here produces `ushort`. In that case we - // expect every byte to be either `0x00` or `0xFF` - // and the intrinsic produces either `0x0000` or `0xFFFF` - // and so it meets this need. - // - // However, the inverse is not safe as we would expect - // `0x0000` or `0xFFFF`, but the intrinsic could produce - // `0x00FF` or `0xFF00` which fails the expectation. 
+ switch (oper) + { + case GT_AND: + case GT_AND_NOT: + case GT_OR: + case GT_OR_NOT: + case GT_XOR: + case GT_XOR_NOT: + { + // We are a binary bitwise operation where both inputs are per-element masks + // + // While some cases like OR could combine in ways that produce a usable mask + // there isn't any way to statically determine this for non-constants and + // the constant cases should've already been folded. - return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); - } + return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize) && + intrinsic->Op(2)->IsVectorPerElementMask(simdBaseType, simdSize); } - bool isScalar = false; - genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); - - switch (oper) + case GT_NOT: { - case GT_AND: - case GT_AND_NOT: - case GT_OR: - case GT_OR_NOT: - case GT_XOR: - case GT_XOR_NOT: - { - // We are a binary bitwise operation where both inputs are per-element masks - // - // While some cases like OR could combine in ways that produce a usable mask - // there isn't any way to statically determine this for non-constants and - // the constant cases should've already been folded. - - return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize) && - intrinsic->Op(2)->IsVectorPerElementMask(simdBaseType, simdSize); - } - - case GT_NOT: - { - // We are an unary bitwise operation where the input is a per-element mask - return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize); - } - - default: - { - assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); - break; - } + // We are an unary bitwise operation where the input is a per-element mask + return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize); } - return false; + default: + { + assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); + break; + } } #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 532cd22ed9b7ce..4cd3f9e35e9f3e 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -9258,129 +9258,80 @@ bool ValueNumStore::IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, { // This should be kept in sync with GenTree::IsVectorPerElementMask - unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); + var_types simdType = TypeOfVN(vn); + unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); + + assert(varTypeIsSIMD(simdType)); + assert(genTypeSize(simdType) == simdSize); if (IsVNConstant(vn)) { - var_types vnType = TypeOfVN(vn); - - if (varTypeIsSIMD(vnType)) - { - simd_t simdVal = GetConstantSimd(vn); - return ElementsAreAllBitsSetOrZero(&simdVal, simdBaseType, elementCount); - } -#if defined(FEATURE_MASKED_HW_INTRINSICS) - else if (varTypeIsMask(vnType)) - { - // For constant MASK we need to ensure that only the - // lowest elementCount bits are set. If there are more - // then we are likely in a scenario where the mask would - // get treated incorrectly. - // - // Consider for example something that expects 2 bits but - // finds 3 or more set. This ends up somewhat undefined - // behavior as we will ignore the upper bits and this - // could lead to incorrect results. 
+ simd_t simdVal = GetConstantSimd(vn); + return ElementsAreAllBitsSetOrZero(&simdVal, simdBaseType, elementCount); + } - simdmask_t simdMaskVal = GetConstantSimdMask(vn); - uint64_t mask = simdMaskVal.GetRawBits(); - return (mask & simdmask_t::GetBitMask(elementCount)) == mask; - } -#endif // FEATURE_MASKED_HW_INTRINSICS + VNFuncApp funcApp; + NamedIntrinsic intrinsicId; + unsigned intrinsicSimdSize; + var_types intrinsicSimdBaseType; + if (!IsVNHWIntrinsicFunc(vn, &funcApp, &intrinsicId, &intrinsicSimdSize, &intrinsicSimdBaseType)) + { return false; } - VNFuncApp funcApp; + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) + { + // When producing a SIMD result, we need for it to + // have a base type that is the same size or larger + // as what we expect. + // + // Consider for example us expecting `byte` and the + // intrinsic here produces `ushort`. In that case we + // expect every byte to be either `0x00` or `0xFF` + // and the intrinsic produces either `0x0000` or `0xFFFF` + // and so it meets this need. + // + // However, the inverse is not safe as we would expect + // `0x0000` or `0xFFFF`, but the intrinsic could produce + // `0x00FF` or `0xFF00` which fails the expectation. - NamedIntrinsic intrinsicId = NI_Illegal; - unsigned intrinsicSimdSize = 0; - var_types intrinsicSimdBaseType = TYP_UNKNOWN; + return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); + } - if (IsVNHWIntrinsicFunc(vn, &funcApp, &intrinsicId, &intrinsicSimdSize, &intrinsicSimdBaseType)) + bool isScalar = false; + genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); + + switch (oper) { - if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) + case GT_AND: + case GT_AND_NOT: + case GT_OR: + case GT_OR_NOT: + case GT_XOR: + case GT_XOR_NOT: { - if (varTypeIsMask(TypeOfVN(vn))) - { - // When producing a MASK result, we need to ensure the - // element counts line up as otherwise we end up with - // a bitmask that may be interpreted incorrectly and - // lead to bad codegen. - // - // Consider for example that we have V128 and - // so we expect 4 bits. However we are producing a - // mask for a V128 and so give 2 bits instead. - // If we produce all true we get 0b11 but the consumer - // needs 0b1111 for it to be used correctly. So the - // upper two elements are incorrectly treated as false - // - // Inversely we would expect 2-bits like 0b11 and would - // get four bits like 0b1011 instead. So we'd end up - // treating both elements as a match when the upper ulong - // only had half of it match and so shouldn't be counted. - - return elementCount == GenTreeVecCon::ElementCount(intrinsicSimdSize, intrinsicSimdBaseType); - } - else - { - assert(varTypeIsSIMD(TypeOfVN(vn))); - assert(simdSize == intrinsicSimdSize); - - // When producing a SIMD result, we need for it to - // have a base type that is the same size or larger - // as what we expect. - // - // Consider for example us expecting `byte` and the - // intrinsic here produces `ushort`. In that case we - // expect every byte to be either `0x00` or `0xFF` - // and the intrinsic produces either `0x0000` or `0xFFFF` - // and so it meets this need. - // - // However, the inverse is not safe as we would expect - // `0x0000` or `0xFFFF`, but the intrinsic could produce - // `0x00FF` or `0xFF00` which fails the expectation. 
+ // We are a binary bitwise operation where both inputs are per-element masks + // + // While some cases like OR could combine in ways that produce a usable mask + // there isn't any way to statically determine this for non-constants and + // the constant cases should've already been folded. - return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); - } + return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize) && + IsVectorPerElementMask(funcApp.m_args[1], simdBaseType, simdSize); } - bool isScalar = false; - genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); - - switch (oper) + case GT_NOT: { - case GT_AND: - case GT_AND_NOT: - case GT_OR: - case GT_OR_NOT: - case GT_XOR: - case GT_XOR_NOT: - { - // We are a binary bitwise operation where both inputs are per-element masks - // - // While some cases like OR could combine in ways that produce a usable mask - // there isn't any way to statically determine this for non-constants and - // the constant cases should've already been folded. - - return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize) && - IsVectorPerElementMask(funcApp.m_args[1], simdBaseType, simdSize); - } - - case GT_NOT: - { - // We are an unary bitwise operation where the input is a per-element mask - return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize); - } - - default: - { - assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); - break; - } + // We are an unary bitwise operation where the input is a per-element mask + return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize); } - return false; + default: + { + assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); + break; + } } return false; From 1d62fabd9cc3d943565931287a66133a80fd26bc Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 14 Apr 2026 09:11:55 -0700 Subject: [PATCH 5/7] Don't regress codegen for ConditionalSelect when the condition is a PerElementMask --- src/coreclr/jit/lowerxarch.cpp | 172 +++++++++++++++++++-------------- 1 file changed, 100 insertions(+), 72 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 09d6f831bd309a..42af1c1c3c04d8 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2702,6 +2702,16 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) GenTree* op2 = node->Op(2); GenTree* op3 = node->Op(3); + if (varTypeIsIntegral(simdBaseType)) + { + // The integral forms are marked with NormalizeSmallTypeToInt + // but actually emit an instruction that operates bytewise. So + // fixup the base type so that the per element mask check finds + // the greatest number of valid matches. + + simdBaseType = TYP_BYTE; + } + // If either of the value operands is const zero and the mask is either all // zeros or all ones per-element, we can optimize down to AND or AND_NOT. 
if (op3->IsVectorPerElementMask(simdBaseType, simdSize) && (op1->IsVectorZero() || op2->IsVectorZero())) @@ -3490,53 +3500,95 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) // op1: the condition vector // op2: the left vector // op3: the right vector + GenTree* op1 = node->Op(1); GenTree* op2 = node->Op(2); GenTree* op3 = node->Op(3); - // If the condition vector comes from a hardware intrinsic that - // returns a per-element mask, we can optimize the entire - // conditional select to a single BlendVariable instruction - // (if supported by the architecture) + // When op1 isn't a mask, we want to directly optimize the case where op2 or op3 + // is zero as the operation being done is `(op2 & op1) | (op3 & ~op1)` and doing + // `x & 0` always produces `0`, so we can drop half the operation. + // + // However, if op1 is originally a mask, then we want to skip this as we'll convert + // to a BlendVariableMask instruction instead, which can then be embedded in most + // operations, improving code density and throughput. - // First, determine if the condition is a per-element mask - if (op1->IsVectorPerElementMask(simdBaseType, simdSize)) + if (op1->OperIsConvertMaskToVector()) { - // Next, determine if the target architecture supports BlendVariable - NamedIntrinsic blendVariableId = NI_Illegal; + // If op1 was originally a mask, then we want to prioritize BlendVariableMask + // as it not only avoids the conversion from mask to vector, but also allows + // embedded masking and other optimizations to kick in, improving code density - bool isOp1CvtMaskToVector = op1->OperIsConvertMaskToVector(); + NamedIntrinsic blendVariableId = NI_AVX512_BlendVariableMask; + GenTreeHWIntrinsic* cvtMaskToVector = op1->AsHWIntrinsic(); - if ((simdSize == 64) || isOp1CvtMaskToVector) - { - GenTree* maskNode; + GenTree* maskNode = cvtMaskToVector->Op(1); + BlockRange().Remove(op1); - if (isOp1CvtMaskToVector) - { - GenTreeHWIntrinsic* cvtMaskToVector = op1->AsHWIntrinsic(); + // We need to change the base type to match the underlying mask size to ensure + // the right instruction variant is picked. If the CndSel was for TYP_INT but + // the mask was for TYP_DOUBLE then we'd generate vpblendmd when we really want + // vpblendmq. Changing the size is fine since CndSel itself is bitwise and the + // the mask is just representing entire elements at a given size. - maskNode = cvtMaskToVector->Op(1); - BlockRange().Remove(op1); + simdBaseType = cvtMaskToVector->GetSimdBaseType(); - // We need to change the base type to match the underlying mask size to ensure - // the right instruction variant is picked. If the CndSel was for TYP_INT but - // the mask was for TYP_DOUBLE then we'd generate vpblendmd when we really want - // vpblendmq. Changing the size is fine since CndSel itself is bitwise and the - // the mask is just representing entire elements at a given size. 
+ resultNode = + m_compiler->gtNewSimdHWIntrinsicNode(simdType, op3, op2, op1, blendVariableId, simdBaseType, simdSize); + } + else if (op3->IsVectorZero()) + { + // The operation is (op2 & op1) | (zero & ~op1), so we can drop the second half + BlockRange().Remove(op3); + resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); + } + else if (op2->IsVectorZero()) + { + // The operation is (zero & op1) | (op3 & ~op1), so we can drop the first half + BlockRange().Remove(op2); + resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, op1, simdBaseType, simdSize); + } + else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512)) + { + // TernaryLogic is always single cycle with a lot of ports availalbe and while + // BlendVariable is often also a single cycle it can frequently be 2-3 instead. + // + // Additionally, for TYP_SIMD64 this avoids needing to convert from vector to mask + // which introduces 3-4 cycles of overhead on most hardware and would be strictly worse - simdBaseType = cvtMaskToVector->GetSimdBaseType(); - } - else - { - maskNode = m_compiler->gtNewSimdCvtVectorToMaskNode(TYP_MASK, op1, simdBaseType, simdSize); - newNodes.InsertAtEnd(maskNode); - } + GenTree* control = m_compiler->gtNewIconNode(0xCA); // (B & A) | (C & ~A) + newNodes.InsertAtEnd(control); - assert(maskNode->TypeIs(TYP_MASK)); - blendVariableId = NI_AVX512_BlendVariableMask; - op1 = maskNode; + resultNode = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op1, op2, op3, control, NI_AVX512_TernaryLogic, + simdBaseType, simdSize); + } + else if (op1->IsVectorPerElementMask(TYP_BYTE, simdSize)) + { + // If the condition vector comes from a hardware intrinsic that + // returns a per-element mask, we can optimize the entire + // conditional select to a single BlendVariable instruction + // (if supported by the architecture). + // + // We can check using `TYP_BYTE` as we can change the base type + // and still get correct codegen since the ConditionalSelect is + // itself bitwise. + + // Next, determine if the target architecture supports BlendVariable + NamedIntrinsic blendVariableId = NI_Illegal; + + bool isFloating = false; + + if (varTypeIsFloating(simdBaseType) && !op1->IsVectorPerElementMask(simdBaseType, simdSize)) + { + // For floating-point, we want to preserve the base type if the + // mask is also compatible with it, otherwise we need to fixup + // the base type to ensure the right instruction is selected for + // the mask. 
+ + simdBaseType = TYP_BYTE; } - else if (simdSize == 32) + + if (simdSize == 32) { // For Vector256 (simdSize == 32), BlendVariable for floats/doubles // is available on AVX, whereas other types (integrals) require AVX2 @@ -3568,50 +3620,26 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) if (resultNode == nullptr) { - if (op3->IsVectorZero()) - { - BlockRange().Remove(op3); + // We cannot optimize, so produce unoptimized instructions + assert(simdSize != 64); - resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); - } - else if (op2->IsVectorZero()) - { - BlockRange().Remove(op2); - - resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, op1, simdBaseType, simdSize); - } - else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512)) + // We'll need the mask twice + if (!op1->OperIs(GT_LCL_VAR)) { - // We can't use the mask, but we can emit a ternary logic node - GenTree* control = m_compiler->gtNewIconNode(0xCA); // (B & A) | (C & ~A) - newNodes.InsertAtEnd(control); - - resultNode = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op1, op2, op3, control, NI_AVX512_TernaryLogic, - simdBaseType, simdSize); + LIR::Use op1Use; + LIR::Use::MakeDummyUse(newNodes, op1, &op1Use); + op1Use.ReplaceWithLclVar(m_compiler); + op1 = op1Use.Def(); } - else - { - // We cannot optimize, so produce unoptimized instructions - assert(simdSize != 64); - - // We'll need the mask twice - if (!op1->OperIs(GT_LCL_VAR)) - { - LIR::Use op1Use; - LIR::Use::MakeDummyUse(newNodes, op1, &op1Use); - op1Use.ReplaceWithLclVar(m_compiler); - op1 = op1Use.Def(); - } - GenTree* tmp1 = m_compiler->gtClone(op1); - GenTree* tmp2 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); - GenTree* tmp3 = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, tmp1, simdBaseType, simdSize); - resultNode = m_compiler->gtNewSimdBinOpNode(GT_OR, simdType, tmp2, tmp3, simdBaseType, simdSize); + GenTree* tmp1 = m_compiler->gtClone(op1); + GenTree* tmp2 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); + GenTree* tmp3 = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, tmp1, simdBaseType, simdSize); + resultNode = m_compiler->gtNewSimdBinOpNode(GT_OR, simdType, tmp2, tmp3, simdBaseType, simdSize); - newNodes.InsertAtEnd(tmp1); - newNodes.InsertAtEnd(tmp2); - newNodes.InsertAtEnd(tmp3); - } + newNodes.InsertAtEnd(tmp1); + newNodes.InsertAtEnd(tmp2); + newNodes.InsertAtEnd(tmp3); } newNodes.InsertAtEnd(resultNode); From 73f53fc72d697b1f891282e505cf0911b1af1187 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 14 Apr 2026 09:46:29 -0700 Subject: [PATCH 6/7] Respond to PR feedback --- src/coreclr/jit/gentree.cpp | 5 +++++ src/coreclr/jit/lowerxarch.cpp | 9 +-------- src/coreclr/jit/valuenum.cpp | 5 +++++ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 5da82094d6440a..cd5a198ea57464 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -31598,6 +31598,11 @@ bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) unsigned intrinsicSimdSize = intrinsic->GetSimdSize(); var_types intrinsicSimdBaseType = intrinsic->GetSimdBaseType(); + if (intrinsicSimdSize != simdSize) + { + return false; + } + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) { // When producing a SIMD result, we need for it to diff --git a/src/coreclr/jit/lowerxarch.cpp 
b/src/coreclr/jit/lowerxarch.cpp index 42af1c1c3c04d8..8cd3a3c99fedeb 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3505,14 +3505,6 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) GenTree* op2 = node->Op(2); GenTree* op3 = node->Op(3); - // When op1 isn't a mask, we want to directly optimize the case where op2 or op3 - // is zero as the operation being done is `(op2 & op1) | (op3 & ~op1)` and doing - // `x & 0` always produces `0`, so we can drop half the operation. - // - // However, if op1 is originally a mask, then we want to skip this as we'll convert - // to a BlendVariableMask instruction instead, which can then be embedded in most - // operations, improving code density and throughput. - if (op1->OperIsConvertMaskToVector()) { // If op1 was originally a mask, then we want to prioritize BlendVariableMask @@ -3524,6 +3516,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) GenTree* maskNode = cvtMaskToVector->Op(1); BlockRange().Remove(op1); + op1 = maskNode; // We need to change the base type to match the underlying mask size to ensure // the right instruction variant is picked. If the CndSel was for TYP_INT but diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 4cd3f9e35e9f3e..92d75c08c76bef 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -9280,6 +9280,11 @@ bool ValueNumStore::IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, return false; } + if (intrinsicSimdSize != simdSize) + { + return false; + } + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) { // When producing a SIMD result, we need for it to From c16ea95323b32831e7f941b36c5a38453936ae91 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 14 Apr 2026 11:23:29 -0700 Subject: [PATCH 7/7] Respond to PR feedback --- src/coreclr/jit/lowerxarch.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 8cd3a3c99fedeb..8fbed3c79d41d3 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2714,7 +2714,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) // If either of the value operands is const zero and the mask is either all // zeros or all ones per-element, we can optimize down to AND or AND_NOT. - if (op3->IsVectorPerElementMask(simdBaseType, simdSize) && (op1->IsVectorZero() || op2->IsVectorZero())) + if (op3->IsVectorPerElementMask(TYP_BYTE, simdSize) && (op1->IsVectorZero() || op2->IsVectorZero())) { var_types simdType = node->TypeGet(); GenTree* binOp = nullptr; @@ -3522,7 +3522,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) // the right instruction variant is picked. If the CndSel was for TYP_INT but // the mask was for TYP_DOUBLE then we'd generate vpblendmd when we really want // vpblendmq. Changing the size is fine since CndSel itself is bitwise and the - // the mask is just representing entire elements at a given size. + // mask is just representing entire elements at a given size. 
simdBaseType = cvtMaskToVector->GetSimdBaseType(); @@ -3543,7 +3543,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) } else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - // TernaryLogic is always single cycle with a lot of ports availalbe and while + // TernaryLogic is always single cycle with a lot of ports available and while // BlendVariable is often also a single cycle it can frequently be 2-3 instead. // // Additionally, for TYP_SIMD64 this avoids needing to convert from vector to mask @@ -3569,8 +3569,6 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) // Next, determine if the target architecture supports BlendVariable NamedIntrinsic blendVariableId = NI_Illegal; - bool isFloating = false; - if (varTypeIsFloating(simdBaseType) && !op1->IsVectorPerElementMask(simdBaseType, simdSize)) { // For floating-point, we want to preserve the base type if the
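Aside: the "same size or larger base type" rule that patches 1 and 4 settle on for SIMD-producing intrinsics can be checked mechanically. The standalone sketch below uses a simplified shape-alike of the ElementsAreAllBitsSetOrZero template from simd.h, written for this illustration. It shows why a ushort-lane mask reinterpreted as bytes still passes, while a byte-lane mask reinterpreted as ushorts can produce the mixed 0x00FF/0xFF00 lanes the comments warn about; because every lane is homogeneous, the outcome does not depend on host endianness:

    #include <cstdint>
    #include <cstring>

    template <typename T>
    static bool ElementsAreAllBitsSetOrZero(const T* elems, size_t count)
    {
        for (size_t i = 0; i < count; i++)
        {
            // Each lane must be all-bits-set or all-bits-clear.
            if ((elems[i] != static_cast<T>(0)) && (elems[i] != static_cast<T>(~T(0))))
            {
                return false;
            }
        }
        return true;
    }

    int main()
    {
        // ushort lanes of 0xFFFF/0x0000 viewed as bytes are still 0xFF/0x00 each.
        uint16_t wide[4] = {0xFFFF, 0x0000, 0xFFFF, 0xFFFF};
        uint8_t  asBytes[8];
        std::memcpy(asBytes, wide, sizeof(wide));
        bool widerIsOk = ElementsAreAllBitsSetOrZero(asBytes, 8); // true

        // byte lanes viewed as ushorts can yield a mixed lane like 0x00FF.
        uint8_t  narrow[8] = {0xFF, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF};
        uint16_t asShorts[4];
        std::memcpy(asShorts, narrow, sizeof(narrow));
        bool narrowerIsOk = ElementsAreAllBitsSetOrZero(asShorts, 4); // false

        return (widerIsOk && !narrowerIsOk) ? 0 : 1;
    }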