diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 9b5101f953ab65..cd5a198ea57464 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -31560,90 +31560,101 @@ bool GenTree::IsInvariant() const
 
 //-------------------------------------------------------------------
 // IsVectorPerElementMask: returns true if this node is a vector constant per-element mask
-//                         (every element has either all bits set or none of them).
+//                         (every element has either all bits set or none of them) for the
+//                         given simd size and base type.
 //
 // Arguments:
 //    simdBaseType - the base type of the constant being checked.
 //    simdSize     - the size of the SIMD type of the intrinsic.
 //
 // Returns:
-//     True if this node is a vector constant per-element mask.
+//     True if this node is a per-element mask compatible with simdBaseType and simdSize
 //
 bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) const
 {
 #ifdef FEATURE_SIMD
+    // This should be kept in sync with ValueNumStore::IsVectorPerElementMask
+
+    var_types simdType     = TypeGet();
+    unsigned  elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType);
+
+    assert(varTypeIsSIMD(simdType));
+    assert(genTypeSize(simdType) == simdSize);
+
     if (IsCnsVec())
     {
         const GenTreeVecCon* vecCon = AsVecCon();
+        return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal, simdBaseType, elementCount);
+    }
 
-        int elementCount = vecCon->ElementCount(simdSize, simdBaseType);
+    if (!OperIsHWIntrinsic())
+    {
+        return false;
+    }
 
-        switch (simdBaseType)
-        {
-            case TYP_BYTE:
-            case TYP_UBYTE:
-                return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u8[0], elementCount);
-            case TYP_SHORT:
-            case TYP_USHORT:
-                return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u16[0], elementCount);
-            case TYP_INT:
-            case TYP_UINT:
-            case TYP_FLOAT:
-                return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u32[0], elementCount);
-            case TYP_LONG:
-            case TYP_ULONG:
-            case TYP_DOUBLE:
-                return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u64[0], elementCount);
-            default:
-                unreached();
-        }
+    const GenTreeHWIntrinsic* intrinsic = AsHWIntrinsic();
+
+    NamedIntrinsic intrinsicId           = intrinsic->GetHWIntrinsicId();
+    unsigned       intrinsicSimdSize     = intrinsic->GetSimdSize();
+    var_types      intrinsicSimdBaseType = intrinsic->GetSimdBaseType();
+
+    if (intrinsicSimdSize != simdSize)
+    {
+        return false;
     }
-    else if (OperIsHWIntrinsic())
+
+    if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId))
     {
-        const GenTreeHWIntrinsic* intrinsic   = AsHWIntrinsic();
-        const NamedIntrinsic      intrinsicId = intrinsic->GetHWIntrinsicId();
+        // When producing a SIMD result, we need for it to
+        // have a base type that is the same size or larger
+        // as what we expect.
+        //
+        // Consider for example us expecting `byte` and the
+        // intrinsic here produces `ushort`. In that case we
+        // expect every byte to be either `0x00` or `0xFF`
+        // and the intrinsic produces either `0x0000` or `0xFFFF`
+        // and so it meets this need.
+        //
+        // However, the inverse is not safe as we would expect
+        // `0x0000` or `0xFFFF`, but the intrinsic could produce
+        // `0x00FF` or `0xFF00` which fails the expectation.
 
-        if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId))
-        {
-            // We directly return a per-element mask
-            return true;
-        }
+        return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType);
+    }
 
-        bool       isScalar = false;
-        genTreeOps oper     = intrinsic->GetOperForHWIntrinsicId(&isScalar);
+    bool       isScalar = false;
+    genTreeOps oper     = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar);
 
-        switch (oper)
+    switch (oper)
+    {
+        case GT_AND:
+        case GT_AND_NOT:
+        case GT_OR:
+        case GT_OR_NOT:
+        case GT_XOR:
+        case GT_XOR_NOT:
         {
-            case GT_AND:
-            case GT_AND_NOT:
-            case GT_OR:
-            case GT_OR_NOT:
-            case GT_XOR:
-            case GT_XOR_NOT:
-            {
-                // We are a binary bitwise operation where both inputs are per-element masks
-                return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize) &&
-                       intrinsic->Op(2)->IsVectorPerElementMask(simdBaseType, simdSize);
-            }
+            // We are a binary bitwise operation where both inputs are per-element masks
+            //
+            // While some cases like OR could combine in ways that produce a usable mask
+            // there isn't any way to statically determine this for non-constants and
+            // the constant cases should've already been folded.
 
-            case GT_NOT:
-            {
-                // We are an unary bitwise operation where the input is a per-element mask
-                return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize);
-            }
+            return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize) &&
+                   intrinsic->Op(2)->IsVectorPerElementMask(simdBaseType, simdSize);
+        }
 
-            default:
-            {
-                assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper));
-                break;
-            }
+        case GT_NOT:
+        {
+            // We are an unary bitwise operation where the input is a per-element mask
+            return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize);
         }
 
-        return false;
-    }
-    else if (IsCnsMsk())
-    {
-        return true;
+        default:
+        {
+            assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper));
+            break;
+        }
     }
 #endif // FEATURE_SIMD
 
@@ -33002,24 +33013,24 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                             break;
                         }
                     }
-                    else if (otherNode->OperIsHWIntrinsic())
+                    else if (otherNode->IsVectorPerElementMask(simdBaseType, simdSize))
                     {
-                        GenTreeHWIntrinsic* otherIntrinsic   = otherNode->AsHWIntrinsic();
-                        NamedIntrinsic      otherIntrinsicId = otherIntrinsic->GetHWIntrinsicId();
-
-                        if (HWIntrinsicInfo::ReturnsPerElementMask(otherIntrinsicId) &&
-                            (genTypeSize(simdBaseType) == genTypeSize(otherIntrinsic->GetSimdBaseType())))
+                        // Handle `Equals(PerElementMask, AllBitsSet)` and `Equals(AllBitsSet, PerElementMask)` for
+                        // integrals
+                        if (cnsNode->IsVectorAllBitsSet())
                         {
-                            // This optimization is only safe if we know the other node produces
-                            // AllBitsSet or Zero per element and if the outer comparison is the
-                            // same size as what the other node produces for its mask
+                            // We are comparing something that is known per element to be either
+                            // AllBitsSet or Zero, with AllBitsSet.
+                            //
+                            // In such a case:
+                            // * `AllBitsSet == AllBitsSet` is true and so produces `AllBitsSet`
+                            // * `AllBitsSet == Zero` is false and so produces `Zero`
+                            //
+                            // This means that we are not changing anything and can just return
+                            // the per element mask
 
-                            // Handle `(Mask == AllBitsSet) == Mask` and `(AllBitsSet == Mask) == Mask` for integrals
-                            if (cnsNode->IsVectorAllBitsSet())
-                            {
-                                resultNode = otherNode;
-                                break;
-                            }
+                            resultNode = otherNode;
+                            break;
                         }
                     }
                     break;
@@ -33190,6 +33201,25 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                             break;
                         }
                     }
+                    else if (otherNode->IsVectorPerElementMask(simdBaseType, simdSize))
+                    {
+                        // Handle `~Equals(PerElementMask, Zero)` and `~Equals(Zero, PerElementMask)` for integrals
+                        if (cnsNode->IsVectorZero())
+                        {
+                            // We are comparing something that is known per element to be either
+                            // AllBitsSet or Zero, with Zero.
+                            //
+                            // In such a case:
+                            // * `AllBitsSet != Zero` is true and so produces `AllBitsSet`
+                            // * `Zero != Zero` is false and so produces `Zero`
+                            //
+                            // This means that we are not changing anything and can just return
+                            // the per element mask
+
+                            resultNode = otherNode;
+                            break;
+                        }
+                    }
                     else if (otherNode->OperIsHWIntrinsic())
                     {
                         GenTreeHWIntrinsic* otherIntrinsic   = otherNode->AsHWIntrinsic();
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 09d6f831bd309a..8fbed3c79d41d3 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -2702,9 +2702,19 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
             GenTree* op2 = node->Op(2);
             GenTree* op3 = node->Op(3);
 
+            if (varTypeIsIntegral(simdBaseType))
+            {
+                // The integral forms are marked with NormalizeSmallTypeToInt
+                // but actually emit an instruction that operates bytewise. So
+                // fixup the base type so that the per element mask check finds
+                // the greatest number of valid matches.
+
+                simdBaseType = TYP_BYTE;
+            }
+
             // If either of the value operands is const zero and the mask is either all
             // zeros or all ones per-element, we can optimize down to AND or AND_NOT.
-            if (op3->IsVectorPerElementMask(simdBaseType, simdSize) && (op1->IsVectorZero() || op2->IsVectorZero()))
+            if (op3->IsVectorPerElementMask(TYP_BYTE, simdSize) && (op1->IsVectorZero() || op2->IsVectorZero()))
             {
                 var_types simdType = node->TypeGet();
                 GenTree*  binOp    = nullptr;
@@ -3490,53 +3500,86 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node)
     // op1: the condition vector
     // op2: the left vector
     // op3: the right vector
+
     GenTree* op1 = node->Op(1);
     GenTree* op2 = node->Op(2);
     GenTree* op3 = node->Op(3);
 
-    // If the condition vector comes from a hardware intrinsic that
-    // returns a per-element mask, we can optimize the entire
-    // conditional select to a single BlendVariable instruction
-    // (if supported by the architecture)
-
-    // First, determine if the condition is a per-element mask
-    if (op1->IsVectorPerElementMask(simdBaseType, simdSize))
+    if (op1->OperIsConvertMaskToVector())
     {
-        // Next, determine if the target architecture supports BlendVariable
-        NamedIntrinsic blendVariableId = NI_Illegal;
+        // If op1 was originally a mask, then we want to prioritize BlendVariableMask
+        // as it not only avoids the conversion from mask to vector, but also allows
+        // embedded masking and other optimizations to kick in, improving code density
 
-        bool isOp1CvtMaskToVector = op1->OperIsConvertMaskToVector();
+        NamedIntrinsic      blendVariableId = NI_AVX512_BlendVariableMask;
+        GenTreeHWIntrinsic* cvtMaskToVector = op1->AsHWIntrinsic();
 
-        if ((simdSize == 64) || isOp1CvtMaskToVector)
-        {
-            GenTree* maskNode;
+        GenTree* maskNode = cvtMaskToVector->Op(1);
+        BlockRange().Remove(op1);
+        op1 = maskNode;
 
-            if (isOp1CvtMaskToVector)
-            {
-                GenTreeHWIntrinsic* cvtMaskToVector = op1->AsHWIntrinsic();
+        // We need to change the base type to match the underlying mask size to ensure
+        // the right instruction variant is picked. If the CndSel was for TYP_INT but
+        // the mask was for TYP_DOUBLE then we'd generate vpblendmd when we really want
+        // vpblendmq. Changing the size is fine since CndSel itself is bitwise and the
+        // mask is just representing entire elements at a given size.
 
-                maskNode = cvtMaskToVector->Op(1);
-                BlockRange().Remove(op1);
+        simdBaseType = cvtMaskToVector->GetSimdBaseType();
 
-                // We need to change the base type to match the underlying mask size to ensure
-                // the right instruction variant is picked. If the CndSel was for TYP_INT but
-                // the mask was for TYP_DOUBLE then we'd generate vpblendmd when we really want
-                // vpblendmq. Changing the size is fine since CndSel itself is bitwise and the
-                // the mask is just representing entire elements at a given size.
+        resultNode =
+            m_compiler->gtNewSimdHWIntrinsicNode(simdType, op3, op2, op1, blendVariableId, simdBaseType, simdSize);
+    }
+    else if (op3->IsVectorZero())
+    {
+        // The operation is (op2 & op1) | (zero & ~op1), so we can drop the second half
+        BlockRange().Remove(op3);
+        resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize);
+    }
+    else if (op2->IsVectorZero())
+    {
+        // The operation is (zero & op1) | (op3 & ~op1), so we can drop the first half
+        BlockRange().Remove(op2);
+        resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, op1, simdBaseType, simdSize);
+    }
+    else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
+    {
+        // TernaryLogic is always single cycle with a lot of ports available and while
+        // BlendVariable is often also a single cycle it can frequently be 2-3 instead.
+        //
+        // Additionally, for TYP_SIMD64 this avoids needing to convert from vector to mask
+        // which introduces 3-4 cycles of overhead on most hardware and would be strictly worse
 
-                simdBaseType = cvtMaskToVector->GetSimdBaseType();
-            }
-            else
-            {
-                maskNode = m_compiler->gtNewSimdCvtVectorToMaskNode(TYP_MASK, op1, simdBaseType, simdSize);
-                newNodes.InsertAtEnd(maskNode);
-            }
+        GenTree* control = m_compiler->gtNewIconNode(0xCA); // (B & A) | (C & ~A)
+        newNodes.InsertAtEnd(control);
 
-            assert(maskNode->TypeIs(TYP_MASK));
-            blendVariableId = NI_AVX512_BlendVariableMask;
-            op1             = maskNode;
+        resultNode = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op1, op2, op3, control, NI_AVX512_TernaryLogic,
+                                                          simdBaseType, simdSize);
+    }
+    else if (op1->IsVectorPerElementMask(TYP_BYTE, simdSize))
+    {
+        // If the condition vector comes from a hardware intrinsic that
+        // returns a per-element mask, we can optimize the entire
+        // conditional select to a single BlendVariable instruction
+        // (if supported by the architecture).
+        //
+        // We can check using `TYP_BYTE` as we can change the base type
+        // and still get correct codegen since the ConditionalSelect is
+        // itself bitwise.
+
+        // Next, determine if the target architecture supports BlendVariable
+        NamedIntrinsic blendVariableId = NI_Illegal;
+
+        if (varTypeIsFloating(simdBaseType) && !op1->IsVectorPerElementMask(simdBaseType, simdSize))
+        {
+            // For floating-point, we want to preserve the base type if the
+            // mask is also compatible with it, otherwise we need to fixup
+            // the base type to ensure the right instruction is selected for
+            // the mask.
+
+            simdBaseType = TYP_BYTE;
         }
-        else if (simdSize == 32)
+
+        if (simdSize == 32)
         {
             // For Vector256 (simdSize == 32), BlendVariable for floats/doubles
             // is available on AVX, whereas other types (integrals) require AVX2
@@ -3568,50 +3611,26 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node)
 
     if (resultNode == nullptr)
     {
-        if (op3->IsVectorZero())
-        {
-            BlockRange().Remove(op3);
+        // We cannot optimize, so produce unoptimized instructions
+        assert(simdSize != 64);
 
-            resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize);
-        }
-        else if (op2->IsVectorZero())
-        {
-            BlockRange().Remove(op2);
-
-            resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, op1, simdBaseType, simdSize);
-        }
-        else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
+        // We'll need the mask twice
+        if (!op1->OperIs(GT_LCL_VAR))
         {
-            // We can't use the mask, but we can emit a ternary logic node
-            GenTree* control = m_compiler->gtNewIconNode(0xCA); // (B & A) | (C & ~A)
-            newNodes.InsertAtEnd(control);
-
-            resultNode = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op1, op2, op3, control, NI_AVX512_TernaryLogic,
-                                                              simdBaseType, simdSize);
+            LIR::Use op1Use;
+            LIR::Use::MakeDummyUse(newNodes, op1, &op1Use);
+            op1Use.ReplaceWithLclVar(m_compiler);
+            op1 = op1Use.Def();
         }
-        else
-        {
-            // We cannot optimize, so produce unoptimized instructions
-            assert(simdSize != 64);
-
-            // We'll need the mask twice
-            if (!op1->OperIs(GT_LCL_VAR))
-            {
-                LIR::Use op1Use;
-                LIR::Use::MakeDummyUse(newNodes, op1, &op1Use);
-                op1Use.ReplaceWithLclVar(m_compiler);
-                op1 = op1Use.Def();
-            }
 
-            GenTree* tmp1 = m_compiler->gtClone(op1);
-            GenTree* tmp2 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize);
-            GenTree* tmp3 = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, tmp1, simdBaseType, simdSize);
-            resultNode    = m_compiler->gtNewSimdBinOpNode(GT_OR, simdType, tmp2, tmp3, simdBaseType, simdSize);
+        GenTree* tmp1 = m_compiler->gtClone(op1);
+        GenTree* tmp2 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize);
+        GenTree* tmp3 = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, tmp1, simdBaseType, simdSize);
+        resultNode    = m_compiler->gtNewSimdBinOpNode(GT_OR, simdType, tmp2, tmp3, simdBaseType, simdSize);
 
-            newNodes.InsertAtEnd(tmp1);
-            newNodes.InsertAtEnd(tmp2);
-            newNodes.InsertAtEnd(tmp3);
-        }
+        newNodes.InsertAtEnd(tmp1);
+        newNodes.InsertAtEnd(tmp2);
+        newNodes.InsertAtEnd(tmp3);
     }
 
     newNodes.InsertAtEnd(resultNode);
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index fe092701846223..4f370aae0883d5 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -11503,44 +11503,58 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree)
             std::swap(op1, op2);
         }
 
-        if (((oper == GT_EQ) || (oper == GT_NE)) && op1->OperIsHWIntrinsic() && op2->IsCnsVec())
+        if ((oper == GT_EQ) || (oper == GT_NE))
         {
-            GenTreeHWIntrinsic* op1Intrinsic   = op1->AsHWIntrinsic();
-            NamedIntrinsic      op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId();
-            var_types           op1Type        = op1Intrinsic->TypeGet();
-
-            if (HWIntrinsicInfo::ReturnsPerElementMask(op1IntrinsicId) &&
-                (genTypeSize(simdBaseType) == genTypeSize(op1Intrinsic->GetSimdBaseType())))
+            if (op2->IsCnsVec() && op1->IsVectorPerElementMask(simdBaseType, simdSize))
             {
-                // This optimization is only safe if we know the other node produces
-                // AllBitsSet or Zero per element and if the outer comparison is the
-                // same size as what the other node produces for its mask
-
                 bool reverseCond = false;
 
                 if (oper == GT_EQ)
                 {
-                    // Handle `Mask == Zero` and `Zero == Mask` for integral types
+                    // Handle `Equals(Mask, Zero)` for integral types
                     if (op2->IsVectorZero())
                     {
+                        // We are comparing something that is known per element to be either
+                        // AllBitsSet or Zero, with Zero.
+                        //
+                        // In such a case:
+                        // * `AllBitsSet == Zero` is false and so produces `Zero`
+                        // * `Zero == Zero` is true and so produces `AllBitsSet`
+                        //
+                        // This means that we are just inverting the input and can reverse
+                        // the condition
+
                         reverseCond = true;
                     }
                 }
                 else if (oper == GT_NE)
                 {
-                    // Handle `Mask != AllBitsSet` and `AllBitsSet != Mask` for integral types
+                    // Handle `~Equals(Mask, AllBitsSet)` for integral types
                     if (op2->IsVectorAllBitsSet())
                     {
+                        // We are comparing something that is known per element to be either
+                        // AllBitsSet or Zero, with AllBitsSet.
+                        //
+                        // In such a case:
+                        // * `AllBitsSet != AllBitsSet` is false and so produces `Zero`
+                        // * `Zero != AllBitsSet` is true and so produces `AllBitsSet`
+                        //
+                        // This means that we are just inverting the input and can reverse
+                        // the condition
+
                         reverseCond = true;
                     }
                 }
 
                 if (reverseCond)
                 {
-                    GenTree* newNode = nullptr;
+                    GenTree*  newNode = nullptr;
+                    var_types op1Type = op1->TypeGet();
 
-                    if (op1Intrinsic->OperIsConvertVectorToMask())
+                    if (op1->OperIsConvertVectorToMask())
                     {
+                        GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic();
+
 #if defined(TARGET_XARCH)
                         op1 = op1Intrinsic->Op(1);
 #elif defined(TARGET_ARM64)
diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h
index bca1a46e2d7890..43999e89edcebf 100644
--- a/src/coreclr/jit/simd.h
+++ b/src/coreclr/jit/simd.h
@@ -5,7 +5,7 @@
 #define _SIMD_H_
 
 template <typename T>
-static bool ElementsAreSame(T* array, size_t size)
+static bool ElementsAreSame(const T* array, size_t size)
 {
     for (size_t i = 1; i < size; i++)
     {
@@ -16,7 +16,7 @@ static bool ElementsAreSame(T* array, size_t size)
 }
 
 template <typename T>
-static bool ElementsAreAllBitsSetOrZero(T* array, size_t size)
+static bool ElementsAreAllBitsSetOrZero(const T* array, size_t size)
 {
     for (size_t i = 0; i < size; i++)
     {
@@ -384,6 +384,45 @@ typedef simd64_t simd_t;
 typedef simd16_t simd_t;
 #endif
 
+static bool ElementsAreAllBitsSetOrZero(const simd_t* simdVal, var_types simdBaseType, unsigned elementCount)
+{
+    assert(simdVal != nullptr);
+
+    switch (simdBaseType)
+    {
+        case TYP_BYTE:
+        case TYP_UBYTE:
+        {
+            return ElementsAreAllBitsSetOrZero(&simdVal->u8[0], elementCount);
+        }
+
+        case TYP_SHORT:
+        case TYP_USHORT:
+        {
+            return ElementsAreAllBitsSetOrZero(&simdVal->u16[0], elementCount);
+        }
+
+        case TYP_INT:
+        case TYP_UINT:
+        case TYP_FLOAT:
+        {
+            return ElementsAreAllBitsSetOrZero(&simdVal->u32[0], elementCount);
+        }
+
+        case TYP_LONG:
+        case TYP_ULONG:
+        case TYP_DOUBLE:
+        {
+            return ElementsAreAllBitsSetOrZero(&simdVal->u64[0], elementCount);
+        }
+
+        default:
+        {
+            unreached();
+        }
+    }
+}
+
 inline bool IsUnaryBitwiseOperation(genTreeOps oper)
 {
     return (oper == GT_LZCNT) || (oper == GT_NOT);
diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp
index 5c2884cb29659e..92d75c08c76bef 100644
--- a/src/coreclr/jit/valuenum.cpp
+++ b/src/coreclr/jit/valuenum.cpp
@@ -2405,53 +2405,7 @@ bool ValueNumStore::VNIsVectorNaN(var_types simdType, var_types simdBaseType, Va
 {
     assert(varTypeIsSIMD(simdType));
 
-    simd_t vector = {};
-
-    switch (simdType)
-    {
-        case TYP_SIMD8:
-        {
-            simd8_t tmp = GetConstantSimd8(valVN);
-            memcpy(&vector, &tmp, genTypeSize(simdType));
-            break;
-        }
-
-        case TYP_SIMD12:
-        {
-            simd12_t tmp = GetConstantSimd12(valVN);
-            memcpy(&vector, &tmp, genTypeSize(simdType));
-            break;
-        }
-
-        case TYP_SIMD16:
-        {
-            simd16_t tmp = GetConstantSimd16(valVN);
-            memcpy(&vector, &tmp, genTypeSize(simdType));
-            break;
-        }
-
-#if defined(TARGET_XARCH)
-        case TYP_SIMD32:
-        {
-            simd32_t tmp = GetConstantSimd32(valVN);
-            memcpy(&vector, &tmp, genTypeSize(simdType));
-            break;
-        }
-
-        case TYP_SIMD64:
-        {
-            simd64_t tmp = GetConstantSimd64(valVN);
-            memcpy(&vector, &tmp, genTypeSize(simdType));
-            break;
-        }
-#endif // TARGET_XARCH
-
-        default:
-        {
-            unreached();
-        }
-    }
-
+    simd_t   vector       = GetConstantSimd(valVN);
     uint32_t elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType);
 
     for (uint32_t i = 0; i < elementCount; i++)
@@ -2471,53 +2425,7 @@ bool ValueNumStore::VNIsVectorNegativeZero(var_types simdType, var_types simdBas
 {
     assert(varTypeIsSIMD(simdType));
 
-    simd_t vector = {};
-
-    switch (simdType)
-    {
-        case TYP_SIMD8:
-        {
-            simd8_t tmp = GetConstantSimd8(valVN);
-            memcpy(&vector, &tmp, genTypeSize(simdType));
-            break;
-        }
-
-        case TYP_SIMD12:
-        {
-            simd12_t tmp = GetConstantSimd12(valVN);
-            memcpy(&vector, &tmp, genTypeSize(simdType));
-            break;
-        }
-
-        case TYP_SIMD16:
-        {
-            simd16_t tmp = GetConstantSimd16(valVN);
-            memcpy(&vector, &tmp, genTypeSize(simdType));
-            break;
-        }
-
-#if defined(TARGET_XARCH)
-        case TYP_SIMD32:
-        {
-            simd32_t tmp = GetConstantSimd32(valVN);
-            memcpy(&vector, &tmp, genTypeSize(simdType));
-            break;
-        }
-
-        case TYP_SIMD64:
-        {
-            simd64_t tmp = GetConstantSimd64(valVN);
-            memcpy(&vector, &tmp, genTypeSize(simdType));
-            break;
-        }
-#endif // TARGET_XARCH
-
-        default:
-        {
-            unreached();
-        }
-    }
-
+    simd_t   vector       = GetConstantSimd(valVN);
     uint32_t elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType);
 
     for (uint32_t i = 0; i < elementCount; i++)
@@ -4013,6 +3921,63 @@ float ValueNumStore::GetConstantSingle(ValueNum argVN)
 }
 
 #if defined(FEATURE_SIMD)
+// Given a simd constant value number return its value.
+//
+simd_t ValueNumStore::GetConstantSimd(ValueNum argVN)
+{
+    assert(IsVNConstant(argVN));
+
+    simd_t    vector   = {};
+    var_types simdType = TypeOfVN(argVN);
+
+    switch (simdType)
+    {
+        case TYP_SIMD8:
+        {
+            simd8_t tmp = ConstantValue<simd8_t>(argVN);
+            memcpy(&vector, &tmp, genTypeSize(simdType));
+            break;
+        }
+
+        case TYP_SIMD12:
+        {
+            simd12_t tmp = ConstantValue<simd12_t>(argVN);
+            memcpy(&vector, &tmp, genTypeSize(simdType));
+            break;
+        }
+
+        case TYP_SIMD16:
+        {
+            simd16_t tmp = ConstantValue<simd16_t>(argVN);
+            memcpy(&vector, &tmp, genTypeSize(simdType));
+            break;
+        }
+
+#if defined(TARGET_XARCH)
+        case TYP_SIMD32:
+        {
+            simd32_t tmp = ConstantValue<simd32_t>(argVN);
+            memcpy(&vector, &tmp, genTypeSize(simdType));
+            break;
+        }
+
+        case TYP_SIMD64:
+        {
+            simd64_t tmp = ConstantValue<simd64_t>(argVN);
+            memcpy(&vector, &tmp, genTypeSize(simdType));
+            break;
+        }
+#endif // TARGET_XARCH
+
+        default:
+        {
+            unreached();
+        }
+    }
+
+    return vector;
+}
+
 // Given a simd8 constant value number return its value as a simd8.
 //
 simd8_t ValueNumStore::GetConstantSimd8(ValueNum argVN)
@@ -8444,10 +8409,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(
 
             case GT_EQ:
             {
-                NamedIntrinsic argIntrinsicId;
-                unsigned       argSimdSize;
-                CorInfoType    argSimdBaseJitType;
-
                 if (varTypeIsFloating(baseType))
                 {
                     // Handle `(x == NaN) == false` and `(NaN == x) == false` for floating-point types
@@ -8458,27 +8419,24 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(
                         return VNZeroForType(type);
                     }
                 }
-                else if (IsVNHWIntrinsicFunc(argVN, &argIntrinsicId, &argSimdSize, &argSimdBaseJitType))
+                else if (IsVectorPerElementMask(argVN, baseType, simdSize))
                 {
-                    // This optimization is only safe if we know the other node produces
-                    // AllBitsSet or Zero per element and if the outer comparison is the
-                    // same size as what the other node produces for its mask
-
-                    if (!HWIntrinsicInfo::ReturnsPerElementMask(argIntrinsicId))
-                    {
-                        break;
-                    }
-
-                    if (genTypeSize(baseType) == genTypeSize(JitType2PreciseVarType(argSimdBaseJitType)))
-                    {
-                        break;
-                    }
-
-                    // Handle `(Mask == AllBitsSet) == Mask` and `(AllBitsSet == Mask) == Mask` for integral types
+                    // Handle `Equals(PerElementMask, AllBitsSet)` and `Equals(AllBitsSet, PerElementMask)` for
+                    // integrals
                     ValueNum allBitsVN = VNAllBitsForType(type, simdSize);
 
                     if (cnsVN == allBitsVN)
                     {
+                        // We are comparing something that is known per element to be either
+                        // AllBitsSet or Zero, with AllBitsSet.
+                        //
+                        // In such a case:
+                        // * `AllBitsSet == AllBitsSet` is true and so produces `AllBitsSet`
+                        // * `AllBitsSet == Zero` is false and so produces `Zero`
+                        //
+                        // This means that we are not changing anything and can just return
+                        // the per element mask
+
                         return argVN;
                     }
                 }
@@ -8630,10 +8588,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(
             {
                 var_types simdType = Compiler::getSIMDTypeForSize(simdSize);
 
-                NamedIntrinsic argIntrinsicId;
-                unsigned       argSimdSize;
-                CorInfoType    argSimdBaseJitType;
-
                 if (varTypeIsFloating(baseType))
                 {
                     // Handle `(x != NaN) == true` and `(NaN != x) == true` for floating-point types
@@ -8643,27 +8597,23 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(
                         return VNAllBitsForType(type, elementCount);
                     }
                 }
-                else if (IsVNHWIntrinsicFunc(argVN, &argIntrinsicId, &argSimdSize, &argSimdBaseJitType))
+                else if (IsVectorPerElementMask(argVN, baseType, simdSize))
                 {
-                    // This optimization is only safe if we know the other node produces
-                    // AllBitsSet or Zero per element and if the outer comparison is the
-                    // same size as what the other node produces for its mask
-
-                    if (!HWIntrinsicInfo::ReturnsPerElementMask(argIntrinsicId))
-                    {
-                        break;
-                    }
-
-                    if (genTypeSize(baseType) != genTypeSize(JitType2PreciseVarType(argSimdBaseJitType)))
-                    {
-                        break;
-                    }
-
                     // Handle `(Mask != Zero) == Mask` and `(Zero != Mask) == Mask` for integral types
                     ValueNum zeroVN = VNZeroForType(type);
 
                     if (cnsVN == zeroVN)
                     {
+                        // We are comparing something that is known per element to be either
+                        // AllBitsSet or Zero, with Zero.
+                        //
+                        // In such a case:
+                        // * `AllBitsSet != Zero` is true and so produces `AllBitsSet`
+                        // * `Zero != Zero` is false and so produces `Zero`
+                        //
+                        // This means that we are not changing anything and can just return
+                        // the per element mask
+
                         return argVN;
                     }
                 }
@@ -9291,6 +9241,107 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary(
     return VNForFunc(type, func, arg0VN, arg1VN, arg2VN, resultTypeVN);
 }
 
+//-------------------------------------------------------------------
+// IsVectorPerElementMask: returns true if the ValueNum is a vector constant per-element mask
+//                         (every element has either all bits set or none of them) for the
+//                         given simd size and base type.
+//
+// Arguments:
+//    vn           - the value number to check
+//    simdBaseType - the base type of the constant being checked.
+//    simdSize     - the size of the SIMD type of the intrinsic.
+//
+// Returns:
+//     True if vn is a per-element mask compatible with simdBaseType and simdSize
+//
+bool ValueNumStore::IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, unsigned simdSize)
+{
+    // This should be kept in sync with GenTree::IsVectorPerElementMask
+
+    var_types simdType     = TypeOfVN(vn);
+    unsigned  elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType);
+
+    assert(varTypeIsSIMD(simdType));
+    assert(genTypeSize(simdType) == simdSize);
+
+    if (IsVNConstant(vn))
+    {
+        simd_t simdVal = GetConstantSimd(vn);
+        return ElementsAreAllBitsSetOrZero(&simdVal, simdBaseType, elementCount);
+    }
+
+    VNFuncApp      funcApp;
+    NamedIntrinsic intrinsicId;
+    unsigned       intrinsicSimdSize;
+    var_types      intrinsicSimdBaseType;
+
+    if (!IsVNHWIntrinsicFunc(vn, &funcApp, &intrinsicId, &intrinsicSimdSize, &intrinsicSimdBaseType))
+    {
+        return false;
+    }
+
+    if (intrinsicSimdSize != simdSize)
+    {
+        return false;
+    }
+
+    if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId))
+    {
+        // When producing a SIMD result, we need for it to
+        // have a base type that is the same size or larger
+        // as what we expect.
+        //
+        // Consider for example us expecting `byte` and the
+        // intrinsic here produces `ushort`. In that case we
+        // expect every byte to be either `0x00` or `0xFF`
+        // and the intrinsic produces either `0x0000` or `0xFFFF`
+        // and so it meets this need.
+        //
+        // However, the inverse is not safe as we would expect
+        // `0x0000` or `0xFFFF`, but the intrinsic could produce
+        // `0x00FF` or `0xFF00` which fails the expectation.
+
+        return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType);
+    }
+
+    bool       isScalar = false;
+    genTreeOps oper     = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar);
+
+    switch (oper)
+    {
+        case GT_AND:
+        case GT_AND_NOT:
+        case GT_OR:
+        case GT_OR_NOT:
+        case GT_XOR:
+        case GT_XOR_NOT:
+        {
+            // We are a binary bitwise operation where both inputs are per-element masks
+            //
+            // While some cases like OR could combine in ways that produce a usable mask
+            // there isn't any way to statically determine this for non-constants and
+            // the constant cases should've already been folded.
+
+            return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize) &&
+                   IsVectorPerElementMask(funcApp.m_args[1], simdBaseType, simdSize);
+        }
+
+        case GT_NOT:
+        {
+            // We are an unary bitwise operation where the input is a per-element mask
+            return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize);
+        }
+
+        default:
+        {
+            assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper));
+            break;
+        }
+    }
+
+    return false;
+}
+
 #endif // FEATURE_HW_INTRINSICS
 
 ValueNum ValueNumStore::EvalMathFuncUnary(var_types typ, NamedIntrinsic gtMathFN, ValueNum arg0VN)
@@ -10024,56 +10075,55 @@ bool ValueNumStore::IsVNBinFunc(ValueNum vn, VNFunc func, ValueNum* op1, ValueNu
 
 //----------------------------------------------------------------------------------
 // IsVNHWIntrinsicFunc: A specialized version of GetVNFunc that checks if the given ValueNum
-//     is a HWIntrinsic. If so, it returns the intrinsicId, simdSize, and simdBaseJitType.
+//     is a HWIntrinsic. If so, it returns the intrinsicId, simdSize, and simdBaseType.
 //
 // Arguments:
 //    vn              - The ValueNum to check.
+//    funcApp         - The function application
 //    intrinsicId     - The intrinsic id.
 //    simdSize        - The simd size of the intrinsic.
-//    simdBaseJitType - The simd base jit type for the intrinsic.
+//    simdBaseType    - The simd base type for the intrinsic.
 //
 // Return Value:
 //    true if the given vn is a VNFunc for a HWIntrinsic
 //
-bool ValueNumStore::IsVNHWIntrinsicFunc(ValueNum        vn,
-                                        NamedIntrinsic* intrinsicId,
-                                        unsigned*       simdSize,
-                                        CorInfoType*    simdBaseJitType)
+bool ValueNumStore::IsVNHWIntrinsicFunc(
+    ValueNum vn, VNFuncApp* funcApp, NamedIntrinsic* intrinsicId, unsigned* simdSize, var_types* simdBaseType)
 {
+    assert(funcApp != nullptr);
     assert(intrinsicId != nullptr);
+    assert(simdSize != nullptr);
+    assert(simdBaseType != nullptr);
 
 #if defined(FEATURE_HW_INTRINSICS)
-    VNFuncApp funcApp;
-
-    if (!GetVNFunc(vn, &funcApp))
+    if (!GetVNFunc(vn, funcApp))
     {
         return false;
     }
 
-    VNFunc outerFunc = funcApp.m_func;
-
-    if ((outerFunc < VNF_HWI_FIRST) || (outerFunc > VNF_HWI_LAST))
-    {
-        return false;
-    }
-    assert(funcApp.m_arity != 0);
+    VNFunc funcId = funcApp->m_func;
 
-    if (GetVNFunc(funcApp.m_args[funcApp.m_arity - 1], &funcApp))
+    if ((funcId < VNF_HWI_FIRST) || (funcId > VNF_HWI_LAST))
     {
         return false;
     }
 
-    assert(funcApp.m_func == VNF_SimdType);
-    assert(funcApp.m_arity != 2);
+    assert(funcApp->m_arity != 0);
+    VNFuncApp simdType;
 
-    if (!IsVNConstant(funcApp.m_args[0]) || !IsVNConstant(funcApp.m_args[1]))
+    if (!GetVNFunc(funcApp->m_args[funcApp->m_arity - 1], &simdType))
     {
         return false;
     }
 
-    *intrinsicId     = NamedIntrinsic((outerFunc - VNF_HWI_FIRST) + (NI_HW_INTRINSIC_START + 1));
-    *simdSize        = static_cast<uint32_t>(GetConstantInt32(funcApp.m_args[0]));
-    *simdBaseJitType = static_cast<CorInfoType>(GetConstantInt32(funcApp.m_args[1]));
+    assert(simdType.m_func == VNF_SimdType);
+    assert(simdType.m_arity == 2);
+    assert(IsVNConstant(simdType.m_args[0]));
+    assert(IsVNConstant(simdType.m_args[1]));
+
+    *intrinsicId  = static_cast<NamedIntrinsic>((funcId - VNF_HWI_FIRST) + (NI_HW_INTRINSIC_START + 1));
+    *simdSize     = static_cast<uint32_t>(GetConstantInt32(simdType.m_args[0]));
+    *simdBaseType = static_cast<var_types>(GetConstantInt32(simdType.m_args[1]));
 
     return true;
 #else
@@ -10510,16 +10560,16 @@ void ValueNumStore::vnDumpMemOpaque(Compiler* comp, VNFuncApp* memOpaque)
 #ifdef FEATURE_SIMD
 void ValueNumStore::vnDumpSimdType(Compiler* comp, VNFuncApp* simdType)
 {
-    assert(simdType->m_func == VNF_SimdType); // Preconditions.
+    assert(simdType->m_func == VNF_SimdType);
+    assert(simdType->m_arity == 2);
     assert(IsVNConstant(simdType->m_args[0]));
     assert(IsVNConstant(simdType->m_args[1]));
 
-    int         simdSize    = ConstantValue<int>(simdType->m_args[0]);
-    CorInfoType baseJitType = (CorInfoType)ConstantValue<int>(simdType->m_args[1]);
+    int       simdSize     = ConstantValue<int>(simdType->m_args[0]);
+    var_types simdBaseType = static_cast<var_types>(ConstantValue<int>(simdType->m_args[1]));
 
     printf("%s(simd%d, %s)", VNFuncName(simdType->m_func), simdSize,
-           baseJitType == CORINFO_TYPE_UNDEF ? varTypeName(TYP_UNDEF)
-                                             : varTypeName(JitType2PreciseVarType(baseJitType)));
+           (simdBaseType == TYP_UNDEF) ? varTypeName(TYP_UNDEF) : varTypeName(simdBaseType));
 }
 #endif // FEATURE_SIMD
 
diff --git a/src/coreclr/jit/valuenum.h b/src/coreclr/jit/valuenum.h
index 8c75e17935a485..d2e22aadaf0475 100644
--- a/src/coreclr/jit/valuenum.h
+++ b/src/coreclr/jit/valuenum.h
@@ -387,6 +387,7 @@ class ValueNumStore
     float  GetConstantSingle(ValueNum argVN);
 
 #if defined(FEATURE_SIMD)
+    simd_t   GetConstantSimd(ValueNum argVN);
     simd8_t  GetConstantSimd8(ValueNum argVN);
     simd12_t GetConstantSimd12(ValueNum argVN);
     simd16_t GetConstantSimd16(ValueNum argVN);
@@ -1323,6 +1324,8 @@ class ValueNumStore
                                        ValueNum            arg1VN,
                                        ValueNum            arg2VN,
                                        ValueNum            resultTypeVN);
+
+    bool IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, unsigned simdSize);
 #endif // FEATURE_HW_INTRINSICS
 
     // Returns "true" iff "vn" represents a function application.
@@ -1336,10 +1339,8 @@ class ValueNumStore
     bool IsVNBinFunc(ValueNum vn, VNFunc func, ValueNum* op1 = nullptr, ValueNum* op2 = nullptr);
 
     // Returns "true" iff "vn" is a function application for a HWIntrinsic
-    bool IsVNHWIntrinsicFunc(ValueNum        vn,
-                             NamedIntrinsic* intrinsicId,
-                             unsigned*       simdSize,
-                             CorInfoType*    simdBaseJitType);
+    bool IsVNHWIntrinsicFunc(
+        ValueNum vn, VNFuncApp* funcApp, NamedIntrinsic* intrinsicId, unsigned* simdSize, var_types* simdBaseType);
 
     // Returns "true" iff "vn" is a function application of the form "func(op, cns)"
     // the cns can be on the left side if the function is commutative.