From 63447df7e01ae4ae7bb57d83983a463cc3e2f5cc Mon Sep 17 00:00:00 2001
From: Clinton Ingram <clinton.ingram@outlook.com>
Date: Fri, 20 Mar 2026 01:40:26 -0700
Subject: [PATCH 1/3] remove dead vector2/3 lowering code

---
 src/coreclr/jit/gentree.cpp          |   2 +-
 src/coreclr/jit/hwintrinsicarm64.cpp |   2 +-
 src/coreclr/jit/lowerarmarch.cpp     |  66 +----------
 src/coreclr/jit/lowerxarch.cpp       | 168 ++-------------------------
 4 files changed, 13 insertions(+), 225 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 3dfca0ba725a0a..457432ec1cb90b 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -30420,7 +30420,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler*  comp,
     else
 #endif // TARGET_XARCH
     {
-        assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
+        assert((simdSize == 8) || (simdSize == 16));
 
 #if defined(TARGET_ARM64)
         assert(!isScalar || (simdSize == 8));
diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index c930f30bc3c8cc..3c47fe9a34173c 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -1344,7 +1344,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic        intrinsic,
                 op1 = impSIMDPopStack();
 
                 retNode = gtNewSimdDotProdNode(simdType, op1, op2, simdBaseType, simdSize);
-                retNode = gtNewSimdGetElementNode(retType, retNode, gtNewIconNode(0), simdBaseType, simdSize);
+                retNode = gtNewSimdToScalarNode(retType, retNode, simdBaseType, simdSize);
             }
             break;
         }
diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index 2f7ec732acb297..cd21b6e13876f0 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -2093,7 +2093,7 @@ bool Lowering::IsValidConstForMovImm(GenTreeHWIntrinsic* node)
 }
 
 //----------------------------------------------------------------------------------------------
-// Lowering::LowerHWIntrinsicCmpOp: Lowers a Vector128 or Vector256 comparison intrinsic
+// Lowering::LowerHWIntrinsicCmpOp: Lowers a Vector64 or Vector128 comparison intrinsic
 //
 //  Arguments:
 //     node  - The hardware intrinsic node.
@@ -2221,26 +2221,6 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
     BlockRange().InsertBefore(node, cmp);
     LowerNode(cmp);
 
-    if ((simdBaseType == TYP_FLOAT) && (simdSize == 12))
-    {
-        // For TYP_SIMD12 we don't want the upper bits to participate in the comparison. So, we will insert all ones
-        // into those bits of the result, "as if" the upper bits are equal. Then if all lower bits are equal, we get the
-        // expected all-ones result, and will get the expected 0's only where there are non-matching bits.
-
-        GenTree* idxCns = m_compiler->gtNewIconNode(3, TYP_INT);
-        BlockRange().InsertAfter(cmp, idxCns);
-
-        GenTree* insCns = m_compiler->gtNewIconNode(-1, TYP_INT);
-        BlockRange().InsertAfter(idxCns, insCns);
-
-        GenTree* tmp =
-            m_compiler->gtNewSimdHWIntrinsicNode(simdType, cmp, idxCns, insCns, NI_AdvSimd_Insert, TYP_INT, simdSize);
-        BlockRange().InsertAfter(insCns, tmp);
-        LowerNode(tmp);
-
-        cmp = tmp;
-    }
-
     if (simdSize != 8) // we don't need compression for Vector64
     {
         GenTree* msk;
@@ -2339,7 +2319,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
 
     if (isConstant)
     {
-        assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
+        assert((simdSize == 8) || (simdSize == 16));
 
         for (GenTree* arg : node->Operands())
         {
@@ -2497,46 +2477,6 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
     GenTree* tmp1 = nullptr;
     GenTree* tmp2 = nullptr;
 
-    if (simdSize == 12)
-    {
-        assert(simdBaseType == TYP_FLOAT);
-
-        // For 12 byte SIMD, we need to clear the upper 4 bytes:
-        //   idx  =    CNS_INT       int    0x03
-        //   tmp1 = *  CNS_DBL       float  0.0
-        //          /--*  op1  simd16
-        //          +--*  idx  int
-        //          +--*  tmp1 simd16
-        //   op1  = *  HWINTRINSIC   simd16 T Insert
-        //   ...
-
-        // This is roughly the following managed code:
-        //    op1 = AdvSimd.Insert(op1, 0x03, 0.0f);
-        //    ...
-
-        idx = m_compiler->gtNewIconNode(0x03, TYP_INT);
-        BlockRange().InsertAfter(op1, idx);
-
-        tmp1 = m_compiler->gtNewZeroConNode(TYP_FLOAT);
-        BlockRange().InsertAfter(idx, tmp1);
-        LowerNode(tmp1);
-
-        op1 = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op1, idx, tmp1, NI_AdvSimd_Insert, simdBaseType, simdSize);
-        BlockRange().InsertAfter(tmp1, op1);
-        LowerNode(op1);
-
-        idx = m_compiler->gtNewIconNode(0x03, TYP_INT);
-        BlockRange().InsertAfter(op2, idx);
-
-        tmp2 = m_compiler->gtNewZeroConNode(TYP_FLOAT);
-        BlockRange().InsertAfter(idx, tmp2);
-        LowerNode(tmp2);
-
-        op2 = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op2, idx, tmp2, NI_AdvSimd_Insert, simdBaseType, simdSize);
-        BlockRange().InsertAfter(tmp2, op2);
-        LowerNode(op2);
-    }
-
     // We will be constructing the following parts:
     //   ...
     //          /--*  op1  simd16
@@ -2615,7 +2555,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
         }
         else
         {
-            assert((simdSize == 12) || (simdSize == 16));
+            assert(simdSize == 16);
 
             // We will be constructing the following parts:
             //   ...
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index a16db3113fc366..0ee6a0eaa3d30c 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -3373,32 +3373,11 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
         case TYP_USHORT:
         case TYP_INT:
         case TYP_UINT:
-        {
-            cmpType = simdBaseType;
-            mskType = TYP_UBYTE;
-
-            if (simdSize == 32)
-            {
-                cmpIntrinsic = NI_AVX2_CompareEqual;
-                mskIntrinsic = NI_AVX2_MoveMask;
-                mskConstant  = -1;
-            }
-            else
-            {
-                assert(simdSize == 16);
-
-                cmpIntrinsic = NI_X86Base_CompareEqual;
-                mskIntrinsic = NI_X86Base_MoveMask;
-                mskConstant  = 0xFFFF;
-            }
-            break;
-        }
-
         case TYP_LONG:
         case TYP_ULONG:
         {
-            mskType = TYP_UBYTE;
             cmpType = simdBaseType;
+            mskType = TYP_UBYTE;
 
             if (simdSize == 32)
             {
@@ -3430,22 +3409,11 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
             }
             else
             {
+                assert(simdSize == 16);
+
                 cmpIntrinsic = NI_X86Base_CompareEqual;
                 mskIntrinsic = NI_X86Base_MoveMask;
-
-                if (simdSize == 16)
-                {
-                    mskConstant = 0xF;
-                }
-                else if (simdSize == 12)
-                {
-                    mskConstant = 0x7;
-                }
-                else
-                {
-                    assert(simdSize == 8);
-                    mskConstant = 0x3;
-                }
+                mskConstant  = 0xF;
             }
             break;
         }
@@ -3489,20 +3457,6 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
     GenTree* mskCns = m_compiler->gtNewIconNode(mskConstant, TYP_INT);
     BlockRange().InsertAfter(msk, mskCns);
 
-    if ((simdBaseType == TYP_FLOAT) && (simdSize < 16))
-    {
-        // For TYP_SIMD8 and TYP_SIMD12 we need to clear the upper bits and can't assume their value
-
-        GenTree* tmp = m_compiler->gtNewOperNode(GT_AND, TYP_INT, msk, mskCns);
-        BlockRange().InsertAfter(mskCns, tmp);
-        LowerNode(tmp);
-
-        msk = tmp;
-
-        mskCns = m_compiler->gtNewIconNode(mskConstant, TYP_INT);
-        BlockRange().InsertAfter(msk, mskCns);
-    }
-
     node->ChangeOper(cmpOp);
     node->ChangeType(TYP_INT);
     node->AsOp()->gtOp1 = msk;
@@ -4185,7 +4139,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
 
     if (isConstant)
     {
-        assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16) || (simdSize == 32) || (simdSize == 64));
+        assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64));
 
         for (GenTree* arg : node->Operands())
         {
@@ -5829,19 +5783,9 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
                 //   var tmp3 = Avx.DotProduct(op1, op2, 0xFF);
                 //   return tmp3.ToScalar();
 
-                if (simdSize == 8)
-                {
-                    idx = m_compiler->gtNewIconNode(0x3F, TYP_INT);
-                }
-                else if (simdSize == 12)
-                {
-                    idx = m_compiler->gtNewIconNode(0x7F, TYP_INT);
-                }
-                else
-                {
-                    assert(simdSize == 16);
-                    idx = m_compiler->gtNewIconNode(0xFF, TYP_INT);
-                }
+                assert(simdSize == 16);
+
+                idx = m_compiler->gtNewIconNode(0xFF, TYP_INT);
                 BlockRange().InsertBefore(node, idx);
 
                 if (varTypeIsSIMD(node->gtType))
@@ -5913,91 +5857,6 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
                 unreached();
             }
         }
-
-        if (simdSize == 8)
-        {
-            assert(simdBaseType == TYP_FLOAT);
-
-            // If simdSize == 8 then we have only two elements, not the 4 that we got from getSIMDVectorLength,
-            // which we gave a simdSize of 16. So, we set the simd16Count to 2 so that only 1 hadd will
-            // be emitted rather than 2, so that the upper two elements will be ignored.
-
-            simd16Count = 2;
-        }
-        else if (simdSize == 12)
-        {
-            assert(simdBaseType == TYP_FLOAT);
-
-            // We need to mask off the most significant element to avoid the shuffle + add
-            // from including it in the computed result. We need to do this for both op1 and
-            // op2 in case one of them is `NaN` (because Zero * NaN == NaN)
-
-            simd16_t simd16Val = {};
-
-            simd16Val.i32[0] = -1;
-            simd16Val.i32[1] = -1;
-            simd16Val.i32[2] = -1;
-            simd16Val.i32[3] = +0;
-
-            simdType = TYP_SIMD16;
-            simdSize = 16;
-
-            // We will be constructing the following parts:
-            //   ...
-            //          +--*  CNS_INT    int    -1
-            //          +--*  CNS_INT    int    -1
-            //          +--*  CNS_INT    int    -1
-            //          +--*  CNS_INT    int    0
-            //   tmp1 = *  HWINTRINSIC   simd16 T Create
-            //          /--*  op1 simd16
-            //          +--*  tmp1 simd16
-            //   op1  = *  HWINTRINSIC   simd16 T And
-            //   ...
-
-            // This is roughly the following managed code:
-            //   ...
-            //   tmp1 = Vector128.Create(-1, -1, -1, 0);
-            //   op1  = Sse.And(op1, tmp1);
-            //   ...
-
-            GenTreeVecCon* vecCon1 = m_compiler->gtNewVconNode(simdType);
-            memcpy(&vecCon1->gtSimdVal, &simd16Val, sizeof(simd16_t));
-            BlockRange().InsertAfter(op1, vecCon1);
-
-            op1 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, vecCon1, simdBaseType, simdSize);
-            BlockRange().InsertAfter(vecCon1, op1);
-
-            LowerNode(vecCon1);
-            LowerNode(op1);
-
-            // We will be constructing the following parts:
-            //   ...
-            //          +--*  CNS_INT    int    -1
-            //          +--*  CNS_INT    int    -1
-            //          +--*  CNS_INT    int    -1
-            //          +--*  CNS_INT    int    0
-            //   tmp2 = *  HWINTRINSIC   simd16 T Create
-            //          /--*  op2 simd16
-            //          +--*  tmp2 simd16
-            //   op2  = *  HWINTRINSIC   simd16 T And
-            //   ...
-
-            // This is roughly the following managed code:
-            //   ...
-            //   tmp2 = Vector128.Create(-1, -1, -1, 0);
-            //   op2  = Sse.And(op2, tmp2);
-            //   ...
-
-            GenTreeVecCon* vecCon2 = m_compiler->gtNewVconNode(simdType);
-            memcpy(&vecCon2->gtSimdVal, &simd16Val, sizeof(simd16_t));
-            BlockRange().InsertAfter(op2, vecCon2);
-
-            op2 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op2, vecCon2, simdBaseType, simdSize);
-            BlockRange().InsertAfter(vecCon2, op2);
-
-            LowerNode(vecCon2);
-            LowerNode(op2);
-        }
     }
 
     // We will be constructing the following parts:
@@ -9284,17 +9143,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
         }
     }
 
-    if ((simdSize == 8) || (simdSize == 12))
-    {
-        // We want to handle GetElement/ToScalar still for Vector2/3
-        if (!HWIntrinsicInfo::IsVectorToScalar(intrinsicId) && !HWIntrinsicInfo::IsVectorGetElement(intrinsicId))
-        {
-            // TODO-XArch-CQ: Ideally we would key this off of the size the containing node
-            // expects vs the size node actually is or would be if spilled to the stack
-            return;
-        }
-    }
-
     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
 
     const bool isCommutative = node->isCommutativeHWIntrinsic();

From 4c9f71bca044c2572a6292995c957fc0d726974e Mon Sep 17 00:00:00 2001
From: Clinton Ingram <clinton.ingram@outlook.com>
Date: Sat, 21 Mar 2026 17:18:04 -0700
Subject: [PATCH 2/3] remove unreachable simdSize 12 check in arm CmpOp lowering

---
 src/coreclr/jit/lowerarmarch.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index cd21b6e13876f0..ffdd8b9729d73b 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -2145,7 +2145,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
     }
 
     // Special case: "vec ==/!= zero_vector"
-    if (!varTypeIsFloating(simdBaseType) && (op != nullptr) && (simdSize != 12))
+    if (!varTypeIsFloating(simdBaseType) && (op != nullptr))
     {
         GenTree* cmp = op;
         if (simdSize != 8) // we don't need compression for Vector64

From e6b533c56a7dc45e737f4d84d06af4edc72c2ea0 Mon Sep 17 00:00:00 2001
From: Clinton Ingram <clinton.ingram@outlook.com>
Date: Sat, 21 Mar 2026 17:30:32 -0700
Subject: [PATCH 3/3] remove unreachable TYP_SIMD12 check in xarch CmpOp lowering

---
 src/coreclr/jit/lowerxarch.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 0ee6a0eaa3d30c..2b181486ee7d75 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -2969,10 +2969,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
         }
     }
 
-    // TODO-XARCH-AVX512: We should handle TYP_SIMD12 here under the EVEX path, but doing
-    // so will require us to account for the unused 4th element.
-
-    if ((simdType != TYP_SIMD12) && m_compiler->canUseEvexEncoding())
+    if (m_compiler->canUseEvexEncoding())
     {
         // The EVEX encoded versions of the comparison instructions all return a kmask
         //