From 8ac975e20425617d445aa24d7bfe09dd465b7247 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 14 May 2025 15:57:02 -0700 Subject: [PATCH 1/3] allow any baseline intrinsics in lowering --- src/coreclr/jit/hwintrinsic.h | 1 + src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 6 +-- src/coreclr/jit/hwintrinsicxarch.cpp | 33 +++++++++++++ src/coreclr/jit/lowerxarch.cpp | 51 +++++++++++---------- src/coreclr/jit/morph.cpp | 39 ++++------------ src/coreclr/jit/rationalize.cpp | 4 ++ 6 files changed, 78 insertions(+), 56 deletions(-) diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index d936d579d8e25a..edcb84ce10144e 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -546,6 +546,7 @@ struct HWIntrinsicInfo static bool isScalarIsa(CORINFO_InstructionSet isa); #ifdef TARGET_XARCH + static bool isBaselineIsa(CORINFO_InstructionSet isa); static bool isAVX2GatherIntrinsic(NamedIntrinsic id); static FloatComparisonMode lookupFloatComparisonModeForSwappedArgs(FloatComparisonMode comparison); static NamedIntrinsic lookupIdForFloatComparisonMode(NamedIntrinsic intrinsic, diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index a4f22c16ec2c6f..b2b825abc5a261 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -403,8 +403,9 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) GenTree* embMaskNode = nullptr; GenTree* embMaskOp = nullptr; - // We need to validate that other phases of the compiler haven't introduced unsupported intrinsics - assert(compiler->compIsaSupportedDebugOnly(isa)); + // We need to validate that other phases of the compiler haven't introduced unsupported intrinsics. + // We allow an exception for baseline intrinsics to be introduced unconditionally in LIR. + assert(compiler->compIsaSupportedDebugOnly(isa) || HWIntrinsicInfo::isBaselineIsa(isa)); assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId)); assert(!HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsicId) || !varTypeIsSmall(node->GetSimdBaseType())); @@ -1827,7 +1828,6 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) regNumber targetReg = node->GetRegNum(); var_types baseType = node->GetSimdBaseType(); - assert(compiler->compIsaSupportedDebugOnly(InstructionSet_SSE)); assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE)); GenTree* op1 = (node->GetOperandCount() >= 1) ? 
node->Op(1) : nullptr; diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 106a0b952fe139..6c46fe2dd07854 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -855,6 +855,39 @@ NamedIntrinsic HWIntrinsicInfo::lookupIdForFloatComparisonMode(NamedIntrinsic } } +//------------------------------------------------------------------------ +// isBaselineIsa: Gets a value that indicates whether the InstructionSet is +// part of the required hardware support for this platform +// +// Arguments: +// isa - The InstructionSet to check +// +// Return Value: +// true if isa is part of the baseline; otherwise, false +bool HWIntrinsicInfo::isBaselineIsa(CORINFO_InstructionSet isa) +{ + switch (isa) + { + case InstructionSet_X86Base: + case InstructionSet_SSE: + case InstructionSet_SSE2: +#ifdef TARGET_AMD64 + case InstructionSet_X86Base_X64: + case InstructionSet_SSE_X64: + case InstructionSet_SSE2_X64: +#endif // TARGET_AMD64 + case InstructionSet_Vector128: + { + return true; + } + + default: + { + return false; + } + } +} + //------------------------------------------------------------------------ // isFullyImplementedIsa: Gets a value that indicates whether the InstructionSet is fully implemented // diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 868718636d3454..7da5a199035839 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -868,6 +868,11 @@ void Lowering::LowerCast(GenTree* tree) { // If we don't have AVX10v2 saturating conversion instructions for // floating->integral, we have to handle the saturation logic here. + // + // Since this implements ordinary casts, we bend the normal rules around ISA support + // for HWIntrinsics and assume the baseline ISA set (SSE2 and below) is available. + // For this reason, we eschew most gentree convenience methods (e.g. gtNewSimdBinOpNode) + // and create the HWIntrinsic nodes explicitly, as most helpers assert ISA support. JITDUMP("LowerCast before:\n"); DISPTREERANGE(BlockRange(), tree); @@ -904,8 +909,8 @@ void Lowering::LowerCast(GenTree* tree) GenTree* zero = comp->gtNewZeroConNode(TYP_SIMD16); GenTree* fixupVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, maxScalarIntrinsic, srcBaseType, 16); - - GenTree* toScalar = comp->gtNewSimdToScalarNode(srcType, fixupVal, srcBaseType, 16); + GenTree* toScalar = + comp->gtNewSimdHWIntrinsicNode(srcType, fixupVal, NI_Vector128_ToScalar, srcBaseType, 16); castRange.InsertAtEnd(zero); castRange.InsertAtEnd(fixupVal); @@ -915,9 +920,6 @@ void Lowering::LowerCast(GenTree* tree) } else { - assert(comp->IsBaselineSimdIsaSupportedDebugOnly()); - assert(!TargetArchitecture::Is64Bit || comp->compIsaSupportedDebugOnly(InstructionSet_SSE2_X64)); - // We need to fix up NaN as well as handle possible overflow. Signed conversions // return int/long.MinValue for any overflow, which is correct for saturation of // negative, but the result must be replaced with MaxValue for positive overflow. @@ -953,16 +955,14 @@ void Lowering::LowerCast(GenTree* tree) if (srcType == TYP_FLOAT) { maxFloatSimdVal->f32[0] = 4294967296.0f; - convertIntrinsic = comp->compOpportunisticallyDependsOn(InstructionSet_SSE_X64) - ? NI_SSE_X64_ConvertToInt64WithTruncation - : NI_SSE2_ConvertToVector128Int32WithTruncation; + convertIntrinsic = TargetArchitecture::Is64Bit ? 
NI_SSE_X64_ConvertToInt64WithTruncation + : NI_SSE2_ConvertToVector128Int32WithTruncation; } else { maxFloatSimdVal->f64[0] = 4294967296.0; - convertIntrinsic = comp->compOpportunisticallyDependsOn(InstructionSet_SSE2_X64) - ? NI_SSE2_X64_ConvertToInt64WithTruncation - : NI_SSE2_ConvertToVector128Int32WithTruncation; + convertIntrinsic = TargetArchitecture::Is64Bit ? NI_SSE2_X64_ConvertToInt64WithTruncation + : NI_SSE2_ConvertToVector128Int32WithTruncation; } break; } @@ -1023,6 +1023,7 @@ void Lowering::LowerCast(GenTree* tree) // var fixupVal = Sse.And(srcVec, nanMask); // convertResult = Sse.ConvertToInt32WithTruncation(fixupVal); + NamedIntrinsic andIntrinsic = (srcType == TYP_FLOAT) ? NI_SSE_And : NI_SSE2_And; NamedIntrinsic compareNaNIntrinsic = (srcType == TYP_FLOAT) ? NI_SSE_CompareScalarOrdered : NI_SSE2_CompareScalarOrdered; @@ -1033,8 +1034,9 @@ void Lowering::LowerCast(GenTree* tree) castRange.InsertAtEnd(srcClone); castRange.InsertAtEnd(nanMask); - srcClone = comp->gtClone(srcVector); - GenTree* fixupVal = comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, nanMask, srcClone, srcBaseType, 16); + srcClone = comp->gtClone(srcVector); + GenTree* fixupVal = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, nanMask, srcClone, andIntrinsic, srcBaseType, 16); castRange.InsertAtEnd(srcClone); castRange.InsertAtEnd(fixupVal); @@ -1120,15 +1122,16 @@ void Lowering::LowerCast(GenTree* tree) // This creates the equivalent of the following C# code: // floorVal = ((srcVector.AsUInt64() >>> 21) << 21).AsDouble(); - GenTree* twentyOne = comp->gtNewIconNode(21); - GenTree* rightShift = comp->gtNewSimdBinOpNode(GT_RSZ, TYP_SIMD16, floorVal, twentyOne, - CORINFO_TYPE_ULONG, 16); + GenTree* twentyOne = comp->gtNewIconNode(21); + GenTree* rightShift = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, floorVal, twentyOne, + NI_SSE2_ShiftRightLogical, CORINFO_TYPE_ULONG, 16); castRange.InsertAtEnd(twentyOne); castRange.InsertAtEnd(rightShift); twentyOne = comp->gtClone(twentyOne); - floorVal = comp->gtNewSimdBinOpNode(GT_LSH, TYP_SIMD16, rightShift, twentyOne, - CORINFO_TYPE_ULONG, 16); + floorVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, rightShift, twentyOne, + NI_SSE2_ShiftLeftLogical, CORINFO_TYPE_ULONG, 16); castRange.InsertAtEnd(twentyOne); castRange.InsertAtEnd(floorVal); } @@ -1191,21 +1194,23 @@ void Lowering::LowerCast(GenTree* tree) GenTree* thirtyOne = comp->gtNewIconNode(31); GenTree* mask = - comp->gtNewSimdBinOpNode(GT_RSH, TYP_SIMD16, result, thirtyOne, CORINFO_TYPE_INT, 16); + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, result, thirtyOne, + NI_SSE2_ShiftRightArithmetic, CORINFO_TYPE_INT, 16); GenTree* andMask = - comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, mask, negated, dstBaseType, 16); + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, mask, negated, NI_SSE2_And, dstBaseType, 16); castRange.InsertAtEnd(thirtyOne); castRange.InsertAtEnd(mask); castRange.InsertAtEnd(andMask); - convertResult = - comp->gtNewSimdBinOpNode(GT_OR, TYP_SIMD16, andMask, resultClone, dstBaseType, 16); + convertResult = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, andMask, resultClone, NI_SSE2_Or, + dstBaseType, 16); } // Because the results are in a SIMD register, we need to ToScalar() them out. 
castRange.InsertAtEnd(convertResult); - convertResult = comp->gtNewSimdToScalarNode(TYP_INT, convertResult, dstBaseType, 16); + convertResult = comp->gtNewSimdHWIntrinsicNode(TYP_INT, convertResult, NI_Vector128_ToScalar, + dstBaseType, 16); } else { diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 74a4e34d34e5f8..16e9605e964d0f 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -294,20 +294,14 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType)) { if (srcType == TYP_FLOAT -#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) - // Arm64: src = float, dst is overflow conversion. +#if defined(TARGET_64BIT) + // 64-bit: src = float, dst is overflow conversion. // This goes through helper and hence src needs to be converted to double. && tree->gtOverflow() -#elif defined(TARGET_AMD64) - // Amd64: src = float, dst = overflow conversion or SSE2 is not enabled - && (tree->gtOverflow() || !IsBaselineSimdIsaSupported()) -#elif defined(TARGET_ARM) - // Arm: src = float, dst = int64/uint64 or overflow conversion. - && (tree->gtOverflow() || varTypeIsLong(dstType)) #else - // x86: src = float, dst = int64/uint64 or overflow conversion or SSE2 is not enabled - && (tree->gtOverflow() || varTypeIsLong(dstType) || !IsBaselineSimdIsaSupported()) -#endif + // 32-bit: src = float, dst = int64/uint64 or overflow conversion. + && (tree->gtOverflow() || varTypeIsLong(dstType)) +#endif // TARGET_64BIT ) { oper = gtNewCastNode(TYP_DOUBLE, oper, false, TYP_DOUBLE); @@ -328,39 +322,24 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) { if (!tree->gtOverflow()) { -// ARM64 and LoongArch64 optimize all non-overflow checking conversions -#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) +#ifdef TARGET_64BIT return nullptr; #else -#if defined(TARGET_XARCH) - if (IsBaselineSimdIsaSupported() && (!varTypeIsLong(dstType) || TargetArchitecture::Is64Bit)) + if (!varTypeIsLong(dstType)) { return nullptr; } -#endif // TARGET_XARCH + switch (dstType) { - case TYP_INT: -#ifdef TARGET_XARCH - return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2INT, oper); -#endif // TARGET_XARCH - return nullptr; - - case TYP_UINT: -#if defined(TARGET_ARM) - return nullptr; -#endif - return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2UINT, oper); - case TYP_LONG: return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2LNG, oper); - case TYP_ULONG: return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2ULNG, oper); default: unreached(); } -#endif // TARGET_ARM64 || TARGET_LOONGARCH64 || TARGET_RISCV64 +#endif // TARGET_64BIT } else { diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index 1c7d31f9eb400d..3df50d5e570f30 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -826,6 +826,10 @@ Compiler::fgWalkResult Rationalizer::RationalizeVisitor::PreOrderVisit(GenTree** #if defined(FEATURE_HW_INTRINSICS) else if (node->OperIsHWIntrinsic()) { + // All intrinsics introduced in HIR must be explicitly supported. 
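+        // (The baseline exemption in genHWIntrinsic applies only to intrinsics introduced later, in LIR;
+        // here, in HIR, the full ISA support check applies.)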
+ NamedIntrinsic intrinsicId = node->AsHWIntrinsic()->GetHWIntrinsicId(); + assert(m_compiler->compIsaSupportedDebugOnly(HWIntrinsicInfo::lookupIsa(intrinsicId))); + if (node->AsHWIntrinsic()->IsUserCall()) { m_rationalizer.RewriteHWIntrinsicAsUserCall(use, this->m_ancestors); From c89077ef04802d4881d56f7aa3621dfc65ba59cc Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 15 May 2025 12:59:54 -0700 Subject: [PATCH 2/3] partial revert 8ac975e2 --- src/coreclr/jit/hwintrinsic.h | 1 - src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 5 ++- src/coreclr/jit/hwintrinsicxarch.cpp | 33 ------------------ src/coreclr/jit/lowerxarch.cpp | 38 ++++++++------------- src/coreclr/jit/rationalize.cpp | 4 --- 5 files changed, 16 insertions(+), 65 deletions(-) diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index edcb84ce10144e..d936d579d8e25a 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -546,7 +546,6 @@ struct HWIntrinsicInfo static bool isScalarIsa(CORINFO_InstructionSet isa); #ifdef TARGET_XARCH - static bool isBaselineIsa(CORINFO_InstructionSet isa); static bool isAVX2GatherIntrinsic(NamedIntrinsic id); static FloatComparisonMode lookupFloatComparisonModeForSwappedArgs(FloatComparisonMode comparison); static NamedIntrinsic lookupIdForFloatComparisonMode(NamedIntrinsic intrinsic, diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index b2b825abc5a261..24aede54066303 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -403,9 +403,8 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) GenTree* embMaskNode = nullptr; GenTree* embMaskOp = nullptr; - // We need to validate that other phases of the compiler haven't introduced unsupported intrinsics. - // We allow an exception for baseline intrinsics to be introduced unconditionally in LIR. 
- assert(compiler->compIsaSupportedDebugOnly(isa) || HWIntrinsicInfo::isBaselineIsa(isa)); + // We need to validate that other phases of the compiler haven't introduced unsupported intrinsics + assert(compiler->compIsaSupportedDebugOnly(isa)); assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId)); assert(!HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsicId) || !varTypeIsSmall(node->GetSimdBaseType())); diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 6c46fe2dd07854..106a0b952fe139 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -855,39 +855,6 @@ NamedIntrinsic HWIntrinsicInfo::lookupIdForFloatComparisonMode(NamedIntrinsic } } -//------------------------------------------------------------------------ -// isBaselineIsa: Gets a value that indicates whether the InstructionSet is -// part of the required hardware support for this platform -// -// Arguments: -// isa - The InstructionSet to check -// -// Return Value: -// true if isa is part of the baseline; otherwise, false -bool HWIntrinsicInfo::isBaselineIsa(CORINFO_InstructionSet isa) -{ - switch (isa) - { - case InstructionSet_X86Base: - case InstructionSet_SSE: - case InstructionSet_SSE2: -#ifdef TARGET_AMD64 - case InstructionSet_X86Base_X64: - case InstructionSet_SSE_X64: - case InstructionSet_SSE2_X64: -#endif // TARGET_AMD64 - case InstructionSet_Vector128: - { - return true; - } - - default: - { - return false; - } - } -} - //------------------------------------------------------------------------ // isFullyImplementedIsa: Gets a value that indicates whether the InstructionSet is fully implemented // diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 7da5a199035839..15e4750fd8f824 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -868,11 +868,6 @@ void Lowering::LowerCast(GenTree* tree) { // If we don't have AVX10v2 saturating conversion instructions for // floating->integral, we have to handle the saturation logic here. - // - // Since this implements ordinary casts, we bend the normal rules around ISA support - // for HWIntrinsics and assume the baseline ISA set (SSE2 and below) is available. - // For this reason, we eschew most gentree convenience methods (e.g. gtNewSimdBinOpNode) - // and create the HWIntrinsic nodes explicitly, as most helpers assert ISA support. JITDUMP("LowerCast before:\n"); DISPTREERANGE(BlockRange(), tree); @@ -909,8 +904,8 @@ void Lowering::LowerCast(GenTree* tree) GenTree* zero = comp->gtNewZeroConNode(TYP_SIMD16); GenTree* fixupVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, maxScalarIntrinsic, srcBaseType, 16); - GenTree* toScalar = - comp->gtNewSimdHWIntrinsicNode(srcType, fixupVal, NI_Vector128_ToScalar, srcBaseType, 16); + + GenTree* toScalar = comp->gtNewSimdToScalarNode(srcType, fixupVal, srcBaseType, 16); castRange.InsertAtEnd(zero); castRange.InsertAtEnd(fixupVal); @@ -1023,7 +1018,6 @@ void Lowering::LowerCast(GenTree* tree) // var fixupVal = Sse.And(srcVec, nanMask); // convertResult = Sse.ConvertToInt32WithTruncation(fixupVal); - NamedIntrinsic andIntrinsic = (srcType == TYP_FLOAT) ? NI_SSE_And : NI_SSE2_And; NamedIntrinsic compareNaNIntrinsic = (srcType == TYP_FLOAT) ? 
NI_SSE_CompareScalarOrdered : NI_SSE2_CompareScalarOrdered; @@ -1034,9 +1028,8 @@ void Lowering::LowerCast(GenTree* tree) castRange.InsertAtEnd(srcClone); castRange.InsertAtEnd(nanMask); - srcClone = comp->gtClone(srcVector); - GenTree* fixupVal = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, nanMask, srcClone, andIntrinsic, srcBaseType, 16); + srcClone = comp->gtClone(srcVector); + GenTree* fixupVal = comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, nanMask, srcClone, srcBaseType, 16); castRange.InsertAtEnd(srcClone); castRange.InsertAtEnd(fixupVal); @@ -1122,16 +1115,15 @@ void Lowering::LowerCast(GenTree* tree) // This creates the equivalent of the following C# code: // floorVal = ((srcVector.AsUInt64() >>> 21) << 21).AsDouble(); - GenTree* twentyOne = comp->gtNewIconNode(21); - GenTree* rightShift = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, floorVal, twentyOne, - NI_SSE2_ShiftRightLogical, CORINFO_TYPE_ULONG, 16); + GenTree* twentyOne = comp->gtNewIconNode(21); + GenTree* rightShift = comp->gtNewSimdBinOpNode(GT_RSZ, TYP_SIMD16, floorVal, twentyOne, + CORINFO_TYPE_ULONG, 16); castRange.InsertAtEnd(twentyOne); castRange.InsertAtEnd(rightShift); twentyOne = comp->gtClone(twentyOne); - floorVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, rightShift, twentyOne, - NI_SSE2_ShiftLeftLogical, CORINFO_TYPE_ULONG, 16); + floorVal = comp->gtNewSimdBinOpNode(GT_LSH, TYP_SIMD16, rightShift, twentyOne, + CORINFO_TYPE_ULONG, 16); castRange.InsertAtEnd(twentyOne); castRange.InsertAtEnd(floorVal); } @@ -1194,23 +1186,21 @@ void Lowering::LowerCast(GenTree* tree) GenTree* thirtyOne = comp->gtNewIconNode(31); GenTree* mask = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, result, thirtyOne, - NI_SSE2_ShiftRightArithmetic, CORINFO_TYPE_INT, 16); + comp->gtNewSimdBinOpNode(GT_RSH, TYP_SIMD16, result, thirtyOne, CORINFO_TYPE_INT, 16); GenTree* andMask = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, mask, negated, NI_SSE2_And, dstBaseType, 16); + comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, mask, negated, dstBaseType, 16); castRange.InsertAtEnd(thirtyOne); castRange.InsertAtEnd(mask); castRange.InsertAtEnd(andMask); - convertResult = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, andMask, resultClone, NI_SSE2_Or, - dstBaseType, 16); + convertResult = + comp->gtNewSimdBinOpNode(GT_OR, TYP_SIMD16, andMask, resultClone, dstBaseType, 16); } // Because the results are in a SIMD register, we need to ToScalar() them out. castRange.InsertAtEnd(convertResult); - convertResult = comp->gtNewSimdHWIntrinsicNode(TYP_INT, convertResult, NI_Vector128_ToScalar, - dstBaseType, 16); + convertResult = comp->gtNewSimdToScalarNode(TYP_INT, convertResult, dstBaseType, 16); } else { diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index 3df50d5e570f30..1c7d31f9eb400d 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -826,10 +826,6 @@ Compiler::fgWalkResult Rationalizer::RationalizeVisitor::PreOrderVisit(GenTree** #if defined(FEATURE_HW_INTRINSICS) else if (node->OperIsHWIntrinsic()) { - // All intrinsics introduced in HIR must be explicitly supported. 
- NamedIntrinsic intrinsicId = node->AsHWIntrinsic()->GetHWIntrinsicId(); - assert(m_compiler->compIsaSupportedDebugOnly(HWIntrinsicInfo::lookupIsa(intrinsicId))); - if (node->AsHWIntrinsic()->IsUserCall()) { m_rationalizer.RewriteHWIntrinsicAsUserCall(use, this->m_ancestors); From d5a89657f2dce5786249947840497bc90fb99ad2 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 15 May 2025 17:41:45 -0700 Subject: [PATCH 3/3] add baseline ISAs to supported set in rationalize --- src/coreclr/jit/compiler.cpp | 27 +++++++++++++++++++++++++++ src/coreclr/jit/compiler.h | 1 + src/coreclr/jit/decomposelongs.cpp | 2 -- src/coreclr/jit/lowerxarch.cpp | 19 ------------------- src/coreclr/jit/rationalize.cpp | 3 +++ 5 files changed, 31 insertions(+), 21 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index d085ef712a40fb..2755328b8271ac 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2093,6 +2093,33 @@ bool Compiler::notifyInstructionSetUsage(CORINFO_InstructionSet isa, bool suppor return info.compCompHnd->notifyInstructionSetUsage(isa, supported); } +void Compiler::setBaselineISAsSupported() +{ +#ifdef FEATURE_HW_INTRINSICS + CORINFO_InstructionSetFlags supportedISAs = opts.compSupportsISA; + +#if defined(TARGET_XARCH) + supportedISAs.AddInstructionSet(InstructionSet_X86Base); + supportedISAs.AddInstructionSet(InstructionSet_SSE); + supportedISAs.AddInstructionSet(InstructionSet_SSE2); +#if defined(TARGET_AMD64) + supportedISAs.AddInstructionSet(InstructionSet_X86Base_X64); + supportedISAs.AddInstructionSet(InstructionSet_SSE_X64); + supportedISAs.AddInstructionSet(InstructionSet_SSE2_X64); +#endif // TARGET_AMD64 +#elif defined(TARGET_ARM64) + supportedISAs.AddInstructionSet(InstructionSet_ArmBase); + supportedISAs.AddInstructionSet(InstructionSet_AdvSimd); + supportedISAs.AddInstructionSet(InstructionSet_ArmBase_Arm64); + supportedISAs.AddInstructionSet(InstructionSet_AdvSimd_Arm64); + supportedISAs.AddInstructionSet(InstructionSet_Vector64); +#endif + supportedISAs.AddInstructionSet(InstructionSet_Vector128); + + opts.setSupportedISAs(supportedISAs); +#endif // FEATURE_HW_INTRINSICS +} + #ifdef PROFILING_SUPPORTED // A Dummy routine to receive Enter/Leave/Tailcall profiler callbacks. // These are used when DOTNET_JitEltHookEnabled=1 diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index de0f6afbdc8fc5..9558e3527dff8c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -9661,6 +9661,7 @@ class Compiler #endif // DEBUG bool notifyInstructionSetUsage(CORINFO_InstructionSet isa, bool supported) const; + void setBaselineISAsSupported(); // Answer the question: Is a particular ISA allowed to be used implicitly by optimizations? 
// The result of this api call will exactly match the target machine diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index ddc55e633292fa..639ec3df69bbed 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -1979,8 +1979,6 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIn } else { - assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - GenTree* thirtyTwo = m_compiler->gtNewIconNode(32); GenTree* shift = m_compiler->gtNewSimdBinOpNode(GT_RSZ, op1->TypeGet(), simdTmpVar, thirtyTwo, node->GetSimdBaseJitType(), simdSize); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 15e4750fd8f824..ad663148800dbc 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4443,8 +4443,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) break; } - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - // We will be constructing the following parts: // ... // /--* tmp1 simd16 @@ -4498,8 +4496,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // tmp1 = Sse2.UnpackLow(tmp1, tmp2); // ... - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - node->Op(1) = tmp1; LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); ReplaceWithLclVar(tmp1Use); @@ -4530,8 +4526,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // ... // return Sse2.Shuffle(tmp1, 0x00); - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - idx = comp->gtNewIconNode(0x00, TYP_INT); BlockRange().InsertAfter(tmp1, idx); @@ -4579,8 +4573,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // var tmp2 = tmp1; // return Sse.Shuffle(tmp1, tmp2, 0x00); - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE)); - node->Op(1) = tmp1; LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); ReplaceWithLclVar(tmp1Use); @@ -4617,8 +4609,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) break; } - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - // We will be constructing the following parts: // ... 
// /--* tmp1 simd16 @@ -4748,7 +4738,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) if ((simdBaseType == TYP_SHORT) || (simdBaseType == TYP_USHORT)) { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); insIntrinsic = NI_SSE2_Insert; } else if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) @@ -4808,7 +4797,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) } assert((simdBaseType != TYP_SHORT) && (simdBaseType != TYP_USHORT)); - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); GenTree* op[16]; op[0] = tmp1; @@ -5035,8 +5023,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // tmp2 = Sse.UnpackLow(opP, opQ); // return Sse.MoveLowToHigh(tmp1, tmp2); - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE)); - GenTree* op[4]; op[0] = tmp1; @@ -5100,8 +5086,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // var tmp2 = Vector128.CreateScalarUnsafe(op2); // return Sse.UnpackLow(tmp1, tmp2); - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - tmp2 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op2, simdBaseJitType, 16); LowerNode(tmp2); @@ -5414,7 +5398,6 @@ GenTree* Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node) case TYP_SHORT: case TYP_USHORT: { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); break; } @@ -6230,8 +6213,6 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) } else { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - switch (simdBaseType) { case TYP_SHORT: diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index 1c7d31f9eb400d..d15be0c9699b1b 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -914,6 +914,9 @@ PhaseStatus Rationalizer::DoPhase() block->bbStmtList = nullptr; assert(BlockRange().CheckLIR(comp, true)); + + // Allow unrestricted use of baseline HWIntrinsic ISAs in LIR. + comp->setBaselineISAsSupported(); } comp->compRationalIRForm = true;
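
A sketch of what the final shape enables, for review context: with the baseline ISAs folded into
opts.compSupportsISA at the end of rationalization, LIR phases can use the gentree convenience
helpers (which assert ISA support internally) to emit SSE2-level nodes unconditionally, rather
than hand-constructing GenTreeHWIntrinsic nodes as the interim first patch did in LowerCast. The
helper below is hypothetical and not part of this series; it only reuses calls that appear in
these diffs (gtNewIconNode, gtNewSimdBinOpNode, BlockRange().InsertAfter) and assumes 'vec' is
already linked into the block range.

// Hypothetical LIR helper: shift each ulong lane of a SIMD16 value right by
// 'amount' bits. No compOpportunisticallyDependsOn check is needed because
// SSE2 is in the baseline set registered by setBaselineISAsSupported().
GenTree* Lowering::EmitBaselineShiftRight(GenTree* vec, ssize_t amount)
{
    GenTree* shiftBy = comp->gtNewIconNode(amount);
    BlockRange().InsertAfter(vec, shiftBy);

    // For ulong lanes, GT_RSZ maps to NI_SSE2_ShiftRightLogical; the helper's
    // internal compIsaSupportedDebugOnly assert now holds by construction.
    GenTree* shift = comp->gtNewSimdBinOpNode(GT_RSZ, TYP_SIMD16, vec, shiftBy, CORINFO_TYPE_ULONG, 16);
    BlockRange().InsertAfter(shiftBy, shift);
    return shift;
}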