diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp
index 97ba9bb6ad53b7..b14aab6bfb3ec1 100644
--- a/src/coreclr/jit/decomposelongs.cpp
+++ b/src/coreclr/jit/decomposelongs.cpp
@@ -587,40 +587,172 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
     }
 
 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
-    if (varTypeIsFloating(dstType))
+    if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType))
     {
         // We will reach this path only if morph did not convert the cast to a helper call,
         // meaning we can perform the cast using SIMD instructions.
-        // The sequence this creates is simply:
-        //   AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
-
-        NamedIntrinsic intrinsicId = NI_Illegal;
-        GenTree*       srcOp       = cast->CastOp();
 
         assert(!cast->gtOverflow());
         assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));
 
-        intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+        GenTree*   srcOp      = cast->CastOp();
+        GenTree*   castResult = nullptr;
+        LIR::Range castRange  = LIR::EmptyRange();
 
-        GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
-        GenTree* convert = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, srcType, 16);
-        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, dstType, 16);
+        // This creates the equivalent of the following C# code:
+        //   var srcVec = Vector128.CreateScalarUnsafe(castOp);
 
-        Range().InsertAfter(cast, createScalar, convert, toScalar);
-        Range().Remove(cast);
+        GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
+        castRange.InsertAtEnd(srcVector);
 
-        if (createScalar->IsCnsVec())
+        if (srcVector->IsCnsVec())
         {
             Range().Remove(srcOp);
         }
 
+        if (varTypeIsFloating(dstType))
+        {
+            // long->floating casts don't require any kind of fixup. We simply use the vector
+            // form of the instructions, because the scalar form is not supported on 32-bit.
+
+            NamedIntrinsic intrinsicId =
+                (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16);
+        }
+        else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
+        {
+            // Likewise, the AVX10.2 saturating floating->long instructions give the correct result,
+            // but we have to use the vector form.
+
+            NamedIntrinsic intrinsicId = (dstType == TYP_ULONG)
+                                             ? NI_AVX10v2_ConvertToVectorUInt64WithTruncatedSaturation
+                                             : NI_AVX10v2_ConvertToVectorInt64WithTruncatedSaturation;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16);
+        }
+        else if (dstType == TYP_ULONG)
+        {
+            // AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so
+            // we only need to fix up negative or NaN values before conversion.
+            //
+            // maxs[sd] will take the value from the second operand if the first operand's value is
+            // NaN, which allows us to fix up both negative and NaN values with a single instruction.
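+            //
+            // For example, an input of -1.5 or NaN becomes 0 after the max against zero, which is
+            // exactly the saturating result the cast to ulong requires for those inputs.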
+            //
+            // This creates the equivalent of the following C# code:
+            //   var fixupVal = Sse.MaxScalar(srcVec, Vector128<double>.Zero);
+            //   castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal);
+
+            GenTree* zero     = m_compiler->gtNewZeroConNode(TYP_SIMD16);
+            GenTree* fixupVal =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar, srcType, 16);
+
+            castRange.InsertAtEnd(zero);
+            castRange.InsertAtEnd(fixupVal);
+
+            castResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
+                                                     NI_AVX512_ConvertToVector128UInt64WithTruncation, srcType, 16);
+        }
+        else
+        {
+            assert(dstType == TYP_LONG);
+
+            // We will use the input value multiple times, so we replace it with a lclVar.
+            LIR::Use srcUse;
+            LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse);
+            srcUse.ReplaceWithLclVar(m_compiler);
+            srcVector = srcUse.Def();
+
+            // This logic is similar to the floating->long saturating logic in Lowering::LowerCast,
+            // except that here we must keep everything in SIMD registers. We can also take advantage
+            // of EVEX masking since the conversion itself requires AVX-512.
+            //
+            // We fix up NaN values by masking in zero during conversion. Negative saturation is handled
+            // correctly by the conversion instructions. Positive saturation is handled after conversion,
+            // because MaxValue is not precisely representable in the floating format.
+            //
+            // This creates roughly the equivalent of the following C# code:
+            //   var nanMask = Avx.CompareScalar(srcVec, srcVec, FloatComparisonMode.OrderedNonSignaling);
+            //
+            //   var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling;
+            //   var ovfFloatingValue = Vector128.Create(9223372036854775808.0);
+            //   var ovfMask = Avx.CompareScalar(srcVec, ovfFloatingValue, compareMode);
+
+            GenTree* srcClone    = m_compiler->gtClone(srcVector);
+            GenTree* compareMode =
+                m_compiler->gtNewIconNode(static_cast<int32_t>(FloatComparisonMode::OrderedNonSignaling));
+            GenTree* nanMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcVector, srcClone, compareMode,
+                                                                    NI_AVX512_CompareScalarMask, srcType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(nanMask);
+
+            compareMode = m_compiler->gtNewIconNode(
+                static_cast<int32_t>(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling));
+
+            GenTreeVecCon* ovfFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16);
+            ovfFloatingValue->EvaluateBroadcastInPlace(srcType, 9223372036854775808.0); // 2^63
+
+            srcClone         = m_compiler->gtClone(srcVector);
+            GenTree* ovfMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcClone, ovfFloatingValue, compareMode,
+                                                                    NI_AVX512_CompareScalarMask, srcType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(ovfFloatingValue);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(ovfMask);
+
+            // Now we convert, using the masks created above for NaN and positive overflow saturation.
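+            // The nanMask bit is set only when the input compares ordered with itself (i.e. is not
+            // NaN), so the first blend below takes zero instead of the conversion result for NaN;
+            // the ovfMask bit is set only when the input is >= 2^63, in which case the second blend
+            // substitutes long.MaxValue.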
+            //
+            // This creates roughly the equivalent of the following C# code:
+            //   var convert = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec);
+            //   var convertMasked = Avx512F.VL.BlendVariable(Vector128<long>.Zero, convert, nanMask);
+            //
+            //   var maxLong = Vector128.Create(long.MaxValue);
+            //   castResult = Avx512F.VL.BlendVariable(convertMasked, maxLong, ovfMask);
+
+            GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
+
+            srcClone         = m_compiler->gtClone(srcVector);
+            GenTree* convert =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone,
+                                                     NI_AVX512_ConvertToVector128Int64WithTruncation, srcType, 16);
+
+            castRange.InsertAtEnd(zero);
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(convert);
+
+            GenTree* convertMasked = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, zero, convert, nanMask,
+                                                                          NI_AVX512_BlendVariableMask, dstType, 16);
+
+            GenTreeVecCon* maxLong = m_compiler->gtNewVconNode(TYP_SIMD16);
+            maxLong->EvaluateBroadcastInPlace(dstType, INT64_MAX);
+
+            castRange.InsertAtEnd(convertMasked);
+            castRange.InsertAtEnd(maxLong);
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, convertMasked, maxLong, ovfMask,
+                                                              NI_AVX512_BlendVariableMask, dstType, 16);
+        }
+
+        // Because the results are in a SIMD register, we need to ToScalar() them out.
+        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstType, 16);
+
+        castRange.InsertAtEnd(castResult);
+        castRange.InsertAtEnd(toScalar);
+
+        Range().InsertAfter(cast, std::move(castRange));
+        Range().Remove(cast);
+
         if (use.IsDummyUse())
         {
             toScalar->SetUnusedValue();
         }
 
         use.ReplaceWith(toScalar);
-        return toScalar->gtNext;
+        return toScalar;
     }
 #endif // FEATURE_HW_INTRINSICS && TARGET_X86
diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index 8f5b5be1820ef7..0f427204e96f67 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -1339,12 +1339,8 @@ bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool o
     }
 
 #if defined(TARGET_X86) || defined(TARGET_ARM)
-    if (varTypeIsFloating(fromType) && varTypeIsLong(toType))
-    {
-        return true;
-    }
-
-    if (varTypeIsLong(fromType) && varTypeIsFloating(toType))
+    if ((varTypeIsLong(fromType) && varTypeIsFloating(toType)) ||
+        (varTypeIsFloating(fromType) && varTypeIsLong(toType)))
     {
 #if defined(TARGET_X86)
         return !compOpportunisticallyDependsOn(InstructionSet_AVX512);
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 826fba0d2ce604..1f0ec2db619558 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -6492,7 +6492,38 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
         {
             var_types dstType = tree->AsCast()->CastToType();
 
-            if (varTypeIsLong(dstType))
+            if (compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
+            {
+#if defined(TARGET_X86)
+                if (varTypeIsLong(dstType))
+                {
+                    // unsigned: vcvttp*2uqqs xmm0, xmm0
+                    //           vmovq        [mem], xmm0
+                    //
+                    // signed:   vcvttp*2qqs  xmm0, xmm0
+                    //           vmovq        [mem], xmm0
+
+                    costEx = 4 + FLT_IND_COST_EX; // 4 + FLT_IND_COST_EX
+                    costSz = 6 + 6;               // 12
+
+                    if (op1Type == TYP_FLOAT)
+                    {
+                        // vector widening float->long instructions take 1 extra cycle
+                        // compared to same-size conversion
+                        costEx += 1;
+                    }
+                }
+                else
+#endif
+                {
+                    // unsigned: vcvtts*2usis eax, xmm0
+                    // signed:   vcvtts*2sis  eax, xmm0
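+                    //
+                    // The saturating conversions handle NaN and out-of-range inputs directly,
+                    // so the cast is a single instruction with no fixup sequence around it.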
+
+                    costEx = 7;
+                    costSz = 6;
+                }
+            }
+            else if (varTypeIsLong(dstType))
             {
 #if defined(TARGET_AMD64)
                 if (varTypeIsUnsigned(dstType))
                 {
@@ -6540,24 +6571,59 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
                     costSz = 5 + 4 + 10 + 5 + 8 + 4; // 36
                 }
 #else
-                // unsigned: ...
-                //           call CORINFO_HELP_DBL2ULNG
-                //
-                // signed:   ...
-                //           call CORINFO_HELP_DBL2ULNG
-
-                costEx = 5 + (3 * IND_COST_EX); // CALL
-                costSz = 5;                     // 5
+                if (compOpportunisticallyDependsOn(InstructionSet_AVX512))
+                {
+                    if (varTypeIsUnsigned(dstType))
+                    {
+                        // vxorps      xmm1, xmm1, xmm1
+                        // vmaxs*      xmm0, xmm0, xmm1
+                        // vcvttp*2uqq xmm0, xmm0
+                        // vmovq       [mem], xmm0
 
-                level++;
+                        costEx = 1 + 4 + 4 + FLT_IND_COST_EX; // 9 + FLT_IND_COST_EX
+                        costSz = 4 + 4 + 6 + 6;               // 20
+                    }
+                    else
+                    {
+                        // vcmpords*   k1, xmm0, xmm0
+                        // vcmpge_oqs* k2, xmm0, qword ptr [@RWD00]
+                        // vcvttp*2qq  xmm0 {k1}{z}, xmm0
+                        // vpblendmq   xmm0 {k2}, xmm0, qword ptr [@RWD08] {1to2}
+                        // vmovq       [mem], xmm0
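+                        //
+                        // (@RWD00 and @RWD08 are the data-section constants holding 2^63 and
+                        // long.MaxValue for the compare and blend above.)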
+
+                        costEx = 4 + (4 + FLT_IND_COST_EX) + 4 + (1 + FLT_IND_COST_EX) +
+                                 FLT_IND_COST_EX;     // 13 + (3 * FLT_IND_COST_EX)
+                        costSz = 7 + 11 + 6 + 10 + 6; // 40
+                    }
 
-                if (op1Type == TYP_FLOAT)
+                    if (op1Type == TYP_FLOAT)
+                    {
+                        // vector widening float->long instructions take 1 extra cycle
+                        // compared to same-size conversion
+                        costEx += 1;
+                    }
+                }
+                else
                 {
-                    // vcvtss2sd xmm0, xmm0, xmm0
-                    // ...
+                    // unsigned: ...
+                    //           call CORINFO_HELP_DBL2ULNG
+                    //
+                    // signed:   ...
+                    //           call CORINFO_HELP_DBL2LNG
+
+                    costEx = 5 + (3 * IND_COST_EX); // CALL
+                    costSz = 5;                     // 5
+
+                    level++;
 
-                    costEx += 4; // 4 + CALL
-                    costSz += 4; // 9
+                    if (op1Type == TYP_FLOAT)
+                    {
+                        // vcvtss2sd xmm0, xmm0, xmm0
+                        // ...
+
+                        costEx += 4; // 4 + CALL
+                        costSz += 4; // 9
+                    }
                 }
 #endif
             }
@@ -34946,8 +35012,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                 break;
             }
 
-            bool maskIsZero    = false;
-            bool maskIsAllOnes = false;
+            bool maskIsZero       = false;
+            bool maskIsAllBitsSet = false;
 
             if (op3->IsCnsMsk())
             {
@@ -34958,7 +35024,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                     GenTreeMskCon* mask      = op3->AsMskCon();
                     uint32_t       elemCount = simdSize / genTypeSize(simdBaseType);
 
-                    maskIsAllOnes = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount);
+                    maskIsAllBitsSet = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount);
                 }
             }
             else
@@ -34969,11 +35035,11 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 
                 if (!maskIsZero)
                 {
-                    maskIsAllOnes = op3->IsVectorAllBitsSet();
+                    maskIsAllBitsSet = op3->IsVectorAllBitsSet();
                 }
             }
 
-            if (maskIsAllOnes)
+            if (maskIsAllBitsSet)
             {
                 if ((op1->gtFlags & GTF_SIDE_EFFECT) != 0)
                 {
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index af6e0c1f86f162..de2322872c60fc 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -1252,6 +1252,7 @@ HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThanOrEqualMask,
 HARDWARE_INTRINSIC(AVX512, CompareNotLessThanMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, 1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512, CompareNotLessThanOrEqualMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, 1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512, CompareOrderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, -1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512, CompareScalarMask, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpss, INS_vcmpsd}, -1, 4, HW_Category_IMM, HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512, CompareUnorderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, -1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512, CompressMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, 3, 3, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX512, CompressStoreMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, -1, -1, HW_Category_MemoryStore, HW_Flag_NoFlag)
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
index ec2d51c27a179b..e14e06e32adabc 100644
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -498,6 +498,7 @@ int HWIntrinsicInfo::lookupImmUpperBound(NamedIntrinsic id)
         case NI_AVX_CompareScalar:
         case NI_AVX512_Compare:
         case NI_AVX512_CompareMask:
+        case NI_AVX512_CompareScalarMask:
         case NI_AVX10v2_MinMaxScalar:
         case NI_AVX10v2_MinMax:
         {
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 61fef20d231ccc..dbcccbf478e0d7 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -1140,8 +1140,6 @@ void Lowering::LowerCast(GenTree* tree)
             // This creates the equivalent of the following C# code:
             //   var wrapVal = Sse.SubtractScalar(srcVec, ovfFloatingValue);
 
-            NamedIntrinsic subtractIntrinsic = NI_X86Base_SubtractScalar;
-
             // We're going to use ovfFloatingValue twice, so replace the constant with a lclVar.
             castRange.InsertAtEnd(ovfFloatingValue);
 
@@ -1173,7 +1171,7 @@ void Lowering::LowerCast(GenTree* tree)
             }
 
             GenTree* wrapVal = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, floorVal, ovfFloatingValue,
-                                                                    subtractIntrinsic, srcType, 16);
+                                                                    NI_X86Base_SubtractScalar, srcType, 16);
             castRange.InsertAtEnd(wrapVal);
 
             ovfFloatingValue = m_compiler->gtClone(ovfFloatingValue);
@@ -10496,6 +10494,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
         case NI_AVX512_Shuffle:
         case NI_AVX512_SumAbsoluteDifferencesInBlock32:
         case NI_AVX512_CompareMask:
+        case NI_AVX512_CompareScalarMask:
         case NI_AES_CarrylessMultiply:
         case NI_AES_V256_CarrylessMultiply:
         case NI_AES_V512_CarrylessMultiply: