From 47a6bc80466466fe270c6acb28292144fec16449 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Tue, 3 Mar 2026 19:35:34 -0800 Subject: [PATCH 1/7] accelerate floating->long casts on x86 --- src/coreclr/jit/decomposelongs.cpp | 163 ++++++++++++++++++++++--- src/coreclr/jit/flowgraph.cpp | 8 +- src/coreclr/jit/hwintrinsiclistxarch.h | 1 + src/coreclr/jit/hwintrinsicxarch.cpp | 1 + src/coreclr/jit/lowerxarch.cpp | 5 +- 5 files changed, 155 insertions(+), 23 deletions(-) diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index 97ba9bb6ad53b7..b97ab8cb3c578b 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -587,40 +587,175 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) } #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) - if (varTypeIsFloating(dstType)) + if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType)) { // We will reach this path only if morph did not convert the cast to a helper call, // meaning we can perform the cast using SIMD instructions. - // The sequence this creates is simply: - // AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar() - - NamedIntrinsic intrinsicId = NI_Illegal; - GenTree* srcOp = cast->CastOp(); assert(!cast->gtOverflow()); assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512)); - intrinsicId = (dstType == TYP_FLOAT) ? 
NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double; + GenTree* srcOp = cast->CastOp(); + GenTree* castResult = nullptr; + LIR::Range castRange = LIR::EmptyRange(); - GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16); - GenTree* convert = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, srcType, 16); - GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, dstType, 16); + // This creates the equivalent of the following C# code: + // var srcVec = Vector128.CreateScalarUnsafe(castOp); - Range().InsertAfter(cast, createScalar, convert, toScalar); - Range().Remove(cast); + GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16); + castRange.InsertAtEnd(srcVector); - if (createScalar->IsCnsVec()) + if (srcVector->IsCnsVec()) { Range().Remove(srcOp); } + if (varTypeIsFloating(dstType)) + { + // long->floating casts don't require any kind of fixup. We simply use the vector + // form of the instructions, because the scalar form is not supported on 32-bit. + + NamedIntrinsic intrinsicId = + (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double; + + castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16); + } + else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2)) + { + // Likewise, the AVX10.2 saturating floating->long instructions give the correct result, + // but we have to use the vector form. + + NamedIntrinsic intrinsicId = (dstType == TYP_ULONG) + ? 
NI_AVX10v2_ConvertToVectorUInt64WithTruncatedSaturation + : NI_AVX10v2_ConvertToVectorInt64WithTruncatedSaturation; + + castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16); + } + else if (dstType == TYP_ULONG) + { + // AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so + // we only need to fix up negative or NaN values before conversion. + // + // maxs[sd] will take the value from the second operand if the first operand's value is + // NaN, which allows us to fix up both negative and NaN values with a single instruction. + // + // This creates the equivalent of the following C# code: + // var fixupVal = Sse.MaxScalar(srcVec, Vector128.Zero); + // castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal); + + GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16); + GenTree* fixupVal = + m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar, srcType, 16); + + castRange.InsertAtEnd(zero); + castRange.InsertAtEnd(fixupVal); + + castResult = + m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal, + NI_AVX512_ConvertToVector128UInt64WithTruncation, srcType, 16); + } + else + { + assert(dstType == TYP_LONG); + + // We will use the input value multiple times, so we replace it with a lclVar. + LIR::Use srcUse; + LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse); + srcUse.ReplaceWithLclVar(m_compiler); + srcVector = srcUse.Def(); + + // We fix up NaN values by masking in zero during conversion. Negative saturation is handled + // correctly by the conversion instructions. Positive saturation is handled after conversion, + // because MaxValue is not precisely representable in the floating format. 
+ // + // This creates roughly the equivalent of the following C# code: + // var nanMask = Avx.CompareScalar(srcVec, srcVec, FloatComparisonMode.OrderedNonSignaling); + // var convert = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec); + // convertResult = Vector128.ConditionalSelect(nanMask, convert, Vector128.Zero); + + GenTree* srcClone = m_compiler->gtClone(srcVector); + GenTree* compareMode = + m_compiler->gtNewIconNode(static_cast(FloatComparisonMode::OrderedNonSignaling)); + GenTree* nanMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcVector, srcClone, compareMode, + NI_AVX512_CompareScalarMask, srcType, 16); + + castRange.InsertAtEnd(srcClone); + castRange.InsertAtEnd(compareMode); + castRange.InsertAtEnd(nanMask); + + srcClone = m_compiler->gtClone(srcVector); + GenTree* convertResult = + m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone, + NI_AVX512_ConvertToVector128Int64WithTruncation, srcType, 16); + + castRange.InsertAtEnd(srcClone); + castRange.InsertAtEnd(convertResult); + + nanMask = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, nanMask, dstType, 16); + GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16); + convertResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, nanMask, convertResult, zero, dstType, 16); + + castRange.InsertAtEnd(nanMask); + castRange.InsertAtEnd(zero); + castRange.InsertAtEnd(convertResult); + + // Now we handle saturation of the result for positive overflow. + // This logic is similar to the floating->long saturating logic in Lowering::LowerCast, + // except that here we must keep everything in SIMD registers. We can also take advantage + // of EVEX masking, so the construction and blending of `maxLong` is optimized for that. 
+ // + // This creates roughly the equivalent of the following C# code: + // var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling; + // var ovfFloatingValue = Vector128.Create(9223372036854775808.0); + // var ovfMask = Avx.CompareScalar(srcVec, ovfFloatingValue, compareMode); + // var maxLong = Vector128.AllBitsSet >>> 1; + // castResult = Vector128.ConditionalSelect(ovfMask, maxLong, convertResult); + + compareMode = m_compiler->gtNewIconNode( + static_cast(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling)); + + GenTreeVecCon* ovfFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16); + ovfFloatingValue->EvaluateBroadcastInPlace(srcType, 9223372036854775808.0); // 2^63 + + srcClone = m_compiler->gtClone(srcVector); + GenTree* ovfMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcClone, ovfFloatingValue, compareMode, + NI_AVX512_CompareScalarMask, srcType, 16); + + castRange.InsertAtEnd(srcClone); + castRange.InsertAtEnd(ovfFloatingValue); + castRange.InsertAtEnd(compareMode); + castRange.InsertAtEnd(ovfMask); + + GenTree* allBitsSet = m_compiler->gtNewAllBitsSetConNode(TYP_SIMD16); + GenTree* one = m_compiler->gtNewIconNode(1); + GenTree* maxLong = m_compiler->gtNewSimdBinOpNode(GT_RSZ, TYP_SIMD16, allBitsSet, one, dstType, 16); + + castRange.InsertAtEnd(allBitsSet); + castRange.InsertAtEnd(one); + castRange.InsertAtEnd(maxLong); + + ovfMask = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, ovfMask, dstType, 16); + castResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, ovfMask, maxLong, convertResult, dstType, 16); + + castRange.InsertAtEnd(ovfMask); + } + + // Because the results are in a SIMD register, we need to ToScalar() them out. 
+ GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstType, 16); + + castRange.InsertAtEnd(castResult); + castRange.InsertAtEnd(toScalar); + + Range().InsertAfter(cast, std::move(castRange)); + Range().Remove(cast); + if (use.IsDummyUse()) { toScalar->SetUnusedValue(); } use.ReplaceWith(toScalar); - return toScalar->gtNext; + return toScalar; } #endif // FEATURE_HW_INTRINSICS && TARGET_X86 diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp index cb24388b10af7a..572eb078e49473 100644 --- a/src/coreclr/jit/flowgraph.cpp +++ b/src/coreclr/jit/flowgraph.cpp @@ -1289,12 +1289,8 @@ bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool o } #if defined(TARGET_X86) || defined(TARGET_ARM) - if (varTypeIsFloating(fromType) && varTypeIsLong(toType)) - { - return true; - } - - if (varTypeIsLong(fromType) && varTypeIsFloating(toType)) + if ((varTypeIsLong(fromType) && varTypeIsFloating(toType)) || + (varTypeIsFloating(fromType) && varTypeIsLong(toType))) { #if defined(TARGET_X86) return !compOpportunisticallyDependsOn(InstructionSet_AVX512); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 584a1994539d64..07194a141b0639 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -1242,6 +1242,7 @@ HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThanOrEqualMask, HARDWARE_INTRINSIC(AVX512, CompareNotLessThanMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompareNotLessThanOrEqualMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompareOrderedMask, -1, 2, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512, CompareScalarMask, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpss, INS_vcmpsd}, HW_Category_IMM, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompareUnorderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompressMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(AVX512, CompressStoreMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, HW_Category_MemoryStore, HW_Flag_NoFlag) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 93c75e263682db..c89789f812bb1a 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -494,6 +494,7 @@ int HWIntrinsicInfo::lookupImmUpperBound(NamedIntrinsic id) case NI_AVX_CompareScalar: case NI_AVX512_Compare: case NI_AVX512_CompareMask: + case NI_AVX512_CompareScalarMask: case NI_AVX10v2_MinMaxScalar: case NI_AVX10v2_MinMax: { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index b72ee0f4086ea8..ca185cb7962332 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1140,8 +1140,6 @@ void Lowering::LowerCast(GenTree* tree) // This creates the equivalent of the following C# code: // var wrapVal = 
Sse.SubtractScalar(srcVec, ovfFloatingValue); - NamedIntrinsic subtractIntrinsic = NI_X86Base_SubtractScalar; - // We're going to use ovfFloatingValue twice, so replace the constant with a lclVar. castRange.InsertAtEnd(ovfFloatingValue); @@ -1173,7 +1171,7 @@ void Lowering::LowerCast(GenTree* tree) } GenTree* wrapVal = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, floorVal, ovfFloatingValue, - subtractIntrinsic, srcType, 16); + NI_X86Base_SubtractScalar, srcType, 16); castRange.InsertAtEnd(wrapVal); ovfFloatingValue = m_compiler->gtClone(ovfFloatingValue); @@ -10427,6 +10425,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) case NI_AVX512_Shuffle: case NI_AVX512_SumAbsoluteDifferencesInBlock32: case NI_AVX512_CompareMask: + case NI_AVX512_CompareScalarMask: case NI_AES_CarrylessMultiply: case NI_AES_V256_CarrylessMultiply: case NI_AES_V512_CarrylessMultiply: From 9b393fb62c2bc7c44a22ce764e8094097bd5558b Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 4 Mar 2026 09:48:50 -0800 Subject: [PATCH 2/7] rename variable --- src/coreclr/jit/gentree.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 5a65ddcadf0943..a16031fa8a891c 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -33870,8 +33870,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } - bool maskIsZero = false; - bool maskIsAllOnes = false; + bool maskIsZero = false; + bool maskIsAllBitsSet = false; if (op3->IsCnsMsk()) { @@ -33882,7 +33882,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) GenTreeMskCon* mask = op3->AsMskCon(); uint32_t elemCount = simdSize / genTypeSize(simdBaseType); - maskIsAllOnes = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount); + maskIsAllBitsSet = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount); } } else @@ -33893,11 +33893,11 @@ GenTree* 
Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) if (!maskIsZero) { - maskIsAllOnes = op3->IsVectorAllBitsSet(); + maskIsAllBitsSet = op3->IsVectorAllBitsSet(); } } - if (maskIsAllOnes) + if (maskIsAllBitsSet) { if ((op1->gtFlags & GTF_SIDE_EFFECT) != 0) { From bfede63569dd71d81371b846f3834dd35d409d99 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 16 Apr 2026 12:55:16 -0700 Subject: [PATCH 3/7] Update src/coreclr/jit/hwintrinsiclistxarch.h Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/coreclr/jit/hwintrinsiclistxarch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index cd658d7588d263..18f680fdaa2207 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -1285,7 +1285,7 @@ HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThanOrEqualMask, HARDWARE_INTRINSIC(AVX512, CompareNotLessThanMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompareNotLessThanOrEqualMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompareOrderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(AVX512, CompareScalarMask, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpss, INS_vcmpsd INS_invalid}, HW_Category_IMM, HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512, CompareScalarMask, 16, 3, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpss, INS_vcmpsd, INS_invalid}, HW_Category_IMM, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompareUnorderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompressMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(AVX512, CompressStoreMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoFlag) From 27c7ff2fc755d1d5d5ad27323fcaf5f10d913d28 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Sun, 19 Apr 2026 21:35:43 -0700 Subject: [PATCH 4/7] add costing info --- src/coreclr/jit/gentree.cpp | 97 +++++++++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 3e5c2dd103ed53..e74d162b58f2d3 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -6492,7 +6492,38 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) { var_types dstType = tree->AsCast()->CastToType(); - if (varTypeIsLong(dstType)) + if (compOpportunisticallyDependsOn(InstructionSet_AVX10v2)) + { +#if defined(TARGET_X86) + if (varTypeIsLong(dstType)) + { + // unsigned: vcvttp*2uqqs xmm0, xmm0 + // vmovq [mem], xmm0 + // + // unsigned: vcvttp*2qqs xmm0, xmm0 + // vmovq [mem], xmm0 + + costEx = 4 + FLT_IND_COST_EX; // 4 + FLT_IND_COST_EX + costSz = 6 + 6; // 12 + + if (op1Type == 
TYP_FLOAT) + { + // vector widening float->long instructions take 1 extra cycle + // compared to same-size conversion + costEx += 1; + } + } + else +#endif + { + // signed: vcvtts*2sis eax, xmm0 + // unsigned: vcvtts*2usis eax, xmm0 + + costEx = 7; + costSz = 6; + } + } + else if (varTypeIsLong(dstType)) { #if defined(TARGET_AMD64) if (varTypeIsUnsigned(dstType)) @@ -6540,24 +6571,60 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) costSz = 5 + 4 + 10 + 5 + 8 + 4; // 36 } #else - // unsigned: ... - // call CORINFO_HELP_DBL2ULNG - // - // signed: ... - // call CORINFO_HELP_DBL2ULNG - - costEx = 5 + (3 * IND_COST_EX); // CALL - costSz = 5; // 5 + if (compOpportunisticallyDependsOn(InstructionSet_AVX512)) + { + if (varTypeIsUnsigned(dstType)) + { + // vxorps xmm1, xmm1, xmm1 + // vmaxs* xmm0, xmm0, xmm1 + // vcvttp*2uqq xmm0, xmm0 + // vmovq [mem], xmm0 - level++; + costEx = 1 + 4 + 4 + FLT_IND_COST_EX; // 9 + FLT_IND_COST_EX + costSz = 4 + 4 + 6 + 6; // 20 + } + else + { + // vcmpords* k1, xmm0, xmm0 + // vcvttp*2qq xmm1 {k1}{z}, xmm0 + // vcmpge_oqs* k1, xmm0, qword ptr [@RWD00] + // vpcmpeqd xmm0, xmm0, xmm0 + // vpsrlq xmm1 {k1}, xmm0, 1 + // vmovq [mem], xmm0 + + costEx = 4 + 4 + 4 + FLT_IND_COST_EX + 1 + 1 + + FLT_IND_COST_EX; // 14 + (2 * FLT_IND_COST_EX) + costSz = 7 + 6 + 11 + 4 + 7 + 6; // 41 + } - if (op1Type == TYP_FLOAT) + if (op1Type == TYP_FLOAT) + { + // vector widening float->long instructions take 1 extra cycle + // compared to same-size conversion + costEx += 1; + } + } + else { - // vcvtss2sd xmm0, xmm0, xmm0 - // ... + // unsigned: ... + // call CORINFO_HELP_DBL2ULNG + // + // signed: ... + // call CORINFO_HELP_DBL2LNG + + costEx = 5 + (3 * IND_COST_EX); // CALL + costSz = 5; // 5 + + level++; - costEx += 4; // 4 + CALL - costSz += 4; // 9 + if (op1Type == TYP_FLOAT) + { + // vcvtss2sd xmm0, xmm0, xmm0 + // ... 
+ + costEx += 4; // 4 + CALL + costSz += 4; // 9 + } } #endif } From 23fbdbfbfc34cee54e0cbf179c28229c76d4bc14 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Sun, 19 Apr 2026 21:47:13 -0700 Subject: [PATCH 5/7] fix comment typo --- src/coreclr/jit/gentree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index e74d162b58f2d3..7a4927ff8692c8 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -6500,7 +6500,7 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) // unsigned: vcvttp*2uqqs xmm0, xmm0 // vmovq [mem], xmm0 // - // unsigned: vcvttp*2qqs xmm0, xmm0 + // signed: vcvttp*2qqs xmm0, xmm0 // vmovq [mem], xmm0 costEx = 4 + FLT_IND_COST_EX; // 4 + FLT_IND_COST_EX From c8f26cb18f199853a336af1e70849e56d4a11204 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Tue, 21 Apr 2026 17:47:17 -0700 Subject: [PATCH 6/7] use normal VEC_CNS for long.MaxValue --- src/coreclr/jit/decomposelongs.cpp | 76 +++++++++++++++--------------- src/coreclr/jit/gentree.cpp | 15 +++--- 2 files changed, 45 insertions(+), 46 deletions(-) diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index b97ab8cb3c578b..9795672c70c88c 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -664,14 +664,20 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) srcUse.ReplaceWithLclVar(m_compiler); srcVector = srcUse.Def(); + // This logic is similar to the floating->long saturating logic in Lowering::LowerCast, + // except that here we must keep everything in SIMD registers. We can also take advantage + // of EVEX masking since the conversion itself requires AVX-512. + // // We fix up NaN values by masking in zero during conversion. Negative saturation is handled // correctly by the conversion instructions. 
Positive saturation is handled after conversion, // because MaxValue is not precisely representable in the floating format. // // This creates roughly the equivalent of the following C# code: // var nanMask = Avx.CompareScalar(srcVec, srcVec, FloatComparisonMode.OrderedNonSignaling); - // var convert = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec); - // convertResult = Vector128.ConditionalSelect(nanMask, convert, Vector128.Zero); + // + // var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling; + // var ovfFloatingValue = Vector128.Create(9223372036854775808.0); + // var ovfMask = Avx.CompareScalar(srcVec, ovfFloatingValue, compareMode); GenTree* srcClone = m_compiler->gtClone(srcVector); GenTree* compareMode = @@ -683,34 +689,6 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) castRange.InsertAtEnd(compareMode); castRange.InsertAtEnd(nanMask); - srcClone = m_compiler->gtClone(srcVector); - GenTree* convertResult = - m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone, - NI_AVX512_ConvertToVector128Int64WithTruncation, srcType, 16); - - castRange.InsertAtEnd(srcClone); - castRange.InsertAtEnd(convertResult); - - nanMask = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, nanMask, dstType, 16); - GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16); - convertResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, nanMask, convertResult, zero, dstType, 16); - - castRange.InsertAtEnd(nanMask); - castRange.InsertAtEnd(zero); - castRange.InsertAtEnd(convertResult); - - // Now we handle saturation of the result for positive overflow. - // This logic is similar to the floating->long saturating logic in Lowering::LowerCast, - // except that here we must keep everything in SIMD registers. We can also take advantage - // of EVEX masking, so the construction and blending of `maxLong` is optimized for that. 
- // - // This creates roughly the equivalent of the following C# code: - // var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling; - // var ovfFloatingValue = Vector128.Create(9223372036854775808.0); - // var ovfMask = Avx.CompareScalar(srcVec, ovfFloatingValue, compareMode); - // var maxLong = Vector128.AllBitsSet >>> 1; - // castResult = Vector128.ConditionalSelect(ovfMask, maxLong, convertResult); - compareMode = m_compiler->gtNewIconNode( static_cast(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling)); @@ -726,18 +704,40 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) castRange.InsertAtEnd(compareMode); castRange.InsertAtEnd(ovfMask); - GenTree* allBitsSet = m_compiler->gtNewAllBitsSetConNode(TYP_SIMD16); - GenTree* one = m_compiler->gtNewIconNode(1); - GenTree* maxLong = m_compiler->gtNewSimdBinOpNode(GT_RSZ, TYP_SIMD16, allBitsSet, one, dstType, 16); + // Now we convert, using the masks created above for NaN and overflow saturation. + // + // This creates roughly the equivalent of the following C# code: + // var convert = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec); + // convertResult = Vector128.ConditionalSelect(nanMask, convert, Vector128.Zero); + // + // var maxLong = Vector128.Create(long.MaxValue); + // castResult = Vector128.ConditionalSelect(ovfMask, maxLong, convertResult); - castRange.InsertAtEnd(allBitsSet); - castRange.InsertAtEnd(one); - castRange.InsertAtEnd(maxLong); + srcClone = m_compiler->gtClone(srcVector); + GenTree* convertResult = + m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone, + NI_AVX512_ConvertToVector128Int64WithTruncation, srcType, 16); - ovfMask = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, ovfMask, dstType, 16); - castResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, ovfMask, maxLong, convertResult, dstType, 16); + castRange.InsertAtEnd(srcClone); + castRange.InsertAtEnd(convertResult); + + nanMask = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, 
nanMask, dstType, 16); + GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16); + + castRange.InsertAtEnd(nanMask); + castRange.InsertAtEnd(zero); + GenTreeVecCon* maxLong = m_compiler->gtNewVconNode(TYP_SIMD16); + maxLong->EvaluateBroadcastInPlace(dstType, INT64_MAX); + + ovfMask = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, ovfMask, dstType, 16); + convertResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, nanMask, convertResult, zero, dstType, 16); + + castRange.InsertAtEnd(maxLong); castRange.InsertAtEnd(ovfMask); + castRange.InsertAtEnd(convertResult); + + castResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, ovfMask, maxLong, convertResult, dstType, 16); } // Because the results are in a SIMD register, we need to ToScalar() them out. diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 7a4927ff8692c8..1f0ec2db619558 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -6516,8 +6516,8 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) else #endif { - // signed: vcvtts*2sis eax, xmm0 // unsigned: vcvtts*2usis eax, xmm0 + // signed: vcvtts*2sis eax, xmm0 costEx = 7; costSz = 6; @@ -6586,15 +6586,14 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) else { // vcmpords* k1, xmm0, xmm0 - // vcvttp*2qq xmm1 {k1}{z}, xmm0 - // vcmpge_oqs* k1, xmm0, qword ptr [@RWD00] - // vpcmpeqd xmm0, xmm0, xmm0 - // vpsrlq xmm1 {k1}, xmm0, 1 + // vcmpge_oqs* k2, xmm0, qword ptr [@RWD00] + // vcvttp*2qq xmm0 {k1}{z}, xmm0 + // vpblendmq xmm0 {k2}, xmm0, qword ptr [@RWD08] {1to2} // vmovq [mem], xmm0 - costEx = 4 + 4 + 4 + FLT_IND_COST_EX + 1 + 1 + - FLT_IND_COST_EX; // 14 + (2 * FLT_IND_COST_EX) - costSz = 7 + 6 + 11 + 4 + 7 + 6; // 41 + costEx = 4 + (4 + FLT_IND_COST_EX) + 4 + (1 + FLT_IND_COST_EX) + + FLT_IND_COST_EX; // 13 + (3 * FLT_IND_COST_EX) + costSz = 7 + 11 + 6 + 10 + 6; // 40 } if (op1Type == TYP_FLOAT) From 858db2caee82aa6ecaf6eaa6c7ae8cca7b241046 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 
22 Apr 2026 12:01:03 -0700 Subject: [PATCH 7/7] use BlendVariableMask directly --- src/coreclr/jit/decomposelongs.cpp | 33 ++++++++++++++---------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index 9795672c70c88c..b14aab6bfb3ec1 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -704,40 +704,37 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) castRange.InsertAtEnd(compareMode); castRange.InsertAtEnd(ovfMask); - // Now we convert, using the masks created above for NaN and overflow saturation. + // Now we convert, using the masks created above for NaN and positive overflow saturation. // // This creates roughly the equivalent of the following C# code: - // var convert = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec); - // convertResult = Vector128.ConditionalSelect(nanMask, convert, Vector128.Zero); + // var convert = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec); + // var convertMasked = Avx512F.VL.BlendVariable(Vector128.Zero, convert, nanMask); // - // var maxLong = Vector128.Create(long.MaxValue); - // castResult = Vector128.ConditionalSelect(ovfMask, maxLong, convertResult); + // var maxLong = Vector128.Create(long.MaxValue); + // castResult = Avx512F.VL.BlendVariable(convertMasked, maxLong, ovfMask); + + GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16); srcClone = m_compiler->gtClone(srcVector); - GenTree* convertResult = + GenTree* convert = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone, NI_AVX512_ConvertToVector128Int64WithTruncation, srcType, 16); + castRange.InsertAtEnd(zero); castRange.InsertAtEnd(srcClone); - castRange.InsertAtEnd(convertResult); + castRange.InsertAtEnd(convert); - nanMask = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, nanMask, dstType, 16); - GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16); - - castRange.InsertAtEnd(nanMask); - 
castRange.InsertAtEnd(zero); + GenTree* convertMasked = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, zero, convert, nanMask, + NI_AVX512_BlendVariableMask, dstType, 16); GenTreeVecCon* maxLong = m_compiler->gtNewVconNode(TYP_SIMD16); maxLong->EvaluateBroadcastInPlace(dstType, INT64_MAX); - ovfMask = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, ovfMask, dstType, 16); - convertResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, nanMask, convertResult, zero, dstType, 16); - + castRange.InsertAtEnd(convertMasked); castRange.InsertAtEnd(maxLong); - castRange.InsertAtEnd(ovfMask); - castRange.InsertAtEnd(convertResult); - castResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, ovfMask, maxLong, convertResult, dstType, 16); + castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, convertMasked, maxLong, ovfMask, + NI_AVX512_BlendVariableMask, dstType, 16); } // Because the results are in a SIMD register, we need to ToScalar() them out.