From 04b1bbe9070a7c916352802a6f404ef6d936c453 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 26 Mar 2025 10:09:09 -0700 Subject: [PATCH 1/4] use SIMD conversion instructions for long -> floating casts --- src/coreclr/jit/morph.cpp | 40 +++++++++++++++++-- .../JitBlue/Runtime_106338/Runtime_106338.cs | 2 +- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 1fd3c5fef69f26..a0354561debacc 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -417,15 +417,20 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) // Because there is no IL instruction conv.r4.un, uint/ulong -> float // casts are always imported as CAST(float <- CAST(double <- uint/ulong)). // We can usually eliminate the redundant intermediate cast as an optimization. + // // AArch and xarch+EVEX have instructions that can cast directly from - // all integers (except for longs on 32-bit of course) to floats. + // all integers (except for longs on ARM32) to floats. // On x64, we also have the option of widening uint -> long and // using the signed conversion instructions, and ulong -> float/double // is handled directly in codegen, so we can allow all casts. + // + // This logic will also catch CAST(float <- CAST(double <- float)) + // and reduce it to CAST(float <- float), which is handled in codegen as + // an optional mov. else if ((dstType == TYP_FLOAT) && (srcType == TYP_DOUBLE) && oper->OperIs(GT_CAST) -#ifndef TARGET_64BIT +#ifdef TARGET_ARM && !varTypeIsLong(oper->AsCast()->CastOp()) -#endif // !TARGET_64BIT +#endif // TARGET_ARM #ifdef TARGET_X86 && canUseEvexEncoding() #endif // TARGET_X86 @@ -481,6 +486,35 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) #endif // TARGET_AMD64 #ifdef TARGET_X86 +#ifdef FEATURE_HW_INTRINSICS + else if (varTypeIsLong(srcType) && varTypeIsFloating(dstType) && canUseEvexEncoding()) + { + // We can avoid helper calls by using SIMD conversion instructions. The result needs to end up + // in a SIMD/floating register anyway. + NamedIntrinsic intrinsicId = NI_Illegal; + CorInfoType baseFloatingType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE; + CorInfoType baseIntegralType = tree->IsUnsigned() ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG; + + if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ_VL)) + { + intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512DQ_VL_ConvertToVector128Single + : NI_AVX512DQ_VL_ConvertToVector128Double; + } + else + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX10v1)); + intrinsicId = + (dstType == TYP_FLOAT) ? NI_AVX10v1_ConvertToVector128Single : NI_AVX10v1_ConvertToVector128Double; + } + + GenTree* createScalar = gtNewSimdCreateScalarNode(TYP_SIMD16, oper, baseIntegralType, 16); + GenTree* convert = gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16); + GenTree* toScalar = gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16); + + return fgMorphHWIntrinsic(toScalar->AsHWIntrinsic()); + } +#endif // FEATURE_HW_INTRINSICS + // Do we have to do two step U4/8 -> R4/8 ? else if (tree->IsUnsigned() && varTypeIsFloating(dstType)) { diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs b/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs index ed1b5cd56ace7a..504c84c3ab2959 100644 --- a/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs +++ b/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs @@ -22,7 +22,7 @@ public static void TestEntryPoint() float vr11 = 4294967295U | vr10; uint result = BitConverter.SingleToUInt32Bits(vr11); - if ((RuntimeInformation.ProcessArchitecture == Architecture.Arm64) || (RuntimeInformation.ProcessArchitecture == Architecture.X64)) + if ((RuntimeInformation.ProcessArchitecture == Architecture.Arm64) || (RuntimeInformation.ProcessArchitecture == Architecture.X64) || Avx512DQ.VL.IsSupported || Avx10v1.IsSupported) { // Expected to cast ulong -> float directly Assert.Equal(1600094603U, result); From ab5d9194eb1ca9e2d49a706eb5b8a48f7a4d09b6 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Fri, 28 Mar 2025 00:47:21 -0700 Subject: [PATCH 2/4] move transform to DecomposeLongs, restore double intermediate --- src/coreclr/jit/decomposelongs.cpp | 92 +++++++++++++++---- src/coreclr/jit/morph.cpp | 30 +----- .../JitBlue/Runtime_106338/Runtime_106338.cs | 2 +- 3 files changed, 78 insertions(+), 46 deletions(-) diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index 0461a12d181d3a..d8c0e7b31ac168 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -137,7 +137,11 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) } } +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) + if (!tree->TypeIs(TYP_LONG) && !(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree))) +#else if (!tree->TypeIs(TYP_LONG)) +#endif // FEATURE_HW_INTRINSICS && TARGET_X86 { return tree->gtNext; } @@ -157,15 +161,18 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) GenTree* user = use.User(); - if (user->OperIsHWIntrinsic()) + if (tree->TypeIs(TYP_LONG) && (user->OperIsHWIntrinsic() || (user->OperIs(GT_CAST) && varTypeIsFloating(user)))) { if (tree->OperIs(GT_CNS_LNG) || (tree->OperIs(GT_IND, GT_LCL_FLD) && m_lowering->IsSafeToContainMem(user, tree))) { - NamedIntrinsic intrinsicId = user->AsHWIntrinsic()->GetHWIntrinsicId(); - assert(HWIntrinsicInfo::IsVectorCreate(intrinsicId) || - HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) || - HWIntrinsicInfo::IsVectorCreateScalarUnsafe(intrinsicId)); + if (user->OperIsHWIntrinsic()) + { + NamedIntrinsic intrinsicId = user->AsHWIntrinsic()->GetHWIntrinsicId(); + assert(HWIntrinsicInfo::IsVectorCreate(intrinsicId) || + HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) || + HWIntrinsicInfo::IsVectorCreateScalarUnsafe(intrinsicId)); + } return tree->gtNext; } @@ -562,28 +569,73 @@ GenTree* DecomposeLongs::DecomposeStoreLclFld(LIR::Use& use) GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) { assert(use.IsInitialized()); - assert(use.Def()->OperGet() == GT_CAST); - - GenTree* cast = use.Def()->AsCast(); - GenTree* loResult = nullptr; - GenTree* hiResult = nullptr; + assert(use.Def()->OperIs(GT_CAST)); - var_types srcType = cast->CastFromType(); - var_types dstType = cast->CastToType(); + GenTreeCast* cast = use.Def()->AsCast(); + var_types srcType = cast->CastFromType(); + var_types dstType = cast->CastToType(); - if ((cast->gtFlags & GTF_UNSIGNED) != 0) + if (cast->IsUnsigned()) { srcType = varTypeToUnsigned(srcType); } - bool skipDecomposition = false; +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) + if (varTypeIsFloating(dstType)) + { + // We will reach this path only if morph did not convert the cast to a helper call, + // meaning we can perform the cast using SIMD instructions. + // The sequence this creates is simply: + // AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalar(LONG)).ToScalar() + + NamedIntrinsic intrinsicId = NI_Illegal; + GenTree* srcOp = cast->CastOp(); + var_types dstType = cast->CastToType(); + CorInfoType baseFloatingType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE; + CorInfoType baseIntegralType = cast->IsUnsigned() ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG; + + assert(!cast->gtOverflow()); + + if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512DQ_VL)) + { + intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512DQ_VL_ConvertToVector128Single + : NI_AVX512DQ_VL_ConvertToVector128Double; + } + else + { + assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v1)); + intrinsicId = + (dstType == TYP_FLOAT) ? NI_AVX10v1_ConvertToVector128Single : NI_AVX10v1_ConvertToVector128Double; + } + + GenTree* createScalar = m_compiler->gtNewSimdCreateScalarNode(TYP_SIMD16, srcOp, baseIntegralType, 16); + GenTree* convert = + m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16); + GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16); + + Range().InsertAfter(cast, createScalar, convert, toScalar); + Range().Remove(cast); + + if (use.IsDummyUse()) + { + toScalar->SetUnusedValue(); + } + use.ReplaceWith(toScalar); + + return toScalar->gtNext; + } +#endif // FEATURE_HW_INTRINSICS && TARGET_X86 + + bool skipDecomposition = false; + GenTree* loResult = nullptr; + GenTree* hiResult = nullptr; if (varTypeIsLong(srcType)) { if (cast->gtOverflow() && (varTypeIsUnsigned(srcType) != varTypeIsUnsigned(dstType))) { - GenTree* srcOp = cast->gtGetOp1(); - noway_assert(srcOp->OperGet() == GT_LONG); + GenTree* srcOp = cast->CastOp(); + noway_assert(srcOp->OperIs(GT_LONG)); GenTree* loSrcOp = srcOp->gtGetOp1(); GenTree* hiSrcOp = srcOp->gtGetOp2(); @@ -595,13 +647,13 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) // check provided by codegen. // - const bool signExtend = (cast->gtFlags & GTF_UNSIGNED) == 0; + const bool signExtend = !cast->IsUnsigned(); loResult = EnsureIntSized(loSrcOp, signExtend); hiResult = cast; hiResult->gtType = TYP_INT; hiResult->AsCast()->gtCastType = TYP_UINT; - hiResult->gtFlags &= ~GTF_UNSIGNED; + hiResult->ClearUnsigned(); hiResult->AsOp()->gtOp1 = hiSrcOp; Range().Remove(srcOp); @@ -631,7 +683,7 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) } else { - if (!use.IsDummyUse() && (use.User()->OperGet() == GT_MUL)) + if (!use.IsDummyUse() && use.User()->OperIs(GT_MUL)) { // // This int->long cast is used by a GT_MUL that will be transformed by DecomposeMul into a @@ -646,7 +698,7 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) } else if (varTypeIsUnsigned(srcType)) { - const bool signExtend = (cast->gtFlags & GTF_UNSIGNED) == 0; + const bool signExtend = !cast->IsUnsigned(); loResult = EnsureIntSized(cast->gtGetOp1(), signExtend); hiResult = m_compiler->gtNewZeroConNode(TYP_INT); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 9bf67522b41a4a..78f0f04d57c2d6 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -428,9 +428,9 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) // and reduce it to CAST(float <- float), which is handled in codegen as // an optional mov. else if ((dstType == TYP_FLOAT) && (srcType == TYP_DOUBLE) && oper->OperIs(GT_CAST) -#ifdef TARGET_ARM +#ifndef TARGET_64BIT && !varTypeIsLong(oper->AsCast()->CastOp()) -#endif // TARGET_ARM +#endif // !TARGET_64BIT #ifdef TARGET_X86 && canUseEvexEncoding() #endif // TARGET_X86 @@ -489,29 +489,9 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) #ifdef FEATURE_HW_INTRINSICS else if (varTypeIsLong(srcType) && varTypeIsFloating(dstType) && canUseEvexEncoding()) { - // We can avoid helper calls by using SIMD conversion instructions. The result needs to end up - // in a SIMD/floating register anyway. - NamedIntrinsic intrinsicId = NI_Illegal; - CorInfoType baseFloatingType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE; - CorInfoType baseIntegralType = tree->IsUnsigned() ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG; - - if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ_VL)) - { - intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512DQ_VL_ConvertToVector128Single - : NI_AVX512DQ_VL_ConvertToVector128Double; - } - else - { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX10v1)); - intrinsicId = - (dstType == TYP_FLOAT) ? NI_AVX10v1_ConvertToVector128Single : NI_AVX10v1_ConvertToVector128Double; - } - - GenTree* createScalar = gtNewSimdCreateScalarNode(TYP_SIMD16, oper, baseIntegralType, 16); - GenTree* convert = gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16); - GenTree* toScalar = gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16); - - return fgMorphHWIntrinsic(toScalar->AsHWIntrinsic()); + // We can handle these casts directly using SIMD instructions. + // The transform to SIMD is done in DecomposeLongs. + return nullptr; } #endif // FEATURE_HW_INTRINSICS diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs b/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs index 504c84c3ab2959..ed1b5cd56ace7a 100644 --- a/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs +++ b/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs @@ -22,7 +22,7 @@ public static void TestEntryPoint() float vr11 = 4294967295U | vr10; uint result = BitConverter.SingleToUInt32Bits(vr11); - if ((RuntimeInformation.ProcessArchitecture == Architecture.Arm64) || (RuntimeInformation.ProcessArchitecture == Architecture.X64) || Avx512DQ.VL.IsSupported || Avx10v1.IsSupported) + if ((RuntimeInformation.ProcessArchitecture == Architecture.Arm64) || (RuntimeInformation.ProcessArchitecture == Architecture.X64)) { // Expected to cast ulong -> float directly Assert.Equal(1600094603U, result); From 1bd047ad60ad56de7a50218ad762957d3ccd6f5d Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Fri, 28 Mar 2025 00:56:24 -0700 Subject: [PATCH 3/4] formatting --- src/coreclr/jit/decomposelongs.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index d8c0e7b31ac168..45a236f9b8c1d8 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -138,7 +138,8 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) } #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) - if (!tree->TypeIs(TYP_LONG) && !(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree))) + if (!tree->TypeIs(TYP_LONG) && + !(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree))) #else if (!tree->TypeIs(TYP_LONG)) #endif // FEATURE_HW_INTRINSICS && TARGET_X86 From c73acd5a7f6dda51595d5ddb3ae67bd275928e16 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Fri, 28 Mar 2025 12:35:59 -0700 Subject: [PATCH 4/4] handle constants --- src/coreclr/jit/decomposelongs.cpp | 9 +++++++-- src/coreclr/jit/morph.cpp | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index 45a236f9b8c1d8..ddc55e633292fa 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -587,7 +587,7 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) // We will reach this path only if morph did not convert the cast to a helper call, // meaning we can perform the cast using SIMD instructions. // The sequence this creates is simply: - // AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalar(LONG)).ToScalar() + // AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar() NamedIntrinsic intrinsicId = NI_Illegal; GenTree* srcOp = cast->CastOp(); @@ -609,7 +609,7 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) (dstType == TYP_FLOAT) ? NI_AVX10v1_ConvertToVector128Single : NI_AVX10v1_ConvertToVector128Double; } - GenTree* createScalar = m_compiler->gtNewSimdCreateScalarNode(TYP_SIMD16, srcOp, baseIntegralType, 16); + GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, baseIntegralType, 16); GenTree* convert = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16); GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16); @@ -617,6 +617,11 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) Range().InsertAfter(cast, createScalar, convert, toScalar); Range().Remove(cast); + if (createScalar->IsCnsVec()) + { + Range().Remove(srcOp); + } + if (use.IsDummyUse()) { toScalar->SetUnusedValue(); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 78f0f04d57c2d6..8993825596d34e 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -508,7 +508,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) { oper = gtNewCastNode(TYP_LONG, oper, true, TYP_LONG); oper->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT)); - tree->gtFlags &= ~GTF_UNSIGNED; + tree->ClearUnsigned(); return fgMorphCastIntoHelper(tree, CORINFO_HELP_LNG2DBL, oper); } }