diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp
index 0461a12d181d3a..ddc55e633292fa 100644
--- a/src/coreclr/jit/decomposelongs.cpp
+++ b/src/coreclr/jit/decomposelongs.cpp
@@ -137,7 +137,12 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
         }
     }
 
+#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
+    if (!tree->TypeIs(TYP_LONG) &&
+        !(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree)))
+#else
     if (!tree->TypeIs(TYP_LONG))
+#endif // FEATURE_HW_INTRINSICS && TARGET_X86
     {
         return tree->gtNext;
     }
@@ -157,15 +162,18 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
 
         GenTree* user = use.User();
 
-        if (user->OperIsHWIntrinsic())
+        if (tree->TypeIs(TYP_LONG) && (user->OperIsHWIntrinsic() || (user->OperIs(GT_CAST) && varTypeIsFloating(user))))
         {
             if (tree->OperIs(GT_CNS_LNG) ||
                 (tree->OperIs(GT_IND, GT_LCL_FLD) && m_lowering->IsSafeToContainMem(user, tree)))
             {
-                NamedIntrinsic intrinsicId = user->AsHWIntrinsic()->GetHWIntrinsicId();
-                assert(HWIntrinsicInfo::IsVectorCreate(intrinsicId) ||
-                       HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) ||
-                       HWIntrinsicInfo::IsVectorCreateScalarUnsafe(intrinsicId));
+                if (user->OperIsHWIntrinsic())
+                {
+                    NamedIntrinsic intrinsicId = user->AsHWIntrinsic()->GetHWIntrinsicId();
+                    assert(HWIntrinsicInfo::IsVectorCreate(intrinsicId) ||
+                           HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) ||
+                           HWIntrinsicInfo::IsVectorCreateScalarUnsafe(intrinsicId));
+                }
 
                 return tree->gtNext;
             }
@@ -562,28 +570,78 @@ GenTree* DecomposeLongs::DecomposeStoreLclFld(LIR::Use& use)
 GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
 {
     assert(use.IsInitialized());
-    assert(use.Def()->OperGet() == GT_CAST);
+    assert(use.Def()->OperIs(GT_CAST));
 
-    GenTree*  cast     = use.Def()->AsCast();
-    GenTree*  loResult = nullptr;
-    GenTree*  hiResult = nullptr;
+    GenTreeCast* cast    = use.Def()->AsCast();
+    var_types    srcType = cast->CastFromType();
+    var_types    dstType = cast->CastToType();
 
-    var_types srcType = cast->CastFromType();
-    var_types dstType = cast->CastToType();
-
-    if ((cast->gtFlags & GTF_UNSIGNED) != 0)
+    if (cast->IsUnsigned())
     {
         srcType = varTypeToUnsigned(srcType);
     }
 
-    bool skipDecomposition = false;
+#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
+    if (varTypeIsFloating(dstType))
+    {
+        // We will reach this path only if morph did not convert the cast to a helper call,
+        // meaning we can perform the cast using SIMD instructions.
+        // The sequence this creates is simply:
+        //    AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
+
+        NamedIntrinsic intrinsicId      = NI_Illegal;
+        GenTree*       srcOp            = cast->CastOp();
+        CorInfoType    baseFloatingType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
+        CorInfoType    baseIntegralType = cast->IsUnsigned() ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
+
+        assert(!cast->gtOverflow());
+
+        if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512DQ_VL))
+        {
+            intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512DQ_VL_ConvertToVector128Single
+                                                 : NI_AVX512DQ_VL_ConvertToVector128Double;
+        }
+        else
+        {
+            assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v1));
+            intrinsicId =
+                (dstType == TYP_FLOAT) ? NI_AVX10v1_ConvertToVector128Single : NI_AVX10v1_ConvertToVector128Double;
+        }
+
+        GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, baseIntegralType, 16);
+        GenTree* convert =
+            m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16);
+        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16);
+
+        Range().InsertAfter(cast, createScalar, convert, toScalar);
+        Range().Remove(cast);
+
+        if (createScalar->IsCnsVec())
+        {
+            Range().Remove(srcOp);
+        }
+
+        if (use.IsDummyUse())
+        {
+            toScalar->SetUnusedValue();
+        }
+        use.ReplaceWith(toScalar);
+
+        return toScalar->gtNext;
+    }
+#endif // FEATURE_HW_INTRINSICS && TARGET_X86
+
+    bool     skipDecomposition = false;
+    GenTree* loResult          = nullptr;
+    GenTree* hiResult          = nullptr;
 
     if (varTypeIsLong(srcType))
     {
         if (cast->gtOverflow() && (varTypeIsUnsigned(srcType) != varTypeIsUnsigned(dstType)))
         {
-            GenTree* srcOp = cast->gtGetOp1();
-            noway_assert(srcOp->OperGet() == GT_LONG);
+            GenTree* srcOp = cast->CastOp();
+            noway_assert(srcOp->OperIs(GT_LONG));
             GenTree* loSrcOp = srcOp->gtGetOp1();
             GenTree* hiSrcOp = srcOp->gtGetOp2();
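A note on the DecomposeCast hunk above: the sequence it builds maps closely onto the following standalone C++ sketch written with Intel intrinsics from immintrin.h. This is an illustration only, assuming an AVX512DQ+VL-capable CPU and toolchain (e.g. `-mavx512dq -mavx512vl`); the `_mm_*` names are compiler intrinsics, not the JIT's `NI_*` table, and `Int64ToFloat` is a made-up name for the example.

```cpp
// Sketch of: AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static float Int64ToFloat(int64_t value)
{
    // Vector128.CreateScalarUnsafe(LONG): on 32-bit x86 the value cannot be
    // moved from a single GPR, so a 64-bit memory load stands in here.
    __m128i vec = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&value));

    // AVX512DQ.VL.ConvertToVector128Single: vcvtqq2ps xmm, xmm
    __m128 cvt = _mm_cvtepi64_ps(vec);

    // ToScalar(): read back the low float element.
    return _mm_cvtss_f32(cvt);
}

int main()
{
    printf("%f\n", Int64ToFloat(INT64_C(123456789012345)));
    return 0;
}
```

On hardware without the required EVEX support the JIT keeps the existing helper-call path, which is why the morph change below only skips the helper when `canUseEvexEncoding()` holds.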
@@ -595,13 +653,13 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
             // check provided by codegen.
             //
 
-            const bool signExtend = (cast->gtFlags & GTF_UNSIGNED) == 0;
+            const bool signExtend = !cast->IsUnsigned();
             loResult              = EnsureIntSized(loSrcOp, signExtend);
 
             hiResult                       = cast;
             hiResult->gtType               = TYP_INT;
             hiResult->AsCast()->gtCastType = TYP_UINT;
-            hiResult->gtFlags &= ~GTF_UNSIGNED;
+            hiResult->ClearUnsigned();
             hiResult->AsOp()->gtOp1 = hiSrcOp;
 
             Range().Remove(srcOp);
@@ -631,7 +689,7 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
         }
         else
         {
-            if (!use.IsDummyUse() && (use.User()->OperGet() == GT_MUL))
+            if (!use.IsDummyUse() && use.User()->OperIs(GT_MUL))
             {
                 //
                 // This int->long cast is used by a GT_MUL that will be transformed by DecomposeMul into a
@@ -646,7 +704,7 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
     }
     else if (varTypeIsUnsigned(srcType))
     {
-        const bool signExtend = (cast->gtFlags & GTF_UNSIGNED) == 0;
+        const bool signExtend = !cast->IsUnsigned();
         loResult              = EnsureIntSized(cast->gtGetOp1(), signExtend);
 
         hiResult = m_compiler->gtNewZeroConNode(TYP_INT);
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index f18e5935f0de4b..b016045179e99b 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -417,11 +417,16 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
         // Because there is no IL instruction conv.r4.un, uint/ulong -> float
         // casts are always imported as CAST(float <- CAST(double <- uint/ulong)).
         // We can usually eliminate the redundant intermediate cast as an optimization.
+        //
         // AArch and xarch+EVEX have instructions that can cast directly from
-        // all integers (except for longs on 32-bit of course) to floats.
+        // all integers (except for longs on ARM32) to floats.
         // On x64, we also have the option of widening uint -> long and
         // using the signed conversion instructions, and ulong -> float/double
         // is handled directly in codegen, so we can allow all casts.
+        //
+        // This logic will also catch CAST(float <- CAST(double <- float))
+        // and reduce it to CAST(float <- float), which is handled in codegen as
+        // an optional mov.
         else if ((dstType == TYP_FLOAT) && (srcType == TYP_DOUBLE) && oper->OperIs(GT_CAST)
 #ifndef TARGET_64BIT
                  && !varTypeIsLong(oper->AsCast()->CastOp())
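The "usually eliminate the redundant intermediate cast" claim is easy to sanity-check for 32-bit sources: every uint32_t is exactly representable as a double, so uint -> double -> float rounds exactly once, the same as a direct uint -> float conversion. A minimal check in plain C++ (an illustration, not part of this diff; the stride is arbitrary, just to keep the loop fast):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    for (uint64_t i = 0; i <= UINT32_MAX; i += 65537) // sparse sample of the range
    {
        uint32_t u       = static_cast<uint32_t>(i);
        float    twoStep = static_cast<float>(static_cast<double>(u)); // CAST(float <- CAST(double <- uint))
        float    oneStep = static_cast<float>(u);                      // direct uint -> float
        assert(twoStep == oneStep);
    }
    return 0;
}
```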
 #endif
@@ -481,6 +486,15 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
 #endif // TARGET_AMD64
 
 #ifdef TARGET_X86
+#ifdef FEATURE_HW_INTRINSICS
+        else if (varTypeIsLong(srcType) && varTypeIsFloating(dstType) && canUseEvexEncoding())
+        {
+            // We can handle these casts directly using SIMD instructions.
+            // The transform to SIMD is done in DecomposeLongs.
+            return nullptr;
+        }
+#endif // FEATURE_HW_INTRINSICS
+
         // Do we have to do two step U4/8 -> R4/8 ?
         else if (tree->IsUnsigned() && varTypeIsFloating(dstType))
         {
@@ -494,7 +508,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
             {
                 oper = gtNewCastNode(TYP_LONG, oper, true, TYP_LONG);
                 oper->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT));
-                tree->gtFlags &= ~GTF_UNSIGNED;
+                tree->ClearUnsigned();
                 return fgMorphCastIntoHelper(tree, CORINFO_HELP_LNG2DBL, oper);
             }
         }
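For the two-step U4 -> R4/8 path kept above, the idea can be sketched in plain C++ (an assumed illustration, not JIT code): zero-extend the uint to a signed 64-bit value, then apply the signed long -> double conversion, which is what CORINFO_HELP_LNG2DBL performs at runtime. Every uint32_t maps to a non-negative int64_t, so the signed conversion is exact; `UIntToDouble` is a made-up name for the example.

```cpp
#include <cstdint>
#include <cstdio>

static double UIntToDouble(uint32_t value)
{
    int64_t widened = static_cast<int64_t>(value); // zero-extension: the source is unsigned
    return static_cast<double>(widened);           // signed int64 -> double, exact for this range
}

int main()
{
    printf("%.1f\n", UIntToDouble(UINT32_MAX)); // 4294967295.0
    return 0;
}
```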