diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp
index 97ba9bb6ad53b7..b14aab6bfb3ec1 100644
--- a/src/coreclr/jit/decomposelongs.cpp
+++ b/src/coreclr/jit/decomposelongs.cpp
@@ -587,40 +587,172 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
     }
 
 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
-    if (varTypeIsFloating(dstType))
+    if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType))
     {
         // We will reach this path only if morph did not convert the cast to a helper call,
         // meaning we can perform the cast using SIMD instructions.
-        // The sequence this creates is simply:
-        //   AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
-
-        NamedIntrinsic intrinsicId = NI_Illegal;
-        GenTree*       srcOp       = cast->CastOp();
 
         assert(!cast->gtOverflow());
         assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));
 
-        intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+        GenTree*   srcOp      = cast->CastOp();
+        GenTree*   castResult = nullptr;
+        LIR::Range castRange  = LIR::EmptyRange();
 
-        GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
-        GenTree* convert = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, srcType, 16);
-        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, dstType, 16);
+        // This creates the equivalent of the following C# code:
+        //   var srcVec = Vector128.CreateScalarUnsafe(castOp);
 
-        Range().InsertAfter(cast, createScalar, convert, toScalar);
-        Range().Remove(cast);
+        GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
+        castRange.InsertAtEnd(srcVector);
 
-        if (createScalar->IsCnsVec())
+        if (srcVector->IsCnsVec())
         {
             Range().Remove(srcOp);
         }
 
+        if (varTypeIsFloating(dstType))
+        {
+            // long->floating casts don't require any kind of fixup. We simply use the vector
+            // form of the instructions, because the scalar form is not supported on 32-bit.
+
+            NamedIntrinsic intrinsicId =
+                (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16);
+        }
+        else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
+        {
+            // Likewise, the AVX10.2 saturating floating->long instructions give the correct result,
+            // but we have to use the vector form.
+
+            NamedIntrinsic intrinsicId = (dstType == TYP_ULONG)
+                                             ? NI_AVX10v2_ConvertToVectorUInt64WithTruncatedSaturation
+                                             : NI_AVX10v2_ConvertToVectorInt64WithTruncatedSaturation;
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16);
+        }
+        else if (dstType == TYP_ULONG)
+        {
+            // AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so
+            // we only need to fix up negative or NaN values before conversion.
+            //
+            // maxs[sd] will take the value from the second operand if the first operand's value is
+            // NaN, which allows us to fix up both negative and NaN values with a single instruction.
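+            //
+            // For example, an input of -1.5 or NaN becomes 0 after the max against zero, which is
+            // exactly the saturating result the cast to ulong requires for those inputs.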
+            //
+            // This creates the equivalent of the following C# code:
+            //   var fixupVal = Sse.MaxScalar(srcVec, Vector128<double>.Zero);
+            //   castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal);
+
+            GenTree* zero     = m_compiler->gtNewZeroConNode(TYP_SIMD16);
+            GenTree* fixupVal =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar, srcType, 16);
+
+            castRange.InsertAtEnd(zero);
+            castRange.InsertAtEnd(fixupVal);
+
+            castResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
+                                                     NI_AVX512_ConvertToVector128UInt64WithTruncation, srcType, 16);
+        }
+        else
+        {
+            assert(dstType == TYP_LONG);
+
+            // We will use the input value multiple times, so we replace it with a lclVar.
+            LIR::Use srcUse;
+            LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse);
+            srcUse.ReplaceWithLclVar(m_compiler);
+            srcVector = srcUse.Def();
+
+            // This logic is similar to the floating->long saturating logic in Lowering::LowerCast,
+            // except that here we must keep everything in SIMD registers. We can also take advantage
+            // of EVEX masking since the conversion itself requires AVX-512.
+            //
+            // We fix up NaN values by masking in zero during conversion. Negative saturation is handled
+            // correctly by the conversion instructions. Positive saturation is handled after conversion,
+            // because MaxValue is not precisely representable in the floating format.
+            //
+            // This creates roughly the equivalent of the following C# code:
+            //   var nanMask = Avx.CompareScalar(srcVec, srcVec, FloatComparisonMode.OrderedNonSignaling);
+            //
+            //   var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling;
+            //   var ovfFloatingValue = Vector128.Create(9223372036854775808.0);
+            //   var ovfMask = Avx.CompareScalar(srcVec, ovfFloatingValue, compareMode);
+
+            GenTree* srcClone    = m_compiler->gtClone(srcVector);
+            GenTree* compareMode =
+                m_compiler->gtNewIconNode(static_cast<int32_t>(FloatComparisonMode::OrderedNonSignaling));
+            GenTree* nanMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcVector, srcClone, compareMode,
+                                                                    NI_AVX512_CompareScalarMask, srcType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(nanMask);
+
+            compareMode = m_compiler->gtNewIconNode(
+                static_cast<int32_t>(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling));
+
+            GenTreeVecCon* ovfFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16);
+            ovfFloatingValue->EvaluateBroadcastInPlace(srcType, 9223372036854775808.0); // 2^63
+
+            srcClone         = m_compiler->gtClone(srcVector);
+            GenTree* ovfMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcClone, ovfFloatingValue, compareMode,
+                                                                    NI_AVX512_CompareScalarMask, srcType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(ovfFloatingValue);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(ovfMask);
+
+            // Now we convert, using the masks created above for NaN and positive overflow saturation.
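+            // The nanMask bit is set only when the input compares ordered with itself (i.e. is not
+            // NaN), so the first blend below takes zero instead of the conversion result for NaN;
+            // the ovfMask bit is set only when the input is >= 2^63, in which case the second blend
+            // substitutes long.MaxValue.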
+            //
+            // This creates roughly the equivalent of the following C# code:
+            //   var convert = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec);
+            //   var convertMasked = Avx512F.VL.BlendVariable(Vector128<long>.Zero, convert, nanMask);
+            //
+            //   var maxLong = Vector128.Create(long.MaxValue);
+            //   castResult = Avx512F.VL.BlendVariable(convertMasked, maxLong, ovfMask);
+
+            GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
+
+            srcClone         = m_compiler->gtClone(srcVector);
+            GenTree* convert =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone,
+                                                     NI_AVX512_ConvertToVector128Int64WithTruncation, srcType, 16);
+
+            castRange.InsertAtEnd(zero);
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(convert);
+
+            GenTree* convertMasked = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, zero, convert, nanMask,
+                                                                          NI_AVX512_BlendVariableMask, dstType, 16);
+
+            GenTreeVecCon* maxLong = m_compiler->gtNewVconNode(TYP_SIMD16);
+            maxLong->EvaluateBroadcastInPlace(dstType, INT64_MAX);
+
+            castRange.InsertAtEnd(convertMasked);
+            castRange.InsertAtEnd(maxLong);
+
+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, convertMasked, maxLong, ovfMask,
+                                                              NI_AVX512_BlendVariableMask, dstType, 16);
+        }
+
+        // Because the results are in a SIMD register, we need to ToScalar() them out.
+        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstType, 16);
+
+        castRange.InsertAtEnd(castResult);
+        castRange.InsertAtEnd(toScalar);
+
+        Range().InsertAfter(cast, std::move(castRange));
+        Range().Remove(cast);
+
         if (use.IsDummyUse())
         {
             toScalar->SetUnusedValue();
         }
 
         use.ReplaceWith(toScalar);
-        return toScalar->gtNext;
+        return toScalar;
     }
 #endif // FEATURE_HW_INTRINSICS && TARGET_X86
diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index 8f5b5be1820ef7..0f427204e96f67 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -1339,12 +1339,8 @@ bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool o
     }
 
 #if defined(TARGET_X86) || defined(TARGET_ARM)
-    if (varTypeIsFloating(fromType) && varTypeIsLong(toType))
-    {
-        return true;
-    }
-
-    if (varTypeIsLong(fromType) && varTypeIsFloating(toType))
+    if ((varTypeIsLong(fromType) && varTypeIsFloating(toType)) ||
+        (varTypeIsFloating(fromType) && varTypeIsLong(toType)))
     {
 #if defined(TARGET_X86)
         return !compOpportunisticallyDependsOn(InstructionSet_AVX512);
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 826fba0d2ce604..1f0ec2db619558 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -6492,7 +6492,38 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
         {
             var_types dstType = tree->AsCast()->CastToType();
 
-            if (varTypeIsLong(dstType))
+            if (compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
+            {
+#if defined(TARGET_X86)
+                if (varTypeIsLong(dstType))
+                {
+                    // unsigned: vcvttp*2uqqs xmm0, xmm0
+                    //           vmovq        [mem], xmm0
+                    //
+                    // signed:   vcvttp*2qqs  xmm0, xmm0
+                    //           vmovq        [mem], xmm0
+
+                    costEx = 4 + FLT_IND_COST_EX; // 4 + FLT_IND_COST_EX
+                    costSz = 6 + 6;               // 12
+
+                    if (op1Type == TYP_FLOAT)
+                    {
+                        // vector widening float->long instructions take 1 extra cycle
+                        // compared to same-size conversion
+                        costEx += 1;
+                    }
+                }
+                else
+#endif
+                {
+                    // unsigned: vcvtts*2usis eax, xmm0
+                    // signed:   vcvtts*2sis  eax, xmm0
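+                    //
+                    // The saturating conversions handle NaN and out-of-range inputs directly,
+                    // so the cast is a single instruction with no fixup sequence around it.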
+
+                    costEx = 7;
+                    costSz = 6;
+                }
+            }
+            else if (varTypeIsLong(dstType))
             {
 #if defined(TARGET_AMD64)
                 if (varTypeIsUnsigned(dstType))
                 {
@@ -6540,24 +6571,59 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
                     costSz = 5 + 4 + 10 + 5 + 8 + 4; // 36
                 }
 #else
-                // unsigned: ...
-                //           call CORINFO_HELP_DBL2ULNG
-                //
-                // signed:   ...
-                //           call CORINFO_HELP_DBL2ULNG
-
-                costEx = 5 + (3 * IND_COST_EX); // CALL
-                costSz = 5;                     // 5
+                if (compOpportunisticallyDependsOn(InstructionSet_AVX512))
+                {
+                    if (varTypeIsUnsigned(dstType))
+                    {
+                        // vxorps      xmm1, xmm1, xmm1
+                        // vmaxs*      xmm0, xmm0, xmm1
+                        // vcvttp*2uqq xmm0, xmm0
+                        // vmovq       [mem], xmm0
 
-                level++;
+                        costEx = 1 + 4 + 4 + FLT_IND_COST_EX; // 9 + FLT_IND_COST_EX
+                        costSz = 4 + 4 + 6 + 6;               // 20
+                    }
+                    else
+                    {
+                        // vcmpords*   k1, xmm0, xmm0
+                        // vcmpge_oqs* k2, xmm0, qword ptr [@RWD00]
+                        // vcvttp*2qq  xmm0 {k1}{z}, xmm0
+                        // vpblendmq   xmm0 {k2}, xmm0, qword ptr [@RWD08] {1to2}
+                        // vmovq       [mem], xmm0
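+                        //
+                        // (@RWD00 and @RWD08 are the data-section constants holding 2^63 and
+                        // long.MaxValue for the compare and blend above.)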
+
+                        costEx = 4 + (4 + FLT_IND_COST_EX) + 4 + (1 + FLT_IND_COST_EX) +
+                                 FLT_IND_COST_EX;     // 13 + (3 * FLT_IND_COST_EX)
+                        costSz = 7 + 11 + 6 + 10 + 6; // 40
+                    }
 
-                if (op1Type == TYP_FLOAT)
+                    if (op1Type == TYP_FLOAT)
+                    {
+                        // vector widening float->long instructions take 1 extra cycle
+                        // compared to same-size conversion
+                        costEx += 1;
+                    }
+                }
+                else
                 {
-                    // vcvtss2sd xmm0, xmm0, xmm0
-                    // ...
+                    // unsigned: ...
+                    //           call CORINFO_HELP_DBL2ULNG
+                    //
+                    // signed:   ...
+                    //           call CORINFO_HELP_DBL2LNG
+
+                    costEx = 5 + (3 * IND_COST_EX); // CALL
+                    costSz = 5;                     // 5
+
+                    level++;
 
-                    costEx += 4; // 4 + CALL
-                    costSz += 4; // 9
+                    if (op1Type == TYP_FLOAT)
+                    {
+                        // vcvtss2sd xmm0, xmm0, xmm0
+                        // ...
+
+                        costEx += 4; // 4 + CALL
+                        costSz += 4; // 9
+                    }
                 }
 #endif
             }
@@ -34946,8 +35012,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                 break;
             }
 
-            bool maskIsZero    = false;
-            bool maskIsAllOnes = false;
+            bool maskIsZero       = false;
+            bool maskIsAllBitsSet = false;
 
             if (op3->IsCnsMsk())
             {
@@ -34958,7 +35024,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                     GenTreeMskCon* mask      = op3->AsMskCon();
                     uint32_t       elemCount = simdSize / genTypeSize(simdBaseType);
 
-                    maskIsAllOnes = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount);
+                    maskIsAllBitsSet = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount);
                 }
             }
             else
@@ -34969,11 +35035,11 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 
                 if (!maskIsZero)
                 {
-                    maskIsAllOnes = op3->IsVectorAllBitsSet();
+                    maskIsAllBitsSet = op3->IsVectorAllBitsSet();
                 }
             }
 
-            if (maskIsAllOnes)
+            if (maskIsAllBitsSet)
             {
                 if ((op1->gtFlags & GTF_SIDE_EFFECT) != 0)
                 {
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index af6e0c1f86f162..de2322872c60fc 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -1252,6 +1252,7 @@ HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThanOrEqualMask,
 HARDWARE_INTRINSIC(AVX512, CompareNotLessThanMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, 1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512, CompareNotLessThanOrEqualMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, 1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512, CompareOrderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, -1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512, CompareScalarMask, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpss, INS_vcmpsd}, -1, 4, HW_Category_IMM, HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512, CompareUnorderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, -1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX512, CompressMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, 3, 3, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX512, CompressStoreMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, -1, -1, HW_Category_MemoryStore, HW_Flag_NoFlag)
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
index ec2d51c27a179b..e14e06e32adabc 100644
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -498,6 +498,7 @@ int HWIntrinsicInfo::lookupImmUpperBound(NamedIntrinsic id)
         case NI_AVX_CompareScalar:
         case NI_AVX512_Compare:
         case NI_AVX512_CompareMask:
+        case NI_AVX512_CompareScalarMask:
         case NI_AVX10v2_MinMaxScalar:
         case NI_AVX10v2_MinMax:
         {
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 61fef20d231ccc..dbcccbf478e0d7 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -1140,8 +1140,6 @@ void Lowering::LowerCast(GenTree* tree)
             // This creates the equivalent of the following C# code:
             //   var wrapVal = Sse.SubtractScalar(srcVec, ovfFloatingValue);
 
-            NamedIntrinsic subtractIntrinsic = NI_X86Base_SubtractScalar;
-
             // We're going to use ovfFloatingValue twice, so replace the constant with a lclVar.
             castRange.InsertAtEnd(ovfFloatingValue);
 
@@ -1173,7 +1171,7 @@ void Lowering::LowerCast(GenTree* tree)
             }
 
             GenTree* wrapVal = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, floorVal, ovfFloatingValue,
-                                                                    subtractIntrinsic, srcType, 16);
+                                                                    NI_X86Base_SubtractScalar, srcType, 16);
             castRange.InsertAtEnd(wrapVal);
 
             ovfFloatingValue = m_compiler->gtClone(ovfFloatingValue);
@@ -10496,6 +10494,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
         case NI_AVX512_Shuffle:
         case NI_AVX512_SumAbsoluteDifferencesInBlock32:
         case NI_AVX512_CompareMask:
+        case NI_AVX512_CompareScalarMask:
         case NI_AES_CarrylessMultiply:
         case NI_AES_V256_CarrylessMultiply:
         case NI_AES_V512_CarrylessMultiply: