Skip to content
160 changes: 146 additions & 14 deletions src/coreclr/jit/decomposelongs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -587,40 +587,172 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
}

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
if (varTypeIsFloating(dstType))
if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType))
{
// We will reach this path only if morph did not convert the cast to a helper call,
// meaning we can perform the cast using SIMD instructions.
// The sequence this creates is simply:
// AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()

NamedIntrinsic intrinsicId = NI_Illegal;
GenTree* srcOp = cast->CastOp();

assert(!cast->gtOverflow());
assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));
Comment thread
saucecontrol marked this conversation as resolved.

intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
GenTree* srcOp = cast->CastOp();
GenTree* castResult = nullptr;
LIR::Range castRange = LIR::EmptyRange();

GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
GenTree* convert = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, srcType, 16);
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, dstType, 16);
// This creates the equivalent of the following C# code:
// var srcVec = Vector128.CreateScalarUnsafe(castOp);

Range().InsertAfter(cast, createScalar, convert, toScalar);
Range().Remove(cast);
GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
castRange.InsertAtEnd(srcVector);
Comment thread
saucecontrol marked this conversation as resolved.

if (createScalar->IsCnsVec())
if (srcVector->IsCnsVec())
{
Range().Remove(srcOp);
}

if (varTypeIsFloating(dstType))
{
// long->floating casts don't require any kind of fixup. We simply use the vector
// form of the instructions, because the scalar form is not supported on 32-bit.

NamedIntrinsic intrinsicId =
(dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;

castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16);
}
else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
{
// Likewise, the AVX10.2 saturating floating->long instructions give the correct result,
// but we have to use the vector form.

NamedIntrinsic intrinsicId = (dstType == TYP_ULONG)
? NI_AVX10v2_ConvertToVectorUInt64WithTruncatedSaturation
: NI_AVX10v2_ConvertToVectorInt64WithTruncatedSaturation;

castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcType, 16);
}
else if (dstType == TYP_ULONG)
{
// AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so
// we only need to fix up negative or NaN values before conversion.
//
// maxs[sd] will take the value from the second operand if the first operand's value is
// NaN, which allows us to fix up both negative and NaN values with a single instruction.
//
// This creates the equivalent of the following C# code:
// var fixupVal = Sse.MaxScalar(srcVec, Vector128<T>.Zero);
// castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal);

GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
GenTree* fixupVal =
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar, srcType, 16);

castRange.InsertAtEnd(zero);
castRange.InsertAtEnd(fixupVal);

castResult =
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
NI_AVX512_ConvertToVector128UInt64WithTruncation, srcType, 16);
}
else
{
assert(dstType == TYP_LONG);

// We will use the input value multiple times, so we replace it with a lclVar.
LIR::Use srcUse;
LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse);
srcUse.ReplaceWithLclVar(m_compiler);
srcVector = srcUse.Def();

// This logic is similar to the floating->long saturating logic in Lowering::LowerCast,
// except that here we must keep everything in SIMD registers. We can also take advantage
// of EVEX masking since the conversion itself requires AVX-512.
//
// We fix up NaN values by masking in zero during conversion. Negative saturation is handled
// correctly by the conversion instructions. Positive saturation is handled after conversion,
// because MaxValue is not precisely representable in the floating format.
//
// This creates roughly the equivalent of the following C# code:
// var nanMask = Avx.CompareScalar(srcVec, srcVec, FloatComparisonMode.OrderedNonSignaling);
//
// var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling;
// var ovfFloatingValue = Vector128.Create(9223372036854775808.0);
// var ovfMask = Avx.CompareScalar(srcVec, ovfFloatingValue, compareMode);

Comment thread
saucecontrol marked this conversation as resolved.
GenTree* srcClone = m_compiler->gtClone(srcVector);
GenTree* compareMode =
m_compiler->gtNewIconNode(static_cast<int32_t>(FloatComparisonMode::OrderedNonSignaling));
GenTree* nanMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcVector, srcClone, compareMode,
NI_AVX512_CompareScalarMask, srcType, 16);

castRange.InsertAtEnd(srcClone);
castRange.InsertAtEnd(compareMode);
castRange.InsertAtEnd(nanMask);

compareMode = m_compiler->gtNewIconNode(
static_cast<int32_t>(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling));

GenTreeVecCon* ovfFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16);
ovfFloatingValue->EvaluateBroadcastInPlace(srcType, 9223372036854775808.0); // 2^63

srcClone = m_compiler->gtClone(srcVector);
GenTree* ovfMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcClone, ovfFloatingValue, compareMode,
NI_AVX512_CompareScalarMask, srcType, 16);

castRange.InsertAtEnd(srcClone);
castRange.InsertAtEnd(ovfFloatingValue);
castRange.InsertAtEnd(compareMode);
castRange.InsertAtEnd(ovfMask);

// Now we convert, using the masks created above for NaN and positive overflow saturation.
//
// This creates roughly the equivalent of the following C# code:
// var convert = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec);
// var convertMasked = Avx512F.VL.BlendVariable(Vector128<long>.Zero, convert, nanMask);
//
// var maxLong = Vector128.Create(long.MaxValue);
// castResult = Avx512F.VL.BlendVariable(convertMasked, maxLong, ovfMask);

GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);

srcClone = m_compiler->gtClone(srcVector);
GenTree* convert =
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone,
NI_AVX512_ConvertToVector128Int64WithTruncation, srcType, 16);

castRange.InsertAtEnd(zero);
castRange.InsertAtEnd(srcClone);
castRange.InsertAtEnd(convert);

GenTree* convertMasked = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, zero, convert, nanMask,
NI_AVX512_BlendVariableMask, dstType, 16);

GenTreeVecCon* maxLong = m_compiler->gtNewVconNode(TYP_SIMD16);
maxLong->EvaluateBroadcastInPlace(dstType, INT64_MAX);

castRange.InsertAtEnd(convertMasked);
castRange.InsertAtEnd(maxLong);

castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, convertMasked, maxLong, ovfMask,
NI_AVX512_BlendVariableMask, dstType, 16);
}

// Because the results are in a SIMD register, we need to ToScalar() them out.
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstType, 16);

castRange.InsertAtEnd(castResult);
castRange.InsertAtEnd(toScalar);

Range().InsertAfter(cast, std::move(castRange));
Range().Remove(cast);

if (use.IsDummyUse())
{
toScalar->SetUnusedValue();
}
use.ReplaceWith(toScalar);

return toScalar->gtNext;
return toScalar;
Comment thread
saucecontrol marked this conversation as resolved.
}
#endif // FEATURE_HW_INTRINSICS && TARGET_X86

Expand Down
8 changes: 2 additions & 6 deletions src/coreclr/jit/flowgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1339,12 +1339,8 @@ bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool o
}

#if defined(TARGET_X86) || defined(TARGET_ARM)
if (varTypeIsFloating(fromType) && varTypeIsLong(toType))
{
return true;
}

if (varTypeIsLong(fromType) && varTypeIsFloating(toType))
if ((varTypeIsLong(fromType) && varTypeIsFloating(toType)) ||
(varTypeIsFloating(fromType) && varTypeIsLong(toType)))
Comment thread
tannergooding marked this conversation as resolved.
{
#if defined(TARGET_X86)
return !compOpportunisticallyDependsOn(InstructionSet_AVX512);
Expand Down
106 changes: 86 additions & 20 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6492,7 +6492,38 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
{
var_types dstType = tree->AsCast()->CastToType();

if (varTypeIsLong(dstType))
if (compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
{
#if defined(TARGET_X86)
if (varTypeIsLong(dstType))
{
// unsigned: vcvttp*2uqqs xmm0, xmm0
// vmovq [mem], xmm0
//
// signed: vcvttp*2qqs xmm0, xmm0
// vmovq [mem], xmm0
Comment thread
saucecontrol marked this conversation as resolved.

costEx = 4 + FLT_IND_COST_EX; // 4 + FLT_IND_COST_EX
costSz = 6 + 6; // 12

if (op1Type == TYP_FLOAT)
{
// vector widening float->long instructions take 1 extra cycle
// compared to same-size conversion
costEx += 1;
}
}
else
#endif
{
// unsigned: vcvtts*2usis eax, xmm0
// signed: vcvtts*2sis eax, xmm0

costEx = 7;
costSz = 6;
}
}
else if (varTypeIsLong(dstType))
{
#if defined(TARGET_AMD64)
if (varTypeIsUnsigned(dstType))
Expand Down Expand Up @@ -6540,24 +6571,59 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
costSz = 5 + 4 + 10 + 5 + 8 + 4; // 36
}
#else
// unsigned: ...
// call CORINFO_HELP_DBL2ULNG
//
// signed: ...
// call CORINFO_HELP_DBL2LNG

costEx = 5 + (3 * IND_COST_EX); // CALL
costSz = 5; // 5
if (compOpportunisticallyDependsOn(InstructionSet_AVX512))
{
if (varTypeIsUnsigned(dstType))
{
// vxorps xmm1, xmm1, xmm1
// vmaxs* xmm0, xmm0, xmm1
// vcvttp*2uqq xmm0, xmm0
// vmovq [mem], xmm0

level++;
costEx = 1 + 4 + 4 + FLT_IND_COST_EX; // 9 + FLT_IND_COST_EX
costSz = 4 + 4 + 6 + 6; // 20
}
else
{
// vcmpords* k1, xmm0, xmm0
// vcmpge_oqs* k2, xmm0, qword ptr [@RWD00]
// vcvttp*2qq xmm0 {k1}{z}, xmm0
// vpblendmq xmm0 {k2}, xmm0, qword ptr [@RWD08] {1to2}
// vmovq [mem], xmm0

costEx = 4 + (4 + FLT_IND_COST_EX) + 4 + (1 + FLT_IND_COST_EX) +
FLT_IND_COST_EX; // 13 + (3 * FLT_IND_COST_EX)
costSz = 7 + 11 + 6 + 10 + 6; // 40
}

if (op1Type == TYP_FLOAT)
if (op1Type == TYP_FLOAT)
{
// vector widening float->long instructions take 1 extra cycle
// compared to same-size conversion
costEx += 1;
}
}
else
{
// vcvtss2sd xmm0, xmm0, xmm0
// ...
// unsigned: ...
// call CORINFO_HELP_DBL2ULNG
//
// signed: ...
// call CORINFO_HELP_DBL2LNG

costEx = 5 + (3 * IND_COST_EX); // CALL
costSz = 5; // 5

level++;

costEx += 4; // 4 + CALL
costSz += 4; // 9
if (op1Type == TYP_FLOAT)
{
// vcvtss2sd xmm0, xmm0, xmm0
// ...

costEx += 4; // 4 + CALL
costSz += 4; // 9
}
}
#endif
}
Expand Down Expand Up @@ -34946,8 +35012,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
break;
}

bool maskIsZero = false;
bool maskIsAllOnes = false;
bool maskIsZero = false;
bool maskIsAllBitsSet = false;

if (op3->IsCnsMsk())
{
Expand All @@ -34958,7 +35024,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
GenTreeMskCon* mask = op3->AsMskCon();
uint32_t elemCount = simdSize / genTypeSize(simdBaseType);

maskIsAllOnes = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount);
maskIsAllBitsSet = mask->gtSimdMaskVal.GetRawBits() == simdmask_t::GetBitMask(elemCount);
}
}
else
Expand All @@ -34969,11 +35035,11 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)

if (!maskIsZero)
{
maskIsAllOnes = op3->IsVectorAllBitsSet();
maskIsAllBitsSet = op3->IsVectorAllBitsSet();
}
}

if (maskIsAllOnes)
if (maskIsAllBitsSet)
{
if ((op1->gtFlags & GTF_SIDE_EFFECT) != 0)
{
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -1252,6 +1252,7 @@ HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThanOrEqualMask,
HARDWARE_INTRINSIC(AVX512, CompareNotLessThanMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, 1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(AVX512, CompareNotLessThanOrEqualMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, 1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(AVX512, CompareOrderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, -1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(AVX512, CompareScalarMask, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpss, INS_vcmpsd}, -1, 4, HW_Category_IMM, HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(AVX512, CompareUnorderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, -1, 4, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(AVX512, CompressMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, 3, 3, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromSecondArg)
HARDWARE_INTRINSIC(AVX512, CompressStoreMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, -1, -1, HW_Category_MemoryStore, HW_Flag_NoFlag)
Expand Down
Loading
Loading