Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30420,7 +30420,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp,
else
#endif // TARGET_XARCH
{
assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
assert((simdSize == 8) || (simdSize == 16));

#if defined(TARGET_ARM64)
assert(!isScalar || (simdSize == 8));
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1344,7 +1344,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
op1 = impSIMDPopStack();

retNode = gtNewSimdDotProdNode(simdType, op1, op2, simdBaseType, simdSize);
retNode = gtNewSimdGetElementNode(retType, retNode, gtNewIconNode(0), simdBaseType, simdSize);
retNode = gtNewSimdToScalarNode(retType, retNode, simdBaseType, simdSize);
}
break;
}
Expand Down
68 changes: 4 additions & 64 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2093,7 +2093,7 @@ bool Lowering::IsValidConstForMovImm(GenTreeHWIntrinsic* node)
}

//----------------------------------------------------------------------------------------------
// Lowering::LowerHWIntrinsicCmpOp: Lowers a Vector128 or Vector256 comparison intrinsic
// Lowering::LowerHWIntrinsicCmpOp: Lowers a Vector64 or Vector128 comparison intrinsic
//
// Arguments:
// node - The hardware intrinsic node.
Expand Down Expand Up @@ -2145,7 +2145,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
}

// Special case: "vec ==/!= zero_vector"
if (!varTypeIsFloating(simdBaseType) && (op != nullptr) && (simdSize != 12))
if (!varTypeIsFloating(simdBaseType) && (op != nullptr))
{
GenTree* cmp = op;
if (simdSize != 8) // we don't need compression for Vector64
Expand Down Expand Up @@ -2221,26 +2221,6 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
BlockRange().InsertBefore(node, cmp);
LowerNode(cmp);

if ((simdBaseType == TYP_FLOAT) && (simdSize == 12))
{
// For TYP_SIMD12 we don't want the upper bits to participate in the comparison. So, we will insert all ones
// into those bits of the result, "as if" the upper bits are equal. Then if all lower bits are equal, we get the
// expected all-ones result, and will get the expected 0's only where there are non-matching bits.

GenTree* idxCns = m_compiler->gtNewIconNode(3, TYP_INT);
BlockRange().InsertAfter(cmp, idxCns);

GenTree* insCns = m_compiler->gtNewIconNode(-1, TYP_INT);
BlockRange().InsertAfter(idxCns, insCns);

GenTree* tmp =
m_compiler->gtNewSimdHWIntrinsicNode(simdType, cmp, idxCns, insCns, NI_AdvSimd_Insert, TYP_INT, simdSize);
BlockRange().InsertAfter(insCns, tmp);
LowerNode(tmp);

cmp = tmp;
}

if (simdSize != 8) // we don't need compression for Vector64
{
GenTree* msk;
Expand Down Expand Up @@ -2339,7 +2319,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)

if (isConstant)
{
assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
assert((simdSize == 8) || (simdSize == 16));

for (GenTree* arg : node->Operands())
{
Expand Down Expand Up @@ -2497,46 +2477,6 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
GenTree* tmp1 = nullptr;
GenTree* tmp2 = nullptr;

if (simdSize == 12)
{
assert(simdBaseType == TYP_FLOAT);

// For 12 byte SIMD, we need to clear the upper 4 bytes:
// idx = CNS_INT int 0x03
// tmp1 = * CNS_DBL float 0.0
// /--* op1 simd16
// +--* idx int
// +--* tmp1 simd16
// op1 = * HWINTRINSIC simd16 T Insert
// ...

// This is roughly the following managed code:
// op1 = AdvSimd.Insert(op1, 0x03, 0.0f);
// ...

idx = m_compiler->gtNewIconNode(0x03, TYP_INT);
BlockRange().InsertAfter(op1, idx);

tmp1 = m_compiler->gtNewZeroConNode(TYP_FLOAT);
BlockRange().InsertAfter(idx, tmp1);
LowerNode(tmp1);

op1 = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op1, idx, tmp1, NI_AdvSimd_Insert, simdBaseType, simdSize);
BlockRange().InsertAfter(tmp1, op1);
LowerNode(op1);

idx = m_compiler->gtNewIconNode(0x03, TYP_INT);
BlockRange().InsertAfter(op2, idx);

tmp2 = m_compiler->gtNewZeroConNode(TYP_FLOAT);
BlockRange().InsertAfter(idx, tmp2);
LowerNode(tmp2);

op2 = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op2, idx, tmp2, NI_AdvSimd_Insert, simdBaseType, simdSize);
BlockRange().InsertAfter(tmp2, op2);
LowerNode(op2);
}

// We will be constructing the following parts:
// ...
// /--* op1 simd16
Expand Down Expand Up @@ -2615,7 +2555,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
}
else
{
assert((simdSize == 12) || (simdSize == 16));
assert(simdSize == 16);

// We will be constructing the following parts:
// ...
Expand Down
173 changes: 9 additions & 164 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2969,10 +2969,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
}
}

// TODO-XARCH-AVX512: We should handle TYP_SIMD12 here under the EVEX path, but doing
// so will require us to account for the unused 4th element.

if ((simdType != TYP_SIMD12) && m_compiler->canUseEvexEncoding())
if (m_compiler->canUseEvexEncoding())
{
// The EVEX encoded versions of the comparison instructions all return a kmask
//
Expand Down Expand Up @@ -3373,32 +3370,11 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
case TYP_USHORT:
case TYP_INT:
case TYP_UINT:
{
cmpType = simdBaseType;
mskType = TYP_UBYTE;

if (simdSize == 32)
{
cmpIntrinsic = NI_AVX2_CompareEqual;
mskIntrinsic = NI_AVX2_MoveMask;
mskConstant = -1;
}
else
{
assert(simdSize == 16);

cmpIntrinsic = NI_X86Base_CompareEqual;
mskIntrinsic = NI_X86Base_MoveMask;
mskConstant = 0xFFFF;
}
break;
}

case TYP_LONG:
case TYP_ULONG:
{
mskType = TYP_UBYTE;
cmpType = simdBaseType;
mskType = TYP_UBYTE;

if (simdSize == 32)
{
Expand Down Expand Up @@ -3430,22 +3406,11 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
}
else
{
assert(simdSize == 16);

cmpIntrinsic = NI_X86Base_CompareEqual;
mskIntrinsic = NI_X86Base_MoveMask;

if (simdSize == 16)
{
mskConstant = 0xF;
}
else if (simdSize == 12)
{
mskConstant = 0x7;
}
else
{
assert(simdSize == 8);
mskConstant = 0x3;
}
mskConstant = 0xF;
}
break;
}
Expand Down Expand Up @@ -3489,20 +3454,6 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
GenTree* mskCns = m_compiler->gtNewIconNode(mskConstant, TYP_INT);
BlockRange().InsertAfter(msk, mskCns);

if ((simdBaseType == TYP_FLOAT) && (simdSize < 16))
{
// For TYP_SIMD8 and TYP_SIMD12 we need to clear the upper bits and can't assume their value

GenTree* tmp = m_compiler->gtNewOperNode(GT_AND, TYP_INT, msk, mskCns);
BlockRange().InsertAfter(mskCns, tmp);
LowerNode(tmp);

msk = tmp;

mskCns = m_compiler->gtNewIconNode(mskConstant, TYP_INT);
BlockRange().InsertAfter(msk, mskCns);
}

node->ChangeOper(cmpOp);
node->ChangeType(TYP_INT);
node->AsOp()->gtOp1 = msk;
Expand Down Expand Up @@ -4185,7 +4136,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)

if (isConstant)
{
assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16) || (simdSize == 32) || (simdSize == 64));
assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64));

for (GenTree* arg : node->Operands())
{
Expand Down Expand Up @@ -5829,19 +5780,9 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
// var tmp3 = Avx.DotProduct(op1, op2, 0xFF);
// return tmp3.ToScalar();

if (simdSize == 8)
{
idx = m_compiler->gtNewIconNode(0x3F, TYP_INT);
}
else if (simdSize == 12)
{
idx = m_compiler->gtNewIconNode(0x7F, TYP_INT);
}
else
{
assert(simdSize == 16);
idx = m_compiler->gtNewIconNode(0xFF, TYP_INT);
}
assert(simdSize == 16);

idx = m_compiler->gtNewIconNode(0xFF, TYP_INT);
BlockRange().InsertBefore(node, idx);

if (varTypeIsSIMD(node->gtType))
Expand Down Expand Up @@ -5913,91 +5854,6 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
unreached();
}
}

if (simdSize == 8)
{
assert(simdBaseType == TYP_FLOAT);

// If simdSize == 8 then we have only two elements, not the 4 that we got from getSIMDVectorLength,
// which we gave a simdSize of 16. So, we set the simd16Count to 2 so that only 1 hadd will
// be emitted rather than 2, so that the upper two elements will be ignored.

simd16Count = 2;
}
else if (simdSize == 12)
{
assert(simdBaseType == TYP_FLOAT);

// We need to mask off the most significant element to avoid the shuffle + add
// from including it in the computed result. We need to do this for both op1 and
// op2 in case one of them is `NaN` (because Zero * NaN == NaN)

simd16_t simd16Val = {};

simd16Val.i32[0] = -1;
simd16Val.i32[1] = -1;
simd16Val.i32[2] = -1;
simd16Val.i32[3] = +0;

simdType = TYP_SIMD16;
simdSize = 16;

// We will be constructing the following parts:
// ...
// +--* CNS_INT int -1
// +--* CNS_INT int -1
// +--* CNS_INT int -1
// +--* CNS_INT int 0
// tmp1 = * HWINTRINSIC simd16 T Create
// /--* op1 simd16
// +--* tmp1 simd16
// op1 = * HWINTRINSIC simd16 T And
// ...

// This is roughly the following managed code:
// ...
// tmp1 = Vector128.Create(-1, -1, -1, 0);
// op1 = Sse.And(op1, tmp1);
// ...

GenTreeVecCon* vecCon1 = m_compiler->gtNewVconNode(simdType);
memcpy(&vecCon1->gtSimdVal, &simd16Val, sizeof(simd16_t));
BlockRange().InsertAfter(op1, vecCon1);

op1 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, vecCon1, simdBaseType, simdSize);
BlockRange().InsertAfter(vecCon1, op1);

LowerNode(vecCon1);
LowerNode(op1);

// We will be constructing the following parts:
// ...
// +--* CNS_INT int -1
// +--* CNS_INT int -1
// +--* CNS_INT int -1
// +--* CNS_INT int 0
// tmp2 = * HWINTRINSIC simd16 T Create
// /--* op2 simd16
// +--* tmp2 simd16
// op2 = * HWINTRINSIC simd16 T And
// ...

// This is roughly the following managed code:
// ...
// tmp2 = Vector128.Create(-1, -1, -1, 0);
// op2 = Sse.And(op2, tmp2);
// ...

GenTreeVecCon* vecCon2 = m_compiler->gtNewVconNode(simdType);
memcpy(&vecCon2->gtSimdVal, &simd16Val, sizeof(simd16_t));
BlockRange().InsertAfter(op2, vecCon2);

op2 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op2, vecCon2, simdBaseType, simdSize);
BlockRange().InsertAfter(vecCon2, op2);

LowerNode(vecCon2);
LowerNode(op2);
}
}

// We will be constructing the following parts:
Expand Down Expand Up @@ -9284,17 +9140,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
}
}

if ((simdSize == 8) || (simdSize == 12))
{
// We want to handle GetElement/ToScalar still for Vector2/3
if (!HWIntrinsicInfo::IsVectorToScalar(intrinsicId) && !HWIntrinsicInfo::IsVectorGetElement(intrinsicId))
{
// TODO-XArch-CQ: Ideally we would key this off of the size the containing node
// expects vs the size the node actually is or would be if spilled to the stack
return;
}
}

// TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

const bool isCommutative = node->isCommutativeHWIntrinsic();
Expand Down
Loading