Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5491,7 +5491,13 @@ void CodeGen::genConsumeRegs(GenTree* tree)
}
else
{
#ifdef FEATURE_SIMD
// (In)Equality operation that produces bool result, when compared
// against Vector zero, marks its Vector Zero operand as contained.
assert(tree->OperIsLeaf() || tree->IsIntegralConstVector(0));
#else
assert(tree->OperIsLeaf());
#endif
}
}
else
Expand Down
2 changes: 1 addition & 1 deletion src/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ bool Is4ByteAVXInstruction(instruction ins)
return (ins == INS_dpps || ins == INS_dppd || ins == INS_insertps || ins == INS_pcmpeqq || ins == INS_pcmpgtq ||
ins == INS_vbroadcastss || ins == INS_vbroadcastsd || ins == INS_vpbroadcastb || ins == INS_vpbroadcastw ||
ins == INS_vpbroadcastd || ins == INS_vpbroadcastq || ins == INS_vextractf128 || ins == INS_vinsertf128 ||
ins == INS_pmulld);
ins == INS_pmulld || ins == INS_ptest);
#else
return false;
#endif
Expand Down
27 changes: 27 additions & 0 deletions src/jit/gentree.h
Original file line number Diff line number Diff line change
Expand Up @@ -1490,6 +1490,7 @@ struct GenTree

inline bool IsFPZero();
inline bool IsIntegralConst(ssize_t constVal);
inline bool IsIntegralConstVector(ssize_t constVal);

inline bool IsBoxedValue();

Expand Down Expand Up @@ -4880,6 +4881,32 @@ inline bool GenTree::IsIntegralConst(ssize_t constVal)
return false;
}

//-------------------------------------------------------------------
// IsIntegralConstVector: returns true if this is a SIMD vector
// with all of its elements equal to the given integral constant.
//
// Arguments:
// constVal - const value each element of the vector must equal
//
// Returns:
// True if this node represents an integral const SIMD vector.
//
inline bool GenTree::IsIntegralConstVector(ssize_t constVal)
{
#ifdef FEATURE_SIMD
// A SIMDIntrinsicInit intrinsic whose initializer is the integral
// constant 'constVal' broadcasts that value into every element, so
// the node as a whole represents a const vector.
if ((gtOper == GT_SIMD) && (gtSIMD.gtSIMDIntrinsicID == SIMDIntrinsicInit) && gtGetOp1()->IsIntegralConst(constVal))
{
// An init from an integral constant implies an integral base type
// and a unary intrinsic (no second operand).
assert(varTypeIsIntegral(gtSIMD.gtSIMDBaseType));
assert(gtGetOp2() == nullptr);
return true;
}
#endif

// Not compiled with SIMD support, or not a const-initialized vector.
return false;
}

inline bool GenTree::IsBoxedValue()
{
assert(gtOper != GT_BOX || gtBox.BoxOp() != nullptr);
Expand Down
1 change: 1 addition & 0 deletions src/jit/instrsxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ INST3( insertps, "insertps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS
INST3( pcmpeqq, "pcmpeqq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x29)) // Packed compare 64-bit integers for equality
INST3( pcmpgtq, "pcmpgtq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x37)) // Packed compare 64-bit integers for equality
INST3( pmulld, "pmulld" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x40)) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result
INST3( ptest, "ptest" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x17)) // Packed logical compare
INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)

INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
Expand Down
38 changes: 28 additions & 10 deletions src/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2529,16 +2529,34 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)

case SIMDIntrinsicOpEquality:
case SIMDIntrinsicOpInEquality:
// Need two SIMD registers as scratch.
// See genSIMDIntrinsicRelOp() for details on code sequence generate and
// the need for two scratch registers.
//
// Note these intrinsics produce a BOOL result, hence internal float
// registers reserved are guaranteed to be different from target
// integer register without explicitly specifying.
info->srcCount = 2;
info->internalFloatCount = 2;
info->setInternalCandidates(lsra, lsra->allSIMDRegs());
info->srcCount = 2;

// On AVX, we can generate optimal code for (in)equality
// against zero.
op2 = tree->gtGetOp2();
if (comp->canUseAVX() && op2->IsIntegralConstVector(0))
{
// On AVX we can use ptest instruction for (in)equality
// against zero to generate optimal code.
//
// We can safely do the below optimization for integral
// vectors but not for floating-point for the reason
// that we have +0.0 and -0.0 and +0.0 == -0.0
MakeSrcContained(tree, op2);
}
else
{

// Need two SIMD registers as scratch.
// See genSIMDIntrinsicRelOp() for details on code sequence generate and
// the need for two scratch registers.
//
// Note these intrinsics produce a BOOL result, hence internal float
// registers reserved are guaranteed to be different from target
// integer register without explicitly specifying.
info->internalFloatCount = 2;
info->setInternalCandidates(lsra, lsra->allSIMDRegs());
}
break;

case SIMDIntrinsicDotProduct:
Expand Down
177 changes: 95 additions & 82 deletions src/jit/simdcodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1116,15 +1116,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
{
assert(genIsValidIntReg(targetReg));

// We need two additional XMM register as scratch
assert(simdNode->gtRsvdRegs != RBM_NONE);
assert(genCountBits(simdNode->gtRsvdRegs) == 2);

regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
tmpRegsMask &= ~tmpReg1Mask;
regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
var_types simdType = op1->TypeGet();
// TODO-1stClassStructs: Temporary to minimize asmDiffs
if (simdType == TYP_DOUBLE)
Expand All @@ -1139,91 +1130,113 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
simdType = TYP_SIMD16;
}

// tmpReg1 = (op1Reg == op2Reg)
// Call this value of tmpReg1 as 'compResult' for further reference below.
regNumber otherReg = op2Reg;
if (tmpReg1 != op2Reg)
// On AVX, we can generate optimal code for (in)equality
// against zero.
if (compiler->canUseAVX() && op2->IsIntegralConstVector(0))
{
if (tmpReg1 != op1Reg)
{
inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
}
assert(op2->isContained());
inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType));
}
else
{
otherReg = op1Reg;
}

// For all integer types we can use TYP_INT comparison.
unsigned ival = 0;
instruction ins =
getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
// We need two additional XMM register as scratch
assert(simdNode->gtRsvdRegs != RBM_NONE);
assert(genCountBits(simdNode->gtRsvdRegs) == 2);

if (varTypeIsFloating(baseType))
{
getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
}
else
{
inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
}
regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
tmpRegsMask &= ~tmpReg1Mask;
regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);

// If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
{
// Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
//
// Generated code sequence
// - vextractf128 tmpReg2, tmpReg1, 0x01
// tmpReg2[128..255] <- 0
// tmpReg2[0..127] <- tmpReg1[128..255]
// - vandps tmpReg1, tempReg2
// This will zero-out upper portion of tmpReg1 and
// lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
}
// Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
if (simdType != TYP_SIMD8)
{
// tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
// Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
// tmpReg1 = (op1Reg == op2Reg)
// Call this value of tmpReg1 as 'compResult' for further reference below.
regNumber otherReg = op2Reg;
if (tmpReg1 != op2Reg)
{
if (tmpReg1 != op1Reg)
{
inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
}
}
else
{
otherReg = op1Reg;
}

// tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
//
// Note that what we have computed is as follows at this point:
// tmpReg1[0] = compResult[0] & compResult[2]
// tmpReg1[1] = compResult[1] & compResult[3]
inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
}
// At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
// OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
// For all integer types we can use TYP_INT comparison.
unsigned ival = 0;
instruction ins =
getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);

// tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
// tmpReg2[0] = compResult[1] & compResult[3]
getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
if (varTypeIsFloating(baseType))
{
getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
}
else
{
inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
}

// tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
// That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
// If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
{
// Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
//
// Generated code sequence
// - vextractf128 tmpReg2, tmpReg1, 0x01
// tmpReg2[128..255] <- 0
// tmpReg2[0..127] <- tmpReg1[128..255]
// - vandps tmpReg1, tempReg2
// This will zero-out upper portion of tmpReg1 and
// lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
}
// Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
if (simdType != TYP_SIMD8)
{
// tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
// Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);

// tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
//
// Note that what we have computed is as follows at this point:
// tmpReg1[0] = compResult[0] & compResult[2]
// tmpReg1[1] = compResult[1] & compResult[3]
inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
}
// At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
// OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.

// targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
// (Note that for mov_xmm2i, the int register is always in the reg2 position.
inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
// tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
// tmpReg2[0] = compResult[1] & compResult[3]
getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);

// tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
// That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??

// targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
// (Note that for mov_xmm2i, the int register is always in the reg2 position.
inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);

// Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
// Equality:
// cmp targetReg, 0xFFFFFFFF
// sete targetReg
// movzx targetReg, targetReg
//
// InEquality:
// cmp targetReg, 0xFFFFFFFF
// setne targetReg
// movzx targetReg, targetReg
//
getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
}

// Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
// Equality:
// cmp targetReg, 0xFFFFFFFF
// sete targetReg
// movzx targetReg, targetReg
//
// InEquality:
// cmp targetReg, 0xFFFFFFFF
// setne targetReg
// movzx targetReg, targetReg
//
getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg, TYP_INT,
EA_1BYTE);
assert(simdNode->TypeGet() == TYP_INT);
Expand Down
21 changes: 18 additions & 3 deletions tests/src/JIT/SIMD/VectorIntEquals.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,27 @@ private static int VectorIntEquals()
Vector<int> B = new Vector<int>(3);
Vector<int> C = new Vector<int>(5);


bool result = A.Equals(B);
if (!result) return Fail;
if (!result)
{
return Fail;
}

result = A.Equals(C);
if (result) return Fail;
if (result)
{
return Fail;
}

if (A.Equals(Vector<int>.Zero))
{
return Fail;
}

if (!Vector<int>.Zero.Equals(Vector<int>.Zero))
{
return Fail;
}

return Pass;
}
Expand Down