diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp
index c1ede9f0f1c4..269995f8e86a 100644
--- a/src/jit/codegenxarch.cpp
+++ b/src/jit/codegenxarch.cpp
@@ -5491,7 +5491,13 @@ void CodeGen::genConsumeRegs(GenTree* tree)
        }
        else
        {
+#ifdef FEATURE_SIMD
+            // An (in)equality operation that produces a bool result, when compared
+            // against Vector<T>.Zero, marks its zero vector operand as contained.
+            assert(tree->OperIsLeaf() || tree->IsIntegralConstVector(0));
+#else
            assert(tree->OperIsLeaf());
+#endif
        }
    }
    else
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index 4f18ca8033b2..2e87b7b3db5b 100644
--- a/src/jit/emitxarch.cpp
+++ b/src/jit/emitxarch.cpp
@@ -105,7 +105,7 @@ bool Is4ByteAVXInstruction(instruction ins)
    return (ins == INS_dpps || ins == INS_dppd || ins == INS_insertps || ins == INS_pcmpeqq || ins == INS_pcmpgtq ||
            ins == INS_vbroadcastss || ins == INS_vbroadcastsd || ins == INS_vpbroadcastb || ins == INS_vpbroadcastw ||
            ins == INS_vpbroadcastd || ins == INS_vpbroadcastq || ins == INS_vextractf128 || ins == INS_vinsertf128 ||
-            ins == INS_pmulld);
+            ins == INS_pmulld || ins == INS_ptest);
 #else
    return false;
 #endif
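
Aside on the instruction this patch builds on (illustration only; the snippet below is a standalone scalar model, not JIT code): ptest ANDs its two operands and sets ZF exactly when that AND is zero, so `ptest reg, reg` sets ZF precisely when the register is all zero bits. That is the property the codegen change further down relies on to test a vector against Vector<T>.Zero without a packed compare.

// Standalone scalar model of the ZF computed by ptest: ZF = ((src1 AND src2) == 0).
// A 128-bit operand is modeled here as two uint64_t halves.
#include <array>
#include <cassert>
#include <cstdint>

static bool PtestZeroFlag(const std::array<uint64_t, 2>& a,
                          const std::array<uint64_t, 2>& b)
{
    // ZF is set iff the bitwise AND of the two 128-bit operands is zero.
    return ((a[0] & b[0]) | (a[1] & b[1])) == 0;
}

int main()
{
    std::array<uint64_t, 2> zero{0, 0};
    std::array<uint64_t, 2> ones{~0ull, ~0ull};

    assert(PtestZeroFlag(zero, zero));  // ptest reg,reg on an all-zero vector: ZF=1
    assert(!PtestZeroFlag(ones, ones)); // any nonzero bit clears ZF
    return 0;
}
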
diff --git a/src/jit/gentree.h b/src/jit/gentree.h
index a9e7935c2154..a4c023aaee63 100644
--- a/src/jit/gentree.h
+++ b/src/jit/gentree.h
@@ -1490,6 +1490,7 @@ struct GenTree
    inline bool IsFPZero();
    inline bool IsIntegralConst(ssize_t constVal);
+    inline bool IsIntegralConstVector(ssize_t constVal);
 
    inline bool IsBoxedValue();
 
@@ -4880,6 +4881,32 @@ inline bool GenTree::IsIntegralConst(ssize_t constVal)
    return false;
 }
 
+//-------------------------------------------------------------------
+// IsIntegralConstVector: returns true if this is a SIMD vector
+// with all of its elements equal to an integral constant.
+//
+// Arguments:
+//     constVal - const value of vector element
+//
+// Returns:
+//     True if this represents an integral const SIMD vector.
+//
+inline bool GenTree::IsIntegralConstVector(ssize_t constVal)
+{
+#ifdef FEATURE_SIMD
+    // A SIMDIntrinsicInit intrinsic with a constant initializer
+    // represents a const vector.
+    if ((gtOper == GT_SIMD) && (gtSIMD.gtSIMDIntrinsicID == SIMDIntrinsicInit) && gtGetOp1()->IsIntegralConst(constVal))
+    {
+        assert(varTypeIsIntegral(gtSIMD.gtSIMDBaseType));
+        assert(gtGetOp2() == nullptr);
+        return true;
+    }
+#endif
+
+    return false;
+}
+
 inline bool GenTree::IsBoxedValue()
 {
    assert(gtOper != GT_BOX || gtBox.BoxOp() != nullptr);
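
For context on the IR shape this new predicate matches (a toy, self-contained model; the enum and field names below are simplified stand-ins for the real GenTree definitions, for illustration only): Vector<T>.Zero reaches the backend as a GT_SIMD node whose intrinsic is SIMDIntrinsicInit with a single integral-constant operand.

// Toy model of the node shape IsIntegralConstVector(0) recognizes.
#include <cassert>
#include <cstddef>

enum Oper { GT_CNS_INT, GT_SIMD };
enum SimdIntrinsic { SIMDIntrinsicInit, SIMDIntrinsicOther };

struct ToyNode
{
    Oper oper;
    SimdIntrinsic intrinsic; // meaningful only when oper == GT_SIMD
    ToyNode* op1;            // initializer operand for GT_SIMD init
    std::ptrdiff_t cnsVal;   // meaningful only when oper == GT_CNS_INT

    bool IsIntegralConstVector(std::ptrdiff_t constVal) const
    {
        // GT_SIMD(SIMDIntrinsicInit, GT_CNS_INT constVal): every lane holds constVal.
        return oper == GT_SIMD && intrinsic == SIMDIntrinsicInit && op1 != nullptr &&
               op1->oper == GT_CNS_INT && op1->cnsVal == constVal;
    }
};

int main()
{
    ToyNode zero{GT_CNS_INT, SIMDIntrinsicOther, nullptr, 0};
    ToyNode vecZero{GT_SIMD, SIMDIntrinsicInit, &zero, 0};
    assert(vecZero.IsIntegralConstVector(0));  // the Vector<T>.Zero-like shape
    assert(!vecZero.IsIntegralConstVector(1)); // lanes initialized to 0, not 1
    return 0;
}
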
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index 4831ef95cf86..986bf9f60cd3 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -317,6 +317,7 @@ INST3( insertps, "insertps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x21))
 INST3( pcmpeqq, "pcmpeqq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x29)) // Packed compare 64-bit integers for equality
 INST3( pcmpgtq, "pcmpgtq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x37)) // Packed compare 64-bit integers for greater than
 INST3( pmulld, "pmulld" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x40)) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result
+INST3( ptest, "ptest" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x17)) // Packed logical compare
 INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
 
 INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index c10e952de46b..408398bb16d9 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -2529,16 +2529,34 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
 
        case SIMDIntrinsicOpEquality:
        case SIMDIntrinsicOpInEquality:
-            // Need two SIMD registers as scratch.
-            // See genSIMDIntrinsicRelOp() for details on code sequence generate and
-            // the need for two scratch registers.
-            //
-            // Note these intrinsics produce a BOOL result, hence internal float
-            // registers reserved are guaranteed to be different from target
-            // integer register without explicitly specifying.
-            info->srcCount = 2;
-            info->internalFloatCount = 2;
-            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            info->srcCount = 2;
+
+            // On AVX, we can generate optimal code for (in)equality
+            // against zero.
+            op2 = tree->gtGetOp2();
+            if (comp->canUseAVX() && op2->IsIntegralConstVector(0))
+            {
+                // On AVX we can use the ptest instruction for (in)equality
+                // against zero to generate optimal code.
+                //
+                // We can safely do this optimization for integral vectors but
+                // not for floating-point vectors, because +0.0 and -0.0 have
+                // different bit patterns yet compare equal (+0.0 == -0.0).
+                MakeSrcContained(tree, op2);
+            }
+            else
+            {
+
+                // Need two SIMD registers as scratch.
+                // See genSIMDIntrinsicRelOp() for details on the code sequence
+                // generated and the need for two scratch registers.
+                //
+                // Note these intrinsics produce a BOOL result, hence internal float
+                // registers reserved are guaranteed to be different from target
+                // integer register without explicitly specifying.
+                info->internalFloatCount = 2;
+                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            }
            break;
 
        case SIMDIntrinsicDotProduct:
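
The floating-point restriction in the containment check above rests on a standard IEEE-754 fact, demonstrated standalone below (illustration only, not code from the patch): +0.0 and -0.0 compare equal yet differ in bit pattern, so a bitwise zero test such as ptest would give the wrong answer for floating-point vectors.

// Why the ptest shortcut is integral-only: bitwise zero != semantic zero for floats.
#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    double pos = 0.0;
    double neg = -0.0;

    // Semantically equal under IEEE-754 comparison rules...
    assert(pos == neg);

    // ...but not bitwise equal: -0.0 has its sign bit set, so a bitwise
    // "is this vector all zero bits?" test (which is what ptest performs)
    // would wrongly report a vector of -0.0 as unequal to zero.
    uint64_t posBits, negBits;
    std::memcpy(&posBits, &pos, sizeof(double));
    std::memcpy(&negBits, &neg, sizeof(double));
    assert(posBits != negBits);
    return 0;
}
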
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index 007b0d695bc0..58a8ac71881b 100644
--- a/src/jit/simdcodegenxarch.cpp
+++ b/src/jit/simdcodegenxarch.cpp
@@ -1116,15 +1116,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
    {
        assert(genIsValidIntReg(targetReg));
-        // We need two additional XMM register as scratch
-        assert(simdNode->gtRsvdRegs != RBM_NONE);
-        assert(genCountBits(simdNode->gtRsvdRegs) == 2);
-
-        regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
-        regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
-        tmpRegsMask &= ~tmpReg1Mask;
-        regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
-        regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
 
        var_types simdType = op1->TypeGet();
        // TODO-1stClassStructs: Temporary to minimize asmDiffs
        if (simdType == TYP_DOUBLE)
@@ -1139,91 +1130,113 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
            simdType = TYP_SIMD16;
        }
-        // tmpReg1 = (op1Reg == op2Reg)
-        // Call this value of tmpReg1 as 'compResult' for further reference below.
-        regNumber otherReg = op2Reg;
-        if (tmpReg1 != op2Reg)
+        // On AVX, we can generate optimal code for (in)equality
+        // against zero.
+        if (compiler->canUseAVX() && op2->IsIntegralConstVector(0))
        {
-            if (tmpReg1 != op1Reg)
-            {
-                inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
-            }
+            assert(op2->isContained());
+            inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType));
        }
        else
        {
-            otherReg = op1Reg;
-        }
-        // For all integer types we can use TYP_INT comparison.
-        unsigned ival = 0;
-        instruction ins =
-            getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
+            // We need two additional XMM registers as scratch.
+            assert(simdNode->gtRsvdRegs != RBM_NONE);
+            assert(genCountBits(simdNode->gtRsvdRegs) == 2);
-        if (varTypeIsFloating(baseType))
-        {
-            getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
-        }
-        else
-        {
-            inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
-        }
+            regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
+            regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
+            tmpRegsMask &= ~tmpReg1Mask;
+            regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
+            regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
-        // If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
-        if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
-        {
-            // Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
-            //
-            // Generated code sequence
-            // - vextractf128 tmpReg2, tmpReg1, 0x01
-            //       tmpReg2[128..255] <- 0
-            //       tmpReg2[0..127]   <- tmpReg1[128..255]
-            // - vandps tmpReg1, tempReg2
-            //       This will zero-out upper portion of tmpReg1 and
-            //       lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
-            getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
-            inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
-        }
-        // Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
-        if (simdType != TYP_SIMD8)
-        {
-            // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
-            // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
-            getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
+            // tmpReg1 = (op1Reg == op2Reg)
+            // Call this value of tmpReg1 as 'compResult' for further reference below.
+            regNumber otherReg = op2Reg;
+            if (tmpReg1 != op2Reg)
+            {
+                if (tmpReg1 != op1Reg)
+                {
+                    inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
+                }
+            }
+            else
+            {
+                otherReg = op1Reg;
+            }
-            // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
-            //
-            // Note that what we have computed is as follows at this point:
-            // tmpReg1[0] = compResult[0] & compResult[2]
-            // tmpReg1[1] = compResult[1] & compResult[3]
-            inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
-        }
-        // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
-        // OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
+            // For all integer types we can use TYP_INT comparison.
+            unsigned ival = 0;
+            instruction ins =
+                getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
-        // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
-        // tmpReg2[0] = compResult[1] & compResult[3]
-        getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
+            if (varTypeIsFloating(baseType))
+            {
+                getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
+            }
+            else
+            {
+                inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
+            }
-        // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
-        // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
-        inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
+            // If we have 32 bytes, start by ANDing the two 16-byte halves to get a 16-byte result.
+            if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
+            {
+                // Reduce tmpReg1 from 256 bits to 128 bits by bitwise-ANDing the lower and upper 128 bits
+                //
+                // Generated code sequence
+                // - vextractf128 tmpReg2, tmpReg1, 0x01
+                //       tmpReg2[128..255] <- 0
+                //       tmpReg2[0..127]   <- tmpReg1[128..255]
+                // - vandps tmpReg1, tmpReg2
+                //       This zeroes out the upper portion of tmpReg1, while its lower portion
+                //       becomes the AND of the upper and lower 128-bit comparison results.
+                getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
+                inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
+            }
+            // Next, if we have more than 8 bytes, AND the two 8-byte halves to get an 8-byte result.
+            if (simdType != TYP_SIMD8)
+            {
+                // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
+                // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
+                getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
+
+                // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
+                //
+                // Note that what we have computed is as follows at this point:
+                // tmpReg1[0] = compResult[0] & compResult[2]
+                // tmpReg1[1] = compResult[1] & compResult[3]
+                inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
+            }
+            // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
+            // OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
-        // targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
-        // (Note that for mov_xmm2i, the int register is always in the reg2 position.
-        inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
+            // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
+            // tmpReg2[0] = compResult[1] & compResult[3]
+            getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
+
+            // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
+            // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
+            inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
+
+            // targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
+            // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
+            inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
+
+            // Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
+            // Equality:
+            //   cmp targetReg, 0xFFFFFFFF
+            //   sete targetReg
+            //   movzx targetReg, targetReg
+            //
+            // InEquality:
+            //   cmp targetReg, 0xFFFFFFFF
+            //   setne targetReg
+            //   movzx targetReg, targetReg
+            //
+            getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
+        }
-        // Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
-        // Equality:
-        //   cmp targetReg, 0xFFFFFFFF
-        //   sete targetReg
-        //   movzx targetReg, targetReg
-        //
-        // InEquality:
-        //   cmp targetReg, 0xFFFFFFFF
-        //   setne targetReg
-        //   movzx targetReg, targetReg
-        //
-        getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
        inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg, TYP_INT,
                EA_1BYTE);
        assert(simdNode->TypeGet() == TYP_INT);
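
The fallback path above can be sanity-checked with a scalar simulation (illustration only, not JIT code): starting from the four 32-bit lane masks that the packed compare produces, the pshufd/pand folding leaves the AND of all four masks in lane 0, and the final cmp against 0xFFFFFFFF plus sete/setne turns that into the bool result.

// Scalar simulation of the compare-and-reduce fallback for a 16-byte vector.
#include <array>
#include <cassert>
#include <cstdint>

static bool AllLanesEqual(const std::array<uint32_t, 4>& compResult, bool equality)
{
    std::array<uint32_t, 4> r = compResult;

    // pshufd tmp, r, 0x4E swaps the 8-byte halves: lane order (2,3,0,1).
    // ANDing folds lane 2 into lane 0 and lane 3 into lane 1.
    r[0] &= compResult[2];
    r[1] &= compResult[3];

    // pshufd tmp, r, 0x1 brings lane 1 down to lane 0; AND folds it in.
    r[0] &= r[1];

    // mov_xmm2i, then cmp targetReg, 0xFFFFFFFF and sete/setne.
    bool allEqual = (r[0] == 0xFFFFFFFFu);
    return equality ? allEqual : !allEqual;
}

int main()
{
    // pcmpeqd-style lane masks: all-ones where lanes matched, zero otherwise.
    assert(AllLanesEqual({0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu}, true));
    assert(!AllLanesEqual({0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0xFFFFFFFFu}, true));
    assert(AllLanesEqual({0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0xFFFFFFFFu}, false));
    return 0;
}
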
diff --git a/tests/src/JIT/SIMD/VectorIntEquals.cs b/tests/src/JIT/SIMD/VectorIntEquals.cs
index bbbbbe519de5..c5d818027c66 100644
--- a/tests/src/JIT/SIMD/VectorIntEquals.cs
+++ b/tests/src/JIT/SIMD/VectorIntEquals.cs
@@ -17,12 +17,27 @@ private static int VectorIntEquals()
        Vector<int> B = new Vector<int>(3);
        Vector<int> C = new Vector<int>(5);
 
        bool result = A.Equals(B);
-        if (!result) return Fail;
+        if (!result)
+        {
+            return Fail;
+        }
 
        result = A.Equals(C);
-        if (result) return Fail;
+        if (result)
+        {
+            return Fail;
+        }
+
+        if (A.Equals(Vector<int>.Zero))
+        {
+            return Fail;
+        }
+
+        if (!Vector<int>.Zero.Equals(Vector<int>.Zero))
+        {
+            return Fail;
+        }
 
        return Pass;
    }
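
A closing note on why the test's new zero cases exercise the optimization soundly (standalone illustration, not part of the patch): for integral lanes, lane-wise equality with the zero vector coincides with the bitwise test ptest performs, so the ptest shortcut and the generic compare-and-reduce path must agree on every input.

// Reference lane-wise equality vs. the ptest-style bitwise zero test.
#include <array>
#include <cassert>
#include <cstdint>

using Vec4i = std::array<int32_t, 4>;

static bool EqualsLaneWise(const Vec4i& a, const Vec4i& b)
{
    for (int i = 0; i < 4; i++)
    {
        if (a[i] != b[i]) return false;
    }
    return true;
}

static bool PtestStyleIsZero(const Vec4i& a)
{
    uint32_t bits = 0;
    for (int i = 0; i < 4; i++)
    {
        bits |= static_cast<uint32_t>(a[i]);
    }
    return bits == 0; // ZF of "ptest a, a"
}

int main()
{
    const Vec4i zero{0, 0, 0, 0};
    for (const Vec4i& v : {Vec4i{0, 0, 0, 0}, Vec4i{3, 3, 3, 3}, Vec4i{0, 0, 1, 0}, Vec4i{-1, 0, 0, 0}})
    {
        // The two checks agree on integral lanes; for floats they would not (see the -0.0 note above).
        assert(EqualsLaneWise(v, zero) == PtestStyleIsZero(v));
    }
    return 0;
}
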