Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5491,7 +5491,13 @@ void CodeGen::genConsumeRegs(GenTree* tree)
}
else
{
#ifdef FEATURE_SIMD
// (In)Equality operation that produces bool result, when compared
// against Vector zero, marks its Vector Zero operand as contained.
assert(tree->OperIsLeaf() || tree->IsIntegralConstVector(0));
#else
assert(tree->OperIsLeaf());
#endif
}
}
else
Expand Down
2 changes: 1 addition & 1 deletion src/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ bool Is4ByteAVXInstruction(instruction ins)
return (ins == INS_dpps || ins == INS_dppd || ins == INS_insertps || ins == INS_pcmpeqq || ins == INS_pcmpgtq ||
ins == INS_vbroadcastss || ins == INS_vbroadcastsd || ins == INS_vpbroadcastb || ins == INS_vpbroadcastw ||
ins == INS_vpbroadcastd || ins == INS_vpbroadcastq || ins == INS_vextractf128 || ins == INS_vinsertf128 ||
ins == INS_pmulld);
ins == INS_pmulld || ins == INS_ptest);
#else
return false;
#endif
Expand Down
27 changes: 27 additions & 0 deletions src/jit/gentree.h
Original file line number Diff line number Diff line change
Expand Up @@ -1490,6 +1490,7 @@ struct GenTree

inline bool IsFPZero();
inline bool IsIntegralConst(ssize_t constVal);
inline bool IsIntegralConstVector(ssize_t constVal);

inline bool IsBoxedValue();

Expand Down Expand Up @@ -4880,6 +4881,32 @@ inline bool GenTree::IsIntegralConst(ssize_t constVal)
return false;
}

//-------------------------------------------------------------------
// IsIntegralConstVector: returns true if this is a SIMD vector
// with all of its elements equal to the given integral constant.
//
// Arguments:
// constVal - const value each element of the vector must equal
//
// Returns:
// True if this node represents an integral const SIMD vector.
//
inline bool GenTree::IsIntegralConstVector(ssize_t constVal)
{
#ifdef FEATURE_SIMD
// A SIMDIntrinsicInit intrinsic whose initializer is the integral
// constant 'constVal' broadcasts that value into every element, so
// the node as a whole represents a const vector.
if ((gtOper == GT_SIMD) && (gtSIMD.gtSIMDIntrinsicID == SIMDIntrinsicInit) && gtGetOp1()->IsIntegralConst(constVal))
{
// An init from an integral constant implies an integral base type
// and a unary intrinsic (no second operand).
assert(varTypeIsIntegral(gtSIMD.gtSIMDBaseType));
assert(gtGetOp2() == nullptr);
return true;
}
#endif

// Not compiled with SIMD support, or not a const-initialized vector.
return false;
}

inline bool GenTree::IsBoxedValue()
{
assert(gtOper != GT_BOX || gtBox.BoxOp() != nullptr);
Expand Down
1 change: 1 addition & 0 deletions src/jit/instrsxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ INST3( insertps, "insertps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS
INST3( pcmpeqq, "pcmpeqq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x29)) // Packed compare 64-bit integers for equality
INST3( pcmpgtq, "pcmpgtq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x37)) // Packed compare 64-bit integers for equality
INST3( pmulld, "pmulld" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x40)) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result
INST3( ptest, "ptest" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x17)) // Packed logical compare
INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)

INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
Expand Down
38 changes: 28 additions & 10 deletions src/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2529,16 +2529,34 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)

case SIMDIntrinsicOpEquality:
case SIMDIntrinsicOpInEquality:
// Need two SIMD registers as scratch.
// See genSIMDIntrinsicRelOp() for details on code sequence generate and
// the need for two scratch registers.
//
// Note these intrinsics produce a BOOL result, hence internal float
// registers reserved are guaranteed to be different from target
// integer register without explicitly specifying.
info->srcCount = 2;
info->internalFloatCount = 2;
info->setInternalCandidates(lsra, lsra->allSIMDRegs());
info->srcCount = 2;

// On AVX, we can generate optimal code for (in)equality
// against zero.
op2 = tree->gtGetOp2();
if (comp->canUseAVX() && op2->IsIntegralConstVector(0))
{
// On AVX we can use ptest instruction for (in)equality
// against zero to generate optimal code.
//
// We can safely do the below optimization for integral
// vectors but not for floating-point for the reason
// that we have +0.0 and -0.0 and +0.0 == -0.0
MakeSrcContained(tree, op2);
}
else
{

// Need two SIMD registers as scratch.
// See genSIMDIntrinsicRelOp() for details on code sequence generate and
// the need for two scratch registers.
//
// Note these intrinsics produce a BOOL result, hence internal float
// registers reserved are guaranteed to be different from target
// integer register without explicitly specifying.
info->internalFloatCount = 2;
info->setInternalCandidates(lsra, lsra->allSIMDRegs());
}
break;

case SIMDIntrinsicDotProduct:
Expand Down
177 changes: 95 additions & 82 deletions src/jit/simdcodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1116,15 +1116,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
{
assert(genIsValidIntReg(targetReg));

// We need two additional XMM register as scratch
assert(simdNode->gtRsvdRegs != RBM_NONE);
assert(genCountBits(simdNode->gtRsvdRegs) == 2);

regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
tmpRegsMask &= ~tmpReg1Mask;
regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
var_types simdType = op1->TypeGet();
// TODO-1stClassStructs: Temporary to minimize asmDiffs
if (simdType == TYP_DOUBLE)
Expand All @@ -1139,91 +1130,113 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
simdType = TYP_SIMD16;
}

// tmpReg1 = (op1Reg == op2Reg)
// Call this value of tmpReg1 as 'compResult' for further reference below.
regNumber otherReg = op2Reg;
if (tmpReg1 != op2Reg)
// On AVX, we can generate optimal code for (in)equality
// against zero.
if (compiler->canUseAVX() && op2->IsIntegralConstVector(0))
{
if (tmpReg1 != op1Reg)
{
inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
}
assert(op2->isContained());
inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType));
}
else
{
otherReg = op1Reg;
}

// For all integer types we can use TYP_INT comparison.
unsigned ival = 0;
instruction ins =
getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
// We need two additional XMM register as scratch
assert(simdNode->gtRsvdRegs != RBM_NONE);
assert(genCountBits(simdNode->gtRsvdRegs) == 2);

if (varTypeIsFloating(baseType))
{
getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
}
else
{
inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
}
regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
tmpRegsMask &= ~tmpReg1Mask;
regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);

// If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
{
// Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
//
// Generated code sequence
// - vextractf128 tmpReg2, tmpReg1, 0x01
// tmpReg2[128..255] <- 0
// tmpReg2[0..127] <- tmpReg1[128..255]
// - vandps tmpReg1, tempReg2
// This will zero-out upper portion of tmpReg1 and
// lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
}
// Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
if (simdType != TYP_SIMD8)
{
// tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
// Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
// tmpReg1 = (op1Reg == op2Reg)
// Call this value of tmpReg1 as 'compResult' for further reference below.
regNumber otherReg = op2Reg;
if (tmpReg1 != op2Reg)
{
if (tmpReg1 != op1Reg)
{
inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
}
}
else
{
otherReg = op1Reg;
}

// tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
//
// Note that what we have computed is as follows at this point:
// tmpReg1[0] = compResult[0] & compResult[2]
// tmpReg1[1] = compResult[1] & compResult[3]
inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
}
// At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
// OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
// For all integer types we can use TYP_INT comparison.
unsigned ival = 0;
instruction ins =
getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);

// tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
// tmpReg2[0] = compResult[1] & compResult[3]
getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
if (varTypeIsFloating(baseType))
{
getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
}
else
{
inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
}

// tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
// That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
// If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
{
// Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
//
// Generated code sequence
// - vextractf128 tmpReg2, tmpReg1, 0x01
// tmpReg2[128..255] <- 0
// tmpReg2[0..127] <- tmpReg1[128..255]
// - vandps tmpReg1, tempReg2
// This will zero-out upper portion of tmpReg1 and
// lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
}
// Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
if (simdType != TYP_SIMD8)
{
// tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
// Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);

// tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
//
// Note that what we have computed is as follows at this point:
// tmpReg1[0] = compResult[0] & compResult[2]
// tmpReg1[1] = compResult[1] & compResult[3]
inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
}
// At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
// OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.

// targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
// (Note that for mov_xmm2i, the int register is always in the reg2 position.
inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
// tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
// tmpReg2[0] = compResult[1] & compResult[3]
getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);

// tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
// That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??

// targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
// (Note that for mov_xmm2i, the int register is always in the reg2 position.
inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);

// Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
// Equality:
// cmp targetReg, 0xFFFFFFFF
// sete targetReg
// movzx targetReg, targetReg
//
// InEquality:
// cmp targetReg, 0xFFFFFFFF
// setne targetReg
// movzx targetReg, targetReg
//
getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
}

// Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
// Equality:
// cmp targetReg, 0xFFFFFFFF
// sete targetReg
// movzx targetReg, targetReg
//
// InEquality:
// cmp targetReg, 0xFFFFFFFF
// setne targetReg
// movzx targetReg, targetReg
//
getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg, TYP_INT,
EA_1BYTE);
assert(simdNode->TypeGet() == TYP_INT);
Expand Down
21 changes: 18 additions & 3 deletions tests/src/JIT/SIMD/VectorIntEquals.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,27 @@ private static int VectorIntEquals()
Vector<int> B = new Vector<int>(3);
Vector<int> C = new Vector<int>(5);


bool result = A.Equals(B);
if (!result) return Fail;
if (!result)
{
return Fail;
}

result = A.Equals(C);
if (result) return Fail;
if (result)
{
return Fail;
}

if (A.Equals(Vector<int>.Zero))
{
return Fail;
}

if (!Vector<int>.Zero.Equals(Vector<int>.Zero))
{
return Fail;
}

return Pass;
}
Expand Down