diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 9b5101f953ab65..cd5a198ea57464 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -31560,90 +31560,101 @@ bool GenTree::IsInvariant() const //------------------------------------------------------------------- // IsVectorPerElementMask: returns true if this node is a vector constant per-element mask -// (every element has either all bits set or none of them). +// (every element has either all bits set or none of them) for the +// given simd size and base type. // // Arguments: // simdBaseType - the base type of the constant being checked. // simdSize - the size of the SIMD type of the intrinsic. // // Returns: -// True if this node is a vector constant per-element mask. +// True if this node is a per-element mask compatible with simdBaseType and simdSize // bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) const { #ifdef FEATURE_SIMD + // This should be kept in sync with ValueNumStore::IsVectorPerElementMask + + var_types simdType = TypeGet(); + unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); + + assert(varTypeIsSIMD(simdType)); + assert(genTypeSize(simdType) == simdSize); + if (IsCnsVec()) { const GenTreeVecCon* vecCon = AsVecCon(); + return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal, simdBaseType, elementCount); + } - int elementCount = vecCon->ElementCount(simdSize, simdBaseType); + if (!OperIsHWIntrinsic()) + { + return false; + } - switch (simdBaseType) - { - case TYP_BYTE: - case TYP_UBYTE: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u8[0], elementCount); - case TYP_SHORT: - case TYP_USHORT: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u16[0], elementCount); - case TYP_INT: - case TYP_UINT: - case TYP_FLOAT: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u32[0], elementCount); - case TYP_LONG: - case TYP_ULONG: - case TYP_DOUBLE: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u64[0], elementCount); - default: - unreached(); - } + const GenTreeHWIntrinsic* intrinsic = AsHWIntrinsic(); + + NamedIntrinsic intrinsicId = intrinsic->GetHWIntrinsicId(); + unsigned intrinsicSimdSize = intrinsic->GetSimdSize(); + var_types intrinsicSimdBaseType = intrinsic->GetSimdBaseType(); + + if (intrinsicSimdSize != simdSize) + { + return false; } - else if (OperIsHWIntrinsic()) + + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) { - const GenTreeHWIntrinsic* intrinsic = AsHWIntrinsic(); - const NamedIntrinsic intrinsicId = intrinsic->GetHWIntrinsicId(); + // When producing a SIMD result, we need for it to + // have a base type that is the same size or larger + // as what we expect. + // + // Consider for example us expecting `byte` and the + // intrinsic here produces `ushort`. In that case we + // expect every byte to be either `0x00` or `0xFF` + // and the intrinsic produces either `0x0000` or `0xFFFF` + // and so it meets this need. + // + // However, the inverse is not safe as we would expect + // `0x0000` or `0xFFFF`, but the intrinsic could produce + // `0x00FF` or `0xFF00` which fails the expectation. - if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) - { - // We directly return a per-element mask - return true; - } + return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); + } - bool isScalar = false; - genTreeOps oper = intrinsic->GetOperForHWIntrinsicId(&isScalar); + bool isScalar = false; + genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); - switch (oper) + switch (oper) + { + case GT_AND: + case GT_AND_NOT: + case GT_OR: + case GT_OR_NOT: + case GT_XOR: + case GT_XOR_NOT: { - case GT_AND: - case GT_AND_NOT: - case GT_OR: - case GT_OR_NOT: - case GT_XOR: - case GT_XOR_NOT: - { - // We are a binary bitwise operation where both inputs are per-element masks - return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize) && - intrinsic->Op(2)->IsVectorPerElementMask(simdBaseType, simdSize); - } + // We are a binary bitwise operation where both inputs are per-element masks + // + // While some cases like OR could combine in ways that produce a usable mask + // there isn't any way to statically determine this for non-constants and + // the constant cases should've already been folded. - case GT_NOT: - { - // We are an unary bitwise operation where the input is a per-element mask - return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize); - } + return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize) && + intrinsic->Op(2)->IsVectorPerElementMask(simdBaseType, simdSize); + } - default: - { - assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); - break; - } + case GT_NOT: + { + // We are an unary bitwise operation where the input is a per-element mask + return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize); } - return false; - } - else if (IsCnsMsk()) - { - return true; + default: + { + assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); + break; + } } #endif // FEATURE_SIMD @@ -33002,24 +33013,24 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } } - else if (otherNode->OperIsHWIntrinsic()) + else if (otherNode->IsVectorPerElementMask(simdBaseType, simdSize)) { - GenTreeHWIntrinsic* otherIntrinsic = otherNode->AsHWIntrinsic(); - NamedIntrinsic otherIntrinsicId = otherIntrinsic->GetHWIntrinsicId(); - - if (HWIntrinsicInfo::ReturnsPerElementMask(otherIntrinsicId) && - (genTypeSize(simdBaseType) == genTypeSize(otherIntrinsic->GetSimdBaseType()))) + // Handle `Equals(PerElementMask, AllBitsSet)` and `Equals(AllBitsSet, PerElementMask)` for + // integrals + if (cnsNode->IsVectorAllBitsSet()) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with AllBitsSet. + // + // In such a case: + // * `AllBitsSet == AllBitsSet` is true and so produces `AllBitsSet` + // * `AllBitsSet == Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask - // Handle `(Mask == AllBitsSet) == Mask` and `(AllBitsSet == Mask) == Mask` for integrals - if (cnsNode->IsVectorAllBitsSet()) - { - resultNode = otherNode; - break; - } + resultNode = otherNode; + break; } } break; @@ -33190,6 +33201,25 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } } + else if (otherNode->IsVectorPerElementMask(simdBaseType, simdSize)) + { + // Handle `~Equals(PerElementMask, Zero)` and `~Equals(Zero, PerElementMask)` for integrals + if (cnsNode->IsVectorZero()) + { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with Zero. + // + // In such a case: + // * `AllBitsSet != Zero` is true and so produces `AllBitsSet` + // * `Zero != Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask + + resultNode = otherNode; + break; + } + } else if (otherNode->OperIsHWIntrinsic()) { GenTreeHWIntrinsic* otherIntrinsic = otherNode->AsHWIntrinsic(); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 09d6f831bd309a..8fbed3c79d41d3 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2702,9 +2702,19 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) GenTree* op2 = node->Op(2); GenTree* op3 = node->Op(3); + if (varTypeIsIntegral(simdBaseType)) + { + // The integral forms are marked with NormalizeSmallTypeToInt + // but actually emit an instruction that operates bytewise. So + // fixup the base type so that the per element mask check finds + // the greatest number of valid matches. + + simdBaseType = TYP_BYTE; + } + // If either of the value operands is const zero and the mask is either all // zeros or all ones per-element, we can optimize down to AND or AND_NOT. - if (op3->IsVectorPerElementMask(simdBaseType, simdSize) && (op1->IsVectorZero() || op2->IsVectorZero())) + if (op3->IsVectorPerElementMask(TYP_BYTE, simdSize) && (op1->IsVectorZero() || op2->IsVectorZero())) { var_types simdType = node->TypeGet(); GenTree* binOp = nullptr; @@ -3490,53 +3500,86 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) // op1: the condition vector // op2: the left vector // op3: the right vector + GenTree* op1 = node->Op(1); GenTree* op2 = node->Op(2); GenTree* op3 = node->Op(3); - // If the condition vector comes from a hardware intrinsic that - // returns a per-element mask, we can optimize the entire - // conditional select to a single BlendVariable instruction - // (if supported by the architecture) - - // First, determine if the condition is a per-element mask - if (op1->IsVectorPerElementMask(simdBaseType, simdSize)) + if (op1->OperIsConvertMaskToVector()) { - // Next, determine if the target architecture supports BlendVariable - NamedIntrinsic blendVariableId = NI_Illegal; + // If op1 was originally a mask, then we want to prioritize BlendVariableMask + // as it not only avoids the conversion from mask to vector, but also allows + // embedded masking and other optimizations to kick in, improving code density - bool isOp1CvtMaskToVector = op1->OperIsConvertMaskToVector(); + NamedIntrinsic blendVariableId = NI_AVX512_BlendVariableMask; + GenTreeHWIntrinsic* cvtMaskToVector = op1->AsHWIntrinsic(); - if ((simdSize == 64) || isOp1CvtMaskToVector) - { - GenTree* maskNode; + GenTree* maskNode = cvtMaskToVector->Op(1); + BlockRange().Remove(op1); + op1 = maskNode; - if (isOp1CvtMaskToVector) - { - GenTreeHWIntrinsic* cvtMaskToVector = op1->AsHWIntrinsic(); + // We need to change the base type to match the underlying mask size to ensure + // the right instruction variant is picked. If the CndSel was for TYP_INT but + // the mask was for TYP_DOUBLE then we'd generate vpblendmd when we really want + // vpblendmq. Changing the size is fine since CndSel itself is bitwise and the + // mask is just representing entire elements at a given size. - maskNode = cvtMaskToVector->Op(1); - BlockRange().Remove(op1); + simdBaseType = cvtMaskToVector->GetSimdBaseType(); - // We need to change the base type to match the underlying mask size to ensure - // the right instruction variant is picked. If the CndSel was for TYP_INT but - // the mask was for TYP_DOUBLE then we'd generate vpblendmd when we really want - // vpblendmq. Changing the size is fine since CndSel itself is bitwise and the - // the mask is just representing entire elements at a given size. + resultNode = + m_compiler->gtNewSimdHWIntrinsicNode(simdType, op3, op2, op1, blendVariableId, simdBaseType, simdSize); + } + else if (op3->IsVectorZero()) + { + // The operation is (op2 & op1) | (zero & ~op1), so we can drop the second half + BlockRange().Remove(op3); + resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); + } + else if (op2->IsVectorZero()) + { + // The operation is (zero & op1) | (op3 & ~op1), so we can drop the first half + BlockRange().Remove(op2); + resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, op1, simdBaseType, simdSize); + } + else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512)) + { + // TernaryLogic is always single cycle with a lot of ports available and while + // BlendVariable is often also a single cycle it can frequently be 2-3 instead. + // + // Additionally, for TYP_SIMD64 this avoids needing to convert from vector to mask + // which introduces 3-4 cycles of overhead on most hardware and would be strictly worse - simdBaseType = cvtMaskToVector->GetSimdBaseType(); - } - else - { - maskNode = m_compiler->gtNewSimdCvtVectorToMaskNode(TYP_MASK, op1, simdBaseType, simdSize); - newNodes.InsertAtEnd(maskNode); - } + GenTree* control = m_compiler->gtNewIconNode(0xCA); // (B & A) | (C & ~A) + newNodes.InsertAtEnd(control); - assert(maskNode->TypeIs(TYP_MASK)); - blendVariableId = NI_AVX512_BlendVariableMask; - op1 = maskNode; + resultNode = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op1, op2, op3, control, NI_AVX512_TernaryLogic, + simdBaseType, simdSize); + } + else if (op1->IsVectorPerElementMask(TYP_BYTE, simdSize)) + { + // If the condition vector comes from a hardware intrinsic that + // returns a per-element mask, we can optimize the entire + // conditional select to a single BlendVariable instruction + // (if supported by the architecture). + // + // We can check using `TYP_BYTE` as we can change the base type + // and still get correct codegen since the ConditionalSelect is + // itself bitwise. + + // Next, determine if the target architecture supports BlendVariable + NamedIntrinsic blendVariableId = NI_Illegal; + + if (varTypeIsFloating(simdBaseType) && !op1->IsVectorPerElementMask(simdBaseType, simdSize)) + { + // For floating-point, we want to preserve the base type if the + // mask is also compatible with it, otherwise we need to fixup + // the base type to ensure the right instruction is selected for + // the mask. + + simdBaseType = TYP_BYTE; } - else if (simdSize == 32) + + if (simdSize == 32) { // For Vector256 (simdSize == 32), BlendVariable for floats/doubles // is available on AVX, whereas other types (integrals) require AVX2 @@ -3568,50 +3611,26 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) if (resultNode == nullptr) { - if (op3->IsVectorZero()) - { - BlockRange().Remove(op3); + // We cannot optimize, so produce unoptimized instructions + assert(simdSize != 64); - resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); - } - else if (op2->IsVectorZero()) - { - BlockRange().Remove(op2); - - resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, op1, simdBaseType, simdSize); - } - else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512)) + // We'll need the mask twice + if (!op1->OperIs(GT_LCL_VAR)) { - // We can't use the mask, but we can emit a ternary logic node - GenTree* control = m_compiler->gtNewIconNode(0xCA); // (B & A) | (C & ~A) - newNodes.InsertAtEnd(control); - - resultNode = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op1, op2, op3, control, NI_AVX512_TernaryLogic, - simdBaseType, simdSize); + LIR::Use op1Use; + LIR::Use::MakeDummyUse(newNodes, op1, &op1Use); + op1Use.ReplaceWithLclVar(m_compiler); + op1 = op1Use.Def(); } - else - { - // We cannot optimize, so produce unoptimized instructions - assert(simdSize != 64); - - // We'll need the mask twice - if (!op1->OperIs(GT_LCL_VAR)) - { - LIR::Use op1Use; - LIR::Use::MakeDummyUse(newNodes, op1, &op1Use); - op1Use.ReplaceWithLclVar(m_compiler); - op1 = op1Use.Def(); - } - GenTree* tmp1 = m_compiler->gtClone(op1); - GenTree* tmp2 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); - GenTree* tmp3 = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, tmp1, simdBaseType, simdSize); - resultNode = m_compiler->gtNewSimdBinOpNode(GT_OR, simdType, tmp2, tmp3, simdBaseType, simdSize); + GenTree* tmp1 = m_compiler->gtClone(op1); + GenTree* tmp2 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); + GenTree* tmp3 = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, tmp1, simdBaseType, simdSize); + resultNode = m_compiler->gtNewSimdBinOpNode(GT_OR, simdType, tmp2, tmp3, simdBaseType, simdSize); - newNodes.InsertAtEnd(tmp1); - newNodes.InsertAtEnd(tmp2); - newNodes.InsertAtEnd(tmp3); - } + newNodes.InsertAtEnd(tmp1); + newNodes.InsertAtEnd(tmp2); + newNodes.InsertAtEnd(tmp3); } newNodes.InsertAtEnd(resultNode); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index fe092701846223..4f370aae0883d5 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -11503,44 +11503,58 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) std::swap(op1, op2); } - if (((oper == GT_EQ) || (oper == GT_NE)) && op1->OperIsHWIntrinsic() && op2->IsCnsVec()) + if ((oper == GT_EQ) || (oper == GT_NE)) { - GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); - NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId(); - var_types op1Type = op1Intrinsic->TypeGet(); - - if (HWIntrinsicInfo::ReturnsPerElementMask(op1IntrinsicId) && - (genTypeSize(simdBaseType) == genTypeSize(op1Intrinsic->GetSimdBaseType()))) + if (op2->IsCnsVec() && op1->IsVectorPerElementMask(simdBaseType, simdSize)) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask - bool reverseCond = false; if (oper == GT_EQ) { - // Handle `Mask == Zero` and `Zero == Mask` for integral types + // Handle `Equals(Mask, Zero)` for integral types if (op2->IsVectorZero()) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with Zero. + // + // In such a case: + // * `AllBitsSet == Zero` is false and so produces `Zero` + // * `Zero == Zero` is true and so produces `AllBitsSet` + // + // This means that we are just inverting the input and can reverse + // the condition + reverseCond = true; } } else if (oper == GT_NE) { - // Handle `Mask != AllBitsSet` and `AllBitsSet != Mask` for integral types + // Handle `~Equals(Mask, AllBitsSet)` for integral types if (op2->IsVectorAllBitsSet()) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with AllBitsSet. + // + // In such a case: + // * `AllBitsSet != AllBitsSet` is false and so produces `Zero` + // * `Zero != AllBitsSet` is true and so produces `AllBitsSet` + // + // This means that we are just inverting the input and can reverse + // the condition + reverseCond = true; } } if (reverseCond) { - GenTree* newNode = nullptr; + GenTree* newNode = nullptr; + var_types op1Type = op1->TypeGet(); - if (op1Intrinsic->OperIsConvertVectorToMask()) + if (op1->OperIsConvertVectorToMask()) { + GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); + #if defined(TARGET_XARCH) op1 = op1Intrinsic->Op(1); #elif defined(TARGET_ARM64) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index bca1a46e2d7890..43999e89edcebf 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -5,7 +5,7 @@ #define _SIMD_H_ template -static bool ElementsAreSame(T* array, size_t size) +static bool ElementsAreSame(const T* array, size_t size) { for (size_t i = 1; i < size; i++) { @@ -16,7 +16,7 @@ static bool ElementsAreSame(T* array, size_t size) } template -static bool ElementsAreAllBitsSetOrZero(T* array, size_t size) +static bool ElementsAreAllBitsSetOrZero(const T* array, size_t size) { for (size_t i = 0; i < size; i++) { @@ -384,6 +384,45 @@ typedef simd64_t simd_t; typedef simd16_t simd_t; #endif +static bool ElementsAreAllBitsSetOrZero(const simd_t* simdVal, var_types simdBaseType, unsigned elementCount) +{ + assert(simdVal != nullptr); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u8[0], elementCount); + } + + case TYP_SHORT: + case TYP_USHORT: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u16[0], elementCount); + } + + case TYP_INT: + case TYP_UINT: + case TYP_FLOAT: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u32[0], elementCount); + } + + case TYP_LONG: + case TYP_ULONG: + case TYP_DOUBLE: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u64[0], elementCount); + } + + default: + { + unreached(); + } + } +} + inline bool IsUnaryBitwiseOperation(genTreeOps oper) { return (oper == GT_LZCNT) || (oper == GT_NOT); diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 5c2884cb29659e..92d75c08c76bef 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -2405,53 +2405,7 @@ bool ValueNumStore::VNIsVectorNaN(var_types simdType, var_types simdBaseType, Va { assert(varTypeIsSIMD(simdType)); - simd_t vector = {}; - - switch (simdType) - { - case TYP_SIMD8: - { - simd8_t tmp = GetConstantSimd8(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD12: - { - simd12_t tmp = GetConstantSimd12(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD16: - { - simd16_t tmp = GetConstantSimd16(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - simd32_t tmp = GetConstantSimd32(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD64: - { - simd64_t tmp = GetConstantSimd64(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } -#endif // TARGET_XARCH - - default: - { - unreached(); - } - } - + simd_t vector = GetConstantSimd(valVN); uint32_t elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); for (uint32_t i = 0; i < elementCount; i++) @@ -2471,53 +2425,7 @@ bool ValueNumStore::VNIsVectorNegativeZero(var_types simdType, var_types simdBas { assert(varTypeIsSIMD(simdType)); - simd_t vector = {}; - - switch (simdType) - { - case TYP_SIMD8: - { - simd8_t tmp = GetConstantSimd8(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD12: - { - simd12_t tmp = GetConstantSimd12(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD16: - { - simd16_t tmp = GetConstantSimd16(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - simd32_t tmp = GetConstantSimd32(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD64: - { - simd64_t tmp = GetConstantSimd64(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } -#endif // TARGET_XARCH - - default: - { - unreached(); - } - } - + simd_t vector = GetConstantSimd(valVN); uint32_t elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); for (uint32_t i = 0; i < elementCount; i++) @@ -4013,6 +3921,63 @@ float ValueNumStore::GetConstantSingle(ValueNum argVN) } #if defined(FEATURE_SIMD) +// Given a simd constant value number return its value. +// +simd_t ValueNumStore::GetConstantSimd(ValueNum argVN) +{ + assert(IsVNConstant(argVN)); + + simd_t vector = {}; + var_types simdType = TypeOfVN(argVN); + + switch (simdType) + { + case TYP_SIMD8: + { + simd8_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + + case TYP_SIMD12: + { + simd12_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + + case TYP_SIMD16: + { + simd16_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + +#if defined(TARGET_XARCH) + case TYP_SIMD32: + { + simd32_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + + case TYP_SIMD64: + { + simd64_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } +#endif // TARGET_XARCH + + default: + { + unreached(); + } + } + + return vector; +} + // Given a simd8 constant value number return its value as a simd8. // simd8_t ValueNumStore::GetConstantSimd8(ValueNum argVN) @@ -8444,10 +8409,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( case GT_EQ: { - NamedIntrinsic argIntrinsicId; - unsigned argSimdSize; - CorInfoType argSimdBaseJitType; - if (varTypeIsFloating(baseType)) { // Handle `(x == NaN) == false` and `(NaN == x) == false` for floating-point types @@ -8458,27 +8419,24 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( return VNZeroForType(type); } } - else if (IsVNHWIntrinsicFunc(argVN, &argIntrinsicId, &argSimdSize, &argSimdBaseJitType)) + else if (IsVectorPerElementMask(argVN, baseType, simdSize)) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask - - if (!HWIntrinsicInfo::ReturnsPerElementMask(argIntrinsicId)) - { - break; - } - - if (genTypeSize(baseType) == genTypeSize(JitType2PreciseVarType(argSimdBaseJitType))) - { - break; - } - - // Handle `(Mask == AllBitsSet) == Mask` and `(AllBitsSet == Mask) == Mask` for integral types + // Handle `Equals(PerElementMask, AllBitsSet)` and `Equals(AllBitsSet, PerElementMask)` for + // integrals ValueNum allBitsVN = VNAllBitsForType(type, simdSize); if (cnsVN == allBitsVN) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with AllBitsSet. + // + // In such a case: + // * `AllBitsSet == AllBitsSet` is true and so produces `AllBitsSet` + // * `AllBitsSet == Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask + return argVN; } } @@ -8630,10 +8588,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( { var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - NamedIntrinsic argIntrinsicId; - unsigned argSimdSize; - CorInfoType argSimdBaseJitType; - if (varTypeIsFloating(baseType)) { // Handle `(x != NaN) == true` and `(NaN != x) == true` for floating-point types @@ -8643,27 +8597,23 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( return VNAllBitsForType(type, elementCount); } } - else if (IsVNHWIntrinsicFunc(argVN, &argIntrinsicId, &argSimdSize, &argSimdBaseJitType)) + else if (IsVectorPerElementMask(argVN, baseType, simdSize)) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask - - if (!HWIntrinsicInfo::ReturnsPerElementMask(argIntrinsicId)) - { - break; - } - - if (genTypeSize(baseType) != genTypeSize(JitType2PreciseVarType(argSimdBaseJitType))) - { - break; - } - // Handle `(Mask != Zero) == Mask` and `(Zero != Mask) == Mask` for integral types ValueNum zeroVN = VNZeroForType(type); if (cnsVN == zeroVN) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with Zero. + // + // In such a case: + // * `AllBitsSet != Zero` is true and so produces `AllBitsSet` + // * `Zero != Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask + return argVN; } } @@ -9291,6 +9241,107 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary( return VNForFunc(type, func, arg0VN, arg1VN, arg2VN, resultTypeVN); } +//------------------------------------------------------------------- +// IsVectorPerElementMask: returns true if the ValueNum is a vector constant per-element mask +// (every element has either all bits set or none of them) for the +// given simd size and base type. +// +// Arguments: +// vn - the value number to check +// simdBaseType - the base type of the constant being checked. +// simdSize - the size of the SIMD type of the intrinsic. +// +// Returns: +// True if vn is a per-element mask compatible with simdBaseType and simdSize +// +bool ValueNumStore::IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, unsigned simdSize) +{ + // This should be kept in sync with GenTree::IsVectorPerElementMask + + var_types simdType = TypeOfVN(vn); + unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); + + assert(varTypeIsSIMD(simdType)); + assert(genTypeSize(simdType) == simdSize); + + if (IsVNConstant(vn)) + { + simd_t simdVal = GetConstantSimd(vn); + return ElementsAreAllBitsSetOrZero(&simdVal, simdBaseType, elementCount); + } + + VNFuncApp funcApp; + NamedIntrinsic intrinsicId; + unsigned intrinsicSimdSize; + var_types intrinsicSimdBaseType; + + if (!IsVNHWIntrinsicFunc(vn, &funcApp, &intrinsicId, &intrinsicSimdSize, &intrinsicSimdBaseType)) + { + return false; + } + + if (intrinsicSimdSize != simdSize) + { + return false; + } + + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) + { + // When producing a SIMD result, we need for it to + // have a base type that is the same size or larger + // as what we expect. + // + // Consider for example us expecting `byte` and the + // intrinsic here produces `ushort`. In that case we + // expect every byte to be either `0x00` or `0xFF` + // and the intrinsic produces either `0x0000` or `0xFFFF` + // and so it meets this need. + // + // However, the inverse is not safe as we would expect + // `0x0000` or `0xFFFF`, but the intrinsic could produce + // `0x00FF` or `0xFF00` which fails the expectation. + + return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); + } + + bool isScalar = false; + genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); + + switch (oper) + { + case GT_AND: + case GT_AND_NOT: + case GT_OR: + case GT_OR_NOT: + case GT_XOR: + case GT_XOR_NOT: + { + // We are a binary bitwise operation where both inputs are per-element masks + // + // While some cases like OR could combine in ways that produce a usable mask + // there isn't any way to statically determine this for non-constants and + // the constant cases should've already been folded. + + return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize) && + IsVectorPerElementMask(funcApp.m_args[1], simdBaseType, simdSize); + } + + case GT_NOT: + { + // We are an unary bitwise operation where the input is a per-element mask + return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize); + } + + default: + { + assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); + break; + } + } + + return false; +} + #endif // FEATURE_HW_INTRINSICS ValueNum ValueNumStore::EvalMathFuncUnary(var_types typ, NamedIntrinsic gtMathFN, ValueNum arg0VN) @@ -10024,56 +10075,55 @@ bool ValueNumStore::IsVNBinFunc(ValueNum vn, VNFunc func, ValueNum* op1, ValueNu //---------------------------------------------------------------------------------- // IsVNHWIntrinsicFunc: A specialized version of GetVNFunc that checks if the given ValueNum -// is a HWIntrinsic. If so, it returns the intrinsicId, simdSize, and simdBaseJitType. +// is a HWIntrinsic. If so, it returns the intrinsicId, simdSize, and simdBaseType. // // Arguments: // vn - The ValueNum to check. +// funcApp - The function application // intrinsicId - The intrinsic id. // simdSize - The simd size of the intrinsic. -// simdBaseJitType - The simd base jit type for the intrinsic. +// simdBaseType - The simd base type for the intrinsic. // // Return Value: // true if the given vn is a VNFunc for a HWIntrinsic // -bool ValueNumStore::IsVNHWIntrinsicFunc(ValueNum vn, - NamedIntrinsic* intrinsicId, - unsigned* simdSize, - CorInfoType* simdBaseJitType) +bool ValueNumStore::IsVNHWIntrinsicFunc( + ValueNum vn, VNFuncApp* funcApp, NamedIntrinsic* intrinsicId, unsigned* simdSize, var_types* simdBaseType) { + assert(funcApp != nullptr); assert(intrinsicId != nullptr); + assert(simdSize != nullptr); + assert(simdBaseType != nullptr); #if defined(FEATURE_HW_INTRINSICS) - VNFuncApp funcApp; - - if (!GetVNFunc(vn, &funcApp)) + if (!GetVNFunc(vn, funcApp)) { return false; } - VNFunc outerFunc = funcApp.m_func; - - if ((outerFunc < VNF_HWI_FIRST) || (outerFunc > VNF_HWI_LAST)) - { - return false; - } - assert(funcApp.m_arity != 0); + VNFunc funcId = funcApp->m_func; - if (GetVNFunc(funcApp.m_args[funcApp.m_arity - 1], &funcApp)) + if ((funcId < VNF_HWI_FIRST) || (funcId > VNF_HWI_LAST)) { return false; } - assert(funcApp.m_func == VNF_SimdType); - assert(funcApp.m_arity != 2); + assert(funcApp->m_arity != 0); + VNFuncApp simdType; - if (!IsVNConstant(funcApp.m_args[0]) || !IsVNConstant(funcApp.m_args[1])) + if (!GetVNFunc(funcApp->m_args[funcApp->m_arity - 1], &simdType)) { return false; } - *intrinsicId = NamedIntrinsic((outerFunc - VNF_HWI_FIRST) + (NI_HW_INTRINSIC_START + 1)); - *simdSize = static_cast(GetConstantInt32(funcApp.m_args[0])); - *simdBaseJitType = static_cast(GetConstantInt32(funcApp.m_args[1])); + assert(simdType.m_func == VNF_SimdType); + assert(simdType.m_arity == 2); + assert(IsVNConstant(simdType.m_args[0])); + assert(IsVNConstant(simdType.m_args[1])); + + *intrinsicId = static_cast((funcId - VNF_HWI_FIRST) + (NI_HW_INTRINSIC_START + 1)); + *simdSize = static_cast(GetConstantInt32(simdType.m_args[0])); + *simdBaseType = static_cast(GetConstantInt32(simdType.m_args[1])); return true; #else @@ -10510,16 +10560,16 @@ void ValueNumStore::vnDumpMemOpaque(Compiler* comp, VNFuncApp* memOpaque) #ifdef FEATURE_SIMD void ValueNumStore::vnDumpSimdType(Compiler* comp, VNFuncApp* simdType) { - assert(simdType->m_func == VNF_SimdType); // Preconditions. + assert(simdType->m_func == VNF_SimdType); + assert(simdType->m_arity == 2); assert(IsVNConstant(simdType->m_args[0])); assert(IsVNConstant(simdType->m_args[1])); - int simdSize = ConstantValue(simdType->m_args[0]); - CorInfoType baseJitType = (CorInfoType)ConstantValue(simdType->m_args[1]); + int simdSize = ConstantValue(simdType->m_args[0]); + var_types simdBaseType = static_cast(ConstantValue(simdType->m_args[1])); printf("%s(simd%d, %s)", VNFuncName(simdType->m_func), simdSize, - baseJitType == CORINFO_TYPE_UNDEF ? varTypeName(TYP_UNDEF) - : varTypeName(JitType2PreciseVarType(baseJitType))); + (simdBaseType == TYP_UNDEF) ? varTypeName(TYP_UNDEF) : varTypeName(simdBaseType)); } #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/valuenum.h b/src/coreclr/jit/valuenum.h index 8c75e17935a485..d2e22aadaf0475 100644 --- a/src/coreclr/jit/valuenum.h +++ b/src/coreclr/jit/valuenum.h @@ -387,6 +387,7 @@ class ValueNumStore float GetConstantSingle(ValueNum argVN); #if defined(FEATURE_SIMD) + simd_t GetConstantSimd(ValueNum argVN); simd8_t GetConstantSimd8(ValueNum argVN); simd12_t GetConstantSimd12(ValueNum argVN); simd16_t GetConstantSimd16(ValueNum argVN); @@ -1323,6 +1324,8 @@ class ValueNumStore ValueNum arg1VN, ValueNum arg2VN, ValueNum resultTypeVN); + + bool IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, unsigned simdSize); #endif // FEATURE_HW_INTRINSICS // Returns "true" iff "vn" represents a function application. @@ -1336,10 +1339,8 @@ class ValueNumStore bool IsVNBinFunc(ValueNum vn, VNFunc func, ValueNum* op1 = nullptr, ValueNum* op2 = nullptr); // Returns "true" iff "vn" is a function application for a HWIntrinsic - bool IsVNHWIntrinsicFunc(ValueNum vn, - NamedIntrinsic* intrinsicId, - unsigned* simdSize, - CorInfoType* simdBaseJitType); + bool IsVNHWIntrinsicFunc( + ValueNum vn, VNFuncApp* funcApp, NamedIntrinsic* intrinsicId, unsigned* simdSize, var_types* simdBaseType); // Returns "true" iff "vn" is a function application of the form "func(op, cns)" // the cns can be on the left side if the function is commutative.