From 9ed07d70b141453994bc1c59c3751b0a22d5e57d Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 13 Apr 2026 19:46:57 -0700 Subject: [PATCH 1/7] Cleanup GenTree::IsVectorPerElementMask to ensure the masks are compatible with the expected element count --- src/coreclr/jit/gentree.cpp | 122 +++++++--- src/coreclr/jit/simd.h | 43 +++- src/coreclr/jit/valuenum.cpp | 416 +++++++++++++++++++++-------------- src/coreclr/jit/valuenum.h | 9 +- 4 files changed, 390 insertions(+), 200 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 9b5101f953ab65..139fad2b939000 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -31560,57 +31560,112 @@ bool GenTree::IsInvariant() const //------------------------------------------------------------------- // IsVectorPerElementMask: returns true if this node is a vector constant per-element mask -// (every element has either all bits set or none of them). +// (every element has either all bits set or none of them) for the +// given simd size and base type. // // Arguments: // simdBaseType - the base type of the constant being checked. // simdSize - the size of the SIMD type of the intrinsic. // // Returns: -// True if this node is a vector constant per-element mask. +// True if this node is a per-element mask compatible with simdBaseType and simdSize // bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) const { #ifdef FEATURE_SIMD - if (IsCnsVec()) - { - const GenTreeVecCon* vecCon = AsVecCon(); + // This should be kept in sync with ValueNumStore::IsVectorPerElementMask - int elementCount = vecCon->ElementCount(simdSize, simdBaseType); + unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); - switch (simdBaseType) + if (OperIsConst()) + { + if (IsCnsVec()) { - case TYP_BYTE: - case TYP_UBYTE: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u8[0], elementCount); - case TYP_SHORT: - case TYP_USHORT: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u16[0], elementCount); - case TYP_INT: - case TYP_UINT: - case TYP_FLOAT: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u32[0], elementCount); - case TYP_LONG: - case TYP_ULONG: - case TYP_DOUBLE: - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u64[0], elementCount); - default: - unreached(); + const GenTreeVecCon* vecCon = AsVecCon(); + return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal, simdBaseType, elementCount); + } +#if defined(FEATURE_MASKED_HW_INTRINSICS) + else if (IsCnsMsk()) + { + // For constant MASK we need to ensure that only the + // lowest elementCount bits are set. If there are more + // then we are likely in a scenario where the mask would + // get treated incorrectly. + // + // Consider for example something that expects 2 bits but + // finds 3 or more set. This ends up somewhat undefined + // behavior as we will ignoring the upper bits and this + // could lead to incorrect results. 
+ + const GenTreeMskCon* mskCon = AsMskCon(); + uint64_t mask = mskCon->gtSimdMaskVal.GetRawBits(); + return (mask & simdmask_t::GetBitMask(elementCount)) == mask; } +#endif // FEATURE_MASKED_HW_INTRINSICS + + return false; } - else if (OperIsHWIntrinsic()) + + NamedIntrinsic intrinsicId = NI_Illegal; + unsigned intrinsicSimdSize = 0; + var_types intrinsicSimdBaseType = TYP_UNKNOWN; + + if (OperIsHWIntrinsic()) { - const GenTreeHWIntrinsic* intrinsic = AsHWIntrinsic(); - const NamedIntrinsic intrinsicId = intrinsic->GetHWIntrinsicId(); + const GenTreeHWIntrinsic* intrinsic = AsHWIntrinsic(); + + intrinsicId = intrinsic->GetHWIntrinsicId(); + intrinsicSimdSize = intrinsic->GetSimdSize(); + intrinsicSimdBaseType = intrinsic->GetSimdBaseType(); if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) { - // We directly return a per-element mask - return true; + if (varTypeIsMask(TypeGet())) + { + // When producing a MASK result, we need to ensure the + // element counts line up as otherwise we end up with + // a bitmask that may be interpreted incorrectly and + // lead to bad codegen. + // + // Consider for example that we have V128 and + // so we expect 4 bits. However we are producing a + // mask for a V128 and so give 2 bits instead. + // If we produce all true we get 0b11 but the consumer + // needs 0b1111 for it to be used correctly. So the + // upper two elements are incorrectly treated as false + // + // Inversely we would expect 2-bits like 0b11 and would + // get four bits like 0b1011 instead. So we'd end up + // treating both elements as a match when the upper ulong + // only had half of it match and so shouldn't be counted. + + return elementCount == GenTreeVecCon::ElementCount(intrinsicSimdSize, intrinsicSimdBaseType); + } + else + { + assert(varTypeIsSIMD(TypeGet())); + assert(simdSize == intrinsicSimdSize); + + // When producing a SIMD result, we need for it to + // have a base type that is the same size or larger + // as what we expect. + // + // Consider for example us expecting `byte` and the + // intrinsic here produces `ushort`. In that case we + // expect every byte to be either `0x00` or `0xFF` + // and the intrinsic produces either `0x0000` or `0xFFFF` + // and so it meets this need. + // + // However, the inverse is not safe as we would expect + // `0x0000` or `0xFFFF`, but the intrinsic could produce + // `0x00FF` or `0xFF00` which fails the expectation. + + return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); + } } bool isScalar = false; - genTreeOps oper = intrinsic->GetOperForHWIntrinsicId(&isScalar); + genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); switch (oper) { @@ -31622,6 +31677,11 @@ bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) case GT_XOR_NOT: { // We are a binary bitwise operation where both inputs are per-element masks + // + // While some cases like OR could combine in ways that produce a usable mask + // there isn't any way to statically determine this for non-constants and + // the constant cases should've already been folded. 
+ return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize) && intrinsic->Op(2)->IsVectorPerElementMask(simdBaseType, simdSize); } @@ -31641,10 +31701,6 @@ bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) return false; } - else if (IsCnsMsk()) - { - return true; - } #endif // FEATURE_SIMD return false; diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index bca1a46e2d7890..43999e89edcebf 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -5,7 +5,7 @@ #define _SIMD_H_ template -static bool ElementsAreSame(T* array, size_t size) +static bool ElementsAreSame(const T* array, size_t size) { for (size_t i = 1; i < size; i++) { @@ -16,7 +16,7 @@ static bool ElementsAreSame(T* array, size_t size) } template -static bool ElementsAreAllBitsSetOrZero(T* array, size_t size) +static bool ElementsAreAllBitsSetOrZero(const T* array, size_t size) { for (size_t i = 0; i < size; i++) { @@ -384,6 +384,45 @@ typedef simd64_t simd_t; typedef simd16_t simd_t; #endif +static bool ElementsAreAllBitsSetOrZero(const simd_t* simdVal, var_types simdBaseType, unsigned elementCount) +{ + assert(simdVal != nullptr); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u8[0], elementCount); + } + + case TYP_SHORT: + case TYP_USHORT: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u16[0], elementCount); + } + + case TYP_INT: + case TYP_UINT: + case TYP_FLOAT: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u32[0], elementCount); + } + + case TYP_LONG: + case TYP_ULONG: + case TYP_DOUBLE: + { + return ElementsAreAllBitsSetOrZero(&simdVal->u64[0], elementCount); + } + + default: + { + unreached(); + } + } +} + inline bool IsUnaryBitwiseOperation(genTreeOps oper) { return (oper == GT_LZCNT) || (oper == GT_NOT); diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 5c2884cb29659e..181912a9d12582 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -2405,53 +2405,7 @@ bool ValueNumStore::VNIsVectorNaN(var_types simdType, var_types simdBaseType, Va { assert(varTypeIsSIMD(simdType)); - simd_t vector = {}; - - switch (simdType) - { - case TYP_SIMD8: - { - simd8_t tmp = GetConstantSimd8(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD12: - { - simd12_t tmp = GetConstantSimd12(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD16: - { - simd16_t tmp = GetConstantSimd16(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - simd32_t tmp = GetConstantSimd32(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD64: - { - simd64_t tmp = GetConstantSimd64(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } -#endif // TARGET_XARCH - - default: - { - unreached(); - } - } - + simd_t vector = GetConstantSimd(valVN); uint32_t elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); for (uint32_t i = 0; i < elementCount; i++) @@ -2471,53 +2425,7 @@ bool ValueNumStore::VNIsVectorNegativeZero(var_types simdType, var_types simdBas { assert(varTypeIsSIMD(simdType)); - simd_t vector = {}; - - switch (simdType) - { - case TYP_SIMD8: - { - simd8_t tmp = GetConstantSimd8(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD12: - { - simd12_t tmp = GetConstantSimd12(valVN); - memcpy(&vector, &tmp, 
genTypeSize(simdType)); - break; - } - - case TYP_SIMD16: - { - simd16_t tmp = GetConstantSimd16(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - simd32_t tmp = GetConstantSimd32(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } - - case TYP_SIMD64: - { - simd64_t tmp = GetConstantSimd64(valVN); - memcpy(&vector, &tmp, genTypeSize(simdType)); - break; - } -#endif // TARGET_XARCH - - default: - { - unreached(); - } - } - + simd_t vector = GetConstantSimd(valVN); uint32_t elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); for (uint32_t i = 0; i < elementCount; i++) @@ -4013,6 +3921,63 @@ float ValueNumStore::GetConstantSingle(ValueNum argVN) } #if defined(FEATURE_SIMD) +// Given a simd constant value number return its value. +// +simd_t ValueNumStore::GetConstantSimd(ValueNum argVN) +{ + assert(IsVNConstant(argVN)); + + simd_t vector = {}; + var_types simdType = TypeOfVN(argVN); + + switch (simdType) + { + case TYP_SIMD8: + { + simd8_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + + case TYP_SIMD12: + { + simd12_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + + case TYP_SIMD16: + { + simd16_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + +#if defined(TARGET_XARCH) + case TYP_SIMD32: + { + simd32_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } + + case TYP_SIMD64: + { + simd64_t tmp = ConstantValue(argVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } +#endif // TARGET_XARCH + + default: + { + unreached(); + } + } + + return vector; +} + // Given a simd8 constant value number return its value as a simd8. // simd8_t ValueNumStore::GetConstantSimd8(ValueNum argVN) @@ -8444,10 +8409,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( case GT_EQ: { - NamedIntrinsic argIntrinsicId; - unsigned argSimdSize; - CorInfoType argSimdBaseJitType; - if (varTypeIsFloating(baseType)) { // Handle `(x == NaN) == false` and `(NaN == x) == false` for floating-point types @@ -8458,27 +8419,24 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( return VNZeroForType(type); } } - else if (IsVNHWIntrinsicFunc(argVN, &argIntrinsicId, &argSimdSize, &argSimdBaseJitType)) + else if (IsVectorPerElementMask(argVN, baseType, simdSize)) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask - - if (!HWIntrinsicInfo::ReturnsPerElementMask(argIntrinsicId)) - { - break; - } - - if (genTypeSize(baseType) == genTypeSize(JitType2PreciseVarType(argSimdBaseJitType))) - { - break; - } - - // Handle `(Mask == AllBitsSet) == Mask` and `(AllBitsSet == Mask) == Mask` for integral types + // Handle `Equals(PerElementMask, AllBitsSet)` and `Equals(AllBitsSet, PerElementMask)` for + // integrals ValueNum allBitsVN = VNAllBitsForType(type, simdSize); if (cnsVN == allBitsVN) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with AllBitsSet. 
+ // + // In such a case: + // * `AllBitsSet == AllBitsSet` is true and so produces `AllBitsSet` + // * `AllBitsSet == Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask + return argVN; } } @@ -8630,10 +8588,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( { var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - NamedIntrinsic argIntrinsicId; - unsigned argSimdSize; - CorInfoType argSimdBaseJitType; - if (varTypeIsFloating(baseType)) { // Handle `(x != NaN) == true` and `(NaN != x) == true` for floating-point types @@ -8643,27 +8597,23 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( return VNAllBitsForType(type, elementCount); } } - else if (IsVNHWIntrinsicFunc(argVN, &argIntrinsicId, &argSimdSize, &argSimdBaseJitType)) + else if (IsVectorPerElementMask(argVN, baseType, simdSize)) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask - - if (!HWIntrinsicInfo::ReturnsPerElementMask(argIntrinsicId)) - { - break; - } - - if (genTypeSize(baseType) != genTypeSize(JitType2PreciseVarType(argSimdBaseJitType))) - { - break; - } - // Handle `(Mask != Zero) == Mask` and `(Zero != Mask) == Mask` for integral types ValueNum zeroVN = VNZeroForType(type); if (cnsVN == zeroVN) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with Zero. + // + // In such a case: + // * `AllBitsSet != Zero` is true and so produces `AllBitsSet` + // * `Zero != Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask + return argVN; } } @@ -9291,6 +9241,151 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary( return VNForFunc(type, func, arg0VN, arg1VN, arg2VN, resultTypeVN); } +//------------------------------------------------------------------- +// IsVectorPerElementMask: returns true if the ValueNum is a vector constant per-element mask +// (every element has either all bits set or none of them) for the +// given simd size and base type. +// +// Arguments: +// vn - the value number to check +// simdBaseType - the base type of the constant being checked. +// simdSize - the size of the SIMD type of the intrinsic. +// +// Returns: +// True if vn is a per-element mask compatible with simdBaseType and simdSize +// +bool ValueNumStore::IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, unsigned simdSize) +{ + // This should be kept in sync with GenTree::IsVectorPerElementMask + + unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); + + if (IsVNConstant(vn)) + { + var_types vnType = TypeOfVN(vn); + + if (varTypeIsSIMD(vnType)) + { + simd_t simdVal = GetConstantSimd(vn); + return ElementsAreAllBitsSetOrZero(&simdVal, simdBaseType, elementCount); + } +#if defined(FEATURE_MASKED_HW_INTRINSICS) + else if (varTypeIsMask(vnType)) + { + // For constant MASK we need to ensure that only the + // lowest elementCount bits are set. If there are more + // then we are likely in a scenario where the mask would + // get treated incorrectly. + // + // Consider for example something that expects 2 bits but + // finds 3 or more set. This ends up somewhat undefined + // behavior as we will ignoring the upper bits and this + // could lead to incorrect results. 
+ + simdmask_t simdMaskVal = GetConstantSimdMask(vn); + uint64_t mask = simdMaskVal.GetRawBits(); + return (mask & simdmask_t::GetBitMask(elementCount)) == mask; + } +#endif // FEATURE_MASKED_HW_INTRINSICS + + return false; + } + + VNFuncApp funcApp; + + NamedIntrinsic intrinsicId = NI_Illegal; + unsigned intrinsicSimdSize = 0; + var_types intrinsicSimdBaseType = TYP_UNKNOWN; + + if (IsVNHWIntrinsicFunc(vn, &funcApp, &intrinsicId, &intrinsicSimdSize, &intrinsicSimdBaseType)) + { + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) + { + if (varTypeIsMask(TypeOfVN(vn))) + { + // When producing a MASK result, we need to ensure the + // element counts line up as otherwise we end up with + // a bitmask that may be interpreted incorrectly and + // lead to bad codegen. + // + // Consider for example that we have V128 and + // so we expect 4 bits. However we are producing a + // mask for a V128 and so give 2 bits instead. + // If we produce all true we get 0b11 but the consumer + // needs 0b1111 for it to be used correctly. So the + // upper two elements are incorrectly treated as false + // + // Inversely we would expect 2-bits like 0b11 and would + // get four bits like 0b1011 instead. So we'd end up + // treating both elements as a match when the upper ulong + // only had half of it match and so shouldn't be counted. + + return elementCount == GenTreeVecCon::ElementCount(intrinsicSimdSize, intrinsicSimdBaseType); + } + else + { + assert(varTypeIsSIMD(TypeOfVN(vn))); + assert(simdSize == intrinsicSimdSize); + + // When producing a SIMD result, we need for it to + // have a base type that is the same size or larger + // as what we expect. + // + // Consider for example us expecting `byte` and the + // intrinsic here produces `ushort`. In that case we + // expect every byte to be either `0x00` or `0xFF` + // and the intrinsic produces either `0x0000` or `0xFFFF` + // and so it meets this need. + // + // However, the inverse is not safe as we would expect + // `0x0000` or `0xFFFF`, but the intrinsic could produce + // `0x00FF` or `0xFF00` which fails the expectation. + + return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); + } + } + + bool isScalar = false; + genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); + + switch (oper) + { + case GT_AND: + case GT_AND_NOT: + case GT_OR: + case GT_OR_NOT: + case GT_XOR: + case GT_XOR_NOT: + { + // We are a binary bitwise operation where both inputs are per-element masks + // + // While some cases like OR could combine in ways that produce a usable mask + // there isn't any way to statically determine this for non-constants and + // the constant cases should've already been folded. 
+ + return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize) && + IsVectorPerElementMask(funcApp.m_args[1], simdBaseType, simdSize); + } + + case GT_NOT: + { + // We are an unary bitwise operation where the input is a per-element mask + return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize); + } + + default: + { + assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); + break; + } + } + + return false; + } + + return false; +} + #endif // FEATURE_HW_INTRINSICS ValueNum ValueNumStore::EvalMathFuncUnary(var_types typ, NamedIntrinsic gtMathFN, ValueNum arg0VN) @@ -10024,56 +10119,55 @@ bool ValueNumStore::IsVNBinFunc(ValueNum vn, VNFunc func, ValueNum* op1, ValueNu //---------------------------------------------------------------------------------- // IsVNHWIntrinsicFunc: A specialized version of GetVNFunc that checks if the given ValueNum -// is a HWIntrinsic. If so, it returns the intrinsicId, simdSize, and simdBaseJitType. +// is a HWIntrinsic. If so, it returns the intrinsicId, simdSize, and simdBaseType. // // Arguments: // vn - The ValueNum to check. +// funcApp - The function application // intrinsicId - The intrinsic id. // simdSize - The simd size of the intrinsic. -// simdBaseJitType - The simd base jit type for the intrinsic. +// simdBaseType - The simd base type for the intrinsic. // // Return Value: // true if the given vn is a VNFunc for a HWIntrinsic // -bool ValueNumStore::IsVNHWIntrinsicFunc(ValueNum vn, - NamedIntrinsic* intrinsicId, - unsigned* simdSize, - CorInfoType* simdBaseJitType) +bool ValueNumStore::IsVNHWIntrinsicFunc( + ValueNum vn, VNFuncApp* funcApp, NamedIntrinsic* intrinsicId, unsigned* simdSize, var_types* simdBaseType) { + assert(funcApp != nullptr); assert(intrinsicId != nullptr); + assert(simdSize != nullptr); + assert(simdBaseType != nullptr); #if defined(FEATURE_HW_INTRINSICS) - VNFuncApp funcApp; - - if (!GetVNFunc(vn, &funcApp)) + if (!GetVNFunc(vn, funcApp)) { return false; } - VNFunc outerFunc = funcApp.m_func; + VNFunc funcId = funcApp->m_func; - if ((outerFunc < VNF_HWI_FIRST) || (outerFunc > VNF_HWI_LAST)) + if ((funcId < VNF_HWI_FIRST) || (funcId > VNF_HWI_LAST)) { return false; } - assert(funcApp.m_arity != 0); - if (GetVNFunc(funcApp.m_args[funcApp.m_arity - 1], &funcApp)) - { - return false; - } - - assert(funcApp.m_func == VNF_SimdType); - assert(funcApp.m_arity != 2); + assert(funcApp->m_arity != 0); + VNFuncApp simdType; - if (!IsVNConstant(funcApp.m_args[0]) || !IsVNConstant(funcApp.m_args[1])) + if (GetVNFunc(funcApp->m_args[funcApp->m_arity - 1], &simdType)) { return false; } - *intrinsicId = NamedIntrinsic((outerFunc - VNF_HWI_FIRST) + (NI_HW_INTRINSIC_START + 1)); - *simdSize = static_cast(GetConstantInt32(funcApp.m_args[0])); - *simdBaseJitType = static_cast(GetConstantInt32(funcApp.m_args[1])); + assert(simdType.m_func == VNF_SimdType); + assert(simdType.m_arity == 2); + assert(IsVNConstant(simdType.m_args[0])); + assert(IsVNConstant(simdType.m_args[1])); + + *intrinsicId = static_cast((funcId - VNF_HWI_FIRST) + (NI_HW_INTRINSIC_START + 1)); + *simdSize = static_cast(GetConstantInt32(funcApp->m_args[0])); + *simdBaseType = static_cast(GetConstantInt32(funcApp->m_args[1])); return true; #else @@ -10510,16 +10604,16 @@ void ValueNumStore::vnDumpMemOpaque(Compiler* comp, VNFuncApp* memOpaque) #ifdef FEATURE_SIMD void ValueNumStore::vnDumpSimdType(Compiler* comp, VNFuncApp* simdType) { - assert(simdType->m_func == VNF_SimdType); // Preconditions. 
+ assert(simdType->m_func == VNF_SimdType); + assert(simdType->m_arity == 2); assert(IsVNConstant(simdType->m_args[0])); assert(IsVNConstant(simdType->m_args[1])); - int simdSize = ConstantValue(simdType->m_args[0]); - CorInfoType baseJitType = (CorInfoType)ConstantValue(simdType->m_args[1]); + int simdSize = ConstantValue(simdType->m_args[0]); + var_types simdBaseType = static_cast(ConstantValue(simdType->m_args[1])); printf("%s(simd%d, %s)", VNFuncName(simdType->m_func), simdSize, - baseJitType == CORINFO_TYPE_UNDEF ? varTypeName(TYP_UNDEF) - : varTypeName(JitType2PreciseVarType(baseJitType))); + (simdBaseType == TYP_UNDEF) ? varTypeName(TYP_UNDEF) : varTypeName(simdBaseType)); } #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/valuenum.h b/src/coreclr/jit/valuenum.h index 8c75e17935a485..d2e22aadaf0475 100644 --- a/src/coreclr/jit/valuenum.h +++ b/src/coreclr/jit/valuenum.h @@ -387,6 +387,7 @@ class ValueNumStore float GetConstantSingle(ValueNum argVN); #if defined(FEATURE_SIMD) + simd_t GetConstantSimd(ValueNum argVN); simd8_t GetConstantSimd8(ValueNum argVN); simd12_t GetConstantSimd12(ValueNum argVN); simd16_t GetConstantSimd16(ValueNum argVN); @@ -1323,6 +1324,8 @@ class ValueNumStore ValueNum arg1VN, ValueNum arg2VN, ValueNum resultTypeVN); + + bool IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, unsigned simdSize); #endif // FEATURE_HW_INTRINSICS // Returns "true" iff "vn" represents a function application. @@ -1336,10 +1339,8 @@ class ValueNumStore bool IsVNBinFunc(ValueNum vn, VNFunc func, ValueNum* op1 = nullptr, ValueNum* op2 = nullptr); // Returns "true" iff "vn" is a function application for a HWIntrinsic - bool IsVNHWIntrinsicFunc(ValueNum vn, - NamedIntrinsic* intrinsicId, - unsigned* simdSize, - CorInfoType* simdBaseJitType); + bool IsVNHWIntrinsicFunc( + ValueNum vn, VNFuncApp* funcApp, NamedIntrinsic* intrinsicId, unsigned* simdSize, var_types* simdBaseType); // Returns "true" iff "vn" is a function application of the form "func(op, cns)" // the cns can be on the left side if the function is commutative. 
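Aside: the constant-mask compatibility check that patch 1 introduces (in both GenTree::IsVectorPerElementMask and ValueNumStore::IsVectorPerElementMask) reduces to "no bits may be set above the expected lane count". Below is a minimal standalone C++ sketch of that predicate; GetBitMask and IsCompatibleMask here are local stand-ins written for this illustration (GetBitMask mirrors what simdmask_t::GetBitMask is used for in the patch), not the JIT's actual helpers. It reproduces the 0b11-versus-0b1011 scenario called out in the comments:

    #include <cassert>
    #include <cstdint>

    // Local stand-in: the low `elementCount` bits set, everything above clear.
    static uint64_t GetBitMask(unsigned elementCount)
    {
        assert((elementCount >= 1) && (elementCount <= 64));
        return (elementCount == 64) ? ~0ull : ((1ull << elementCount) - 1);
    }

    // A constant mask is usable for `elementCount` lanes only if masking off
    // the upper bits is a no-op, i.e. no bit above the lane count is set.
    static bool IsCompatibleMask(uint64_t mask, unsigned elementCount)
    {
        return (mask & GetBitMask(elementCount)) == mask;
    }

    int main()
    {
        assert(IsCompatibleMask(0b0011, 2));  // 2 lanes, both true: ok
        assert(IsCompatibleMask(0b1011, 4));  // 4 lanes: ok
        assert(!IsCompatibleMask(0b1011, 2)); // bit 3 set above 2 lanes: rejected
        return 0;
    }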
From 7a6e97321b531faaf6fd36ffb10da04479a7c529 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 13 Apr 2026 20:09:28 -0700 Subject: [PATCH 2/7] Replace some ReturnsPerElementMask(id) checks with node->IsVectorPerElementMask(baseType, simdSize) --- src/coreclr/jit/gentree.cpp | 49 +++++++++++++++++++++++++------------ src/coreclr/jit/morph.cpp | 44 +++++++++++++++++++++------------ 2 files changed, 63 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 139fad2b939000..578e03f5fd34f5 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -33058,24 +33058,24 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } } - else if (otherNode->OperIsHWIntrinsic()) + else if (otherNode->IsVectorPerElementMask(simdBaseType, simdSize)) { - GenTreeHWIntrinsic* otherIntrinsic = otherNode->AsHWIntrinsic(); - NamedIntrinsic otherIntrinsicId = otherIntrinsic->GetHWIntrinsicId(); - - if (HWIntrinsicInfo::ReturnsPerElementMask(otherIntrinsicId) && - (genTypeSize(simdBaseType) == genTypeSize(otherIntrinsic->GetSimdBaseType()))) + // Handle `Equals(PerElementMask, AllBitsSet)` and `Equals(AllBitsSet, PerElementMask)` for + // integrals + if (cnsNode->IsVectorAllBitsSet()) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with AllBitsSet. + // + // In such a case: + // * `AllBitsSet == AllBitsSet` is true and so produces `AllBitsSet` + // * `AllBitsSet == Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask - // Handle `(Mask == AllBitsSet) == Mask` and `(AllBitsSet == Mask) == Mask` for integrals - if (cnsNode->IsVectorAllBitsSet()) - { - resultNode = otherNode; - break; - } + resultNode = otherNode; + break; } } break; @@ -33246,6 +33246,25 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } } + else if (otherNode->IsVectorPerElementMask(simdBaseType, simdSize)) + { + // Handle `~Equals(PerElementMask, Zero)` and `~Equals(Zero, PerElementMask)` for integrals + if (cnsNode->IsVectorZero()) + { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with Zero. 
+ // + // In such a case: + // * `AllBitsSet != Zero` is true and so produces `AllBitsSet` + // * `Zero != Zero` is false and so produces `Zero` + // + // This means that we are not changing anything and can just return + // the per element mask + + resultNode = otherNode; + break; + } + } else if (otherNode->OperIsHWIntrinsic()) { GenTreeHWIntrinsic* otherIntrinsic = otherNode->AsHWIntrinsic(); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index fe092701846223..4f370aae0883d5 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -11503,44 +11503,58 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) std::swap(op1, op2); } - if (((oper == GT_EQ) || (oper == GT_NE)) && op1->OperIsHWIntrinsic() && op2->IsCnsVec()) + if ((oper == GT_EQ) || (oper == GT_NE)) { - GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); - NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId(); - var_types op1Type = op1Intrinsic->TypeGet(); - - if (HWIntrinsicInfo::ReturnsPerElementMask(op1IntrinsicId) && - (genTypeSize(simdBaseType) == genTypeSize(op1Intrinsic->GetSimdBaseType()))) + if (op2->IsCnsVec() && op1->IsVectorPerElementMask(simdBaseType, simdSize)) { - // This optimization is only safe if we know the other node produces - // AllBitsSet or Zero per element and if the outer comparison is the - // same size as what the other node produces for its mask - bool reverseCond = false; if (oper == GT_EQ) { - // Handle `Mask == Zero` and `Zero == Mask` for integral types + // Handle `Equals(Mask, Zero)` for integral types if (op2->IsVectorZero()) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with Zero. + // + // In such a case: + // * `AllBitsSet == Zero` is false and so produces `Zero` + // * `Zero == Zero` is true and so produces `AllBitsSet` + // + // This means that we are just inverting the input and can reverse + // the condition + reverseCond = true; } } else if (oper == GT_NE) { - // Handle `Mask != AllBitsSet` and `AllBitsSet != Mask` for integral types + // Handle `~Equals(Mask, AllBitsSet)` for integral types if (op2->IsVectorAllBitsSet()) { + // We are comparing something that is known per element to be either + // AllBitsSet or Zero, with AllBitsSet. 
+ // + // In such a case: + // * `AllBitsSet != AllBitsSet` is false and so produces `Zero` + // * `Zero != AllBitsSet` is true and so produces `AllBitsSet` + // + // This means that we are just inverting the input and can reverse + // the condition + reverseCond = true; } } if (reverseCond) { - GenTree* newNode = nullptr; + GenTree* newNode = nullptr; + var_types op1Type = op1->TypeGet(); - if (op1Intrinsic->OperIsConvertVectorToMask()) + if (op1->OperIsConvertVectorToMask()) { + GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); + #if defined(TARGET_XARCH) op1 = op1Intrinsic->Op(1); #elif defined(TARGET_ARM64) From cdea630d3680001fadc6b7efe22b1ddfbf2f929d Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 13 Apr 2026 22:24:25 -0700 Subject: [PATCH 3/7] Respond to PR feedback --- src/coreclr/jit/gentree.cpp | 2 +- src/coreclr/jit/valuenum.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 578e03f5fd34f5..734847bb7436ff 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -31594,7 +31594,7 @@ bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) // // Consider for example something that expects 2 bits but // finds 3 or more set. This ends up somewhat undefined - // behavior as we will ignoring the upper bits and this + // behavior as we will ignore the upper bits and this // could lead to incorrect results. const GenTreeMskCon* mskCon = AsMskCon(); diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 181912a9d12582..532cd22ed9b7ce 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -9279,7 +9279,7 @@ bool ValueNumStore::IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, // // Consider for example something that expects 2 bits but // finds 3 or more set. This ends up somewhat undefined - // behavior as we will ignoring the upper bits and this + // behavior as we will ignore the upper bits and this // could lead to incorrect results. 
simdmask_t simdMaskVal = GetConstantSimdMask(vn); @@ -10155,7 +10155,7 @@ bool ValueNumStore::IsVNHWIntrinsicFunc( assert(funcApp->m_arity != 0); VNFuncApp simdType; - if (GetVNFunc(funcApp->m_args[funcApp->m_arity - 1], &simdType)) + if (!GetVNFunc(funcApp->m_args[funcApp->m_arity - 1], &simdType)) { return false; } @@ -10166,8 +10166,8 @@ bool ValueNumStore::IsVNHWIntrinsicFunc( assert(IsVNConstant(simdType.m_args[1])); *intrinsicId = static_cast((funcId - VNF_HWI_FIRST) + (NI_HW_INTRINSIC_START + 1)); - *simdSize = static_cast(GetConstantInt32(funcApp->m_args[0])); - *simdBaseType = static_cast(GetConstantInt32(funcApp->m_args[1])); + *simdSize = static_cast(GetConstantInt32(simdType.m_args[0])); + *simdBaseType = static_cast(GetConstantInt32(simdType.m_args[1])); return true; #else From d8acc108bc098805096dde9f7987b1d5697e384c Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 14 Apr 2026 07:10:45 -0700 Subject: [PATCH 4/7] Remove some dead TYP_MASK handling from IsVectorPerElementMask --- src/coreclr/jit/gentree.cpp | 166 ++++++++++++----------------------- src/coreclr/jit/valuenum.cpp | 161 ++++++++++++--------------------- 2 files changed, 114 insertions(+), 213 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 734847bb7436ff..5da82094d6440a 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -31575,131 +31575,81 @@ bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) #ifdef FEATURE_SIMD // This should be kept in sync with ValueNumStore::IsVectorPerElementMask - unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); + var_types simdType = TypeGet(); + unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); - if (OperIsConst()) - { - if (IsCnsVec()) - { - const GenTreeVecCon* vecCon = AsVecCon(); - return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal, simdBaseType, elementCount); - } -#if defined(FEATURE_MASKED_HW_INTRINSICS) - else if (IsCnsMsk()) - { - // For constant MASK we need to ensure that only the - // lowest elementCount bits are set. If there are more - // then we are likely in a scenario where the mask would - // get treated incorrectly. - // - // Consider for example something that expects 2 bits but - // finds 3 or more set. This ends up somewhat undefined - // behavior as we will ignore the upper bits and this - // could lead to incorrect results. 
+ assert(varTypeIsSIMD(simdType)); + assert(genTypeSize(simdType) == simdSize); - const GenTreeMskCon* mskCon = AsMskCon(); - uint64_t mask = mskCon->gtSimdMaskVal.GetRawBits(); - return (mask & simdmask_t::GetBitMask(elementCount)) == mask; - } -#endif // FEATURE_MASKED_HW_INTRINSICS + if (IsCnsVec()) + { + const GenTreeVecCon* vecCon = AsVecCon(); + return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal, simdBaseType, elementCount); + } + if (!OperIsHWIntrinsic()) + { return false; } - NamedIntrinsic intrinsicId = NI_Illegal; - unsigned intrinsicSimdSize = 0; - var_types intrinsicSimdBaseType = TYP_UNKNOWN; + const GenTreeHWIntrinsic* intrinsic = AsHWIntrinsic(); - if (OperIsHWIntrinsic()) - { - const GenTreeHWIntrinsic* intrinsic = AsHWIntrinsic(); + NamedIntrinsic intrinsicId = intrinsic->GetHWIntrinsicId(); + unsigned intrinsicSimdSize = intrinsic->GetSimdSize(); + var_types intrinsicSimdBaseType = intrinsic->GetSimdBaseType(); - intrinsicId = intrinsic->GetHWIntrinsicId(); - intrinsicSimdSize = intrinsic->GetSimdSize(); - intrinsicSimdBaseType = intrinsic->GetSimdBaseType(); + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) + { + // When producing a SIMD result, we need for it to + // have a base type that is the same size or larger + // as what we expect. + // + // Consider for example us expecting `byte` and the + // intrinsic here produces `ushort`. In that case we + // expect every byte to be either `0x00` or `0xFF` + // and the intrinsic produces either `0x0000` or `0xFFFF` + // and so it meets this need. + // + // However, the inverse is not safe as we would expect + // `0x0000` or `0xFFFF`, but the intrinsic could produce + // `0x00FF` or `0xFF00` which fails the expectation. - if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) - { - if (varTypeIsMask(TypeGet())) - { - // When producing a MASK result, we need to ensure the - // element counts line up as otherwise we end up with - // a bitmask that may be interpreted incorrectly and - // lead to bad codegen. - // - // Consider for example that we have V128 and - // so we expect 4 bits. However we are producing a - // mask for a V128 and so give 2 bits instead. - // If we produce all true we get 0b11 but the consumer - // needs 0b1111 for it to be used correctly. So the - // upper two elements are incorrectly treated as false - // - // Inversely we would expect 2-bits like 0b11 and would - // get four bits like 0b1011 instead. So we'd end up - // treating both elements as a match when the upper ulong - // only had half of it match and so shouldn't be counted. + return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); + } - return elementCount == GenTreeVecCon::ElementCount(intrinsicSimdSize, intrinsicSimdBaseType); - } - else - { - assert(varTypeIsSIMD(TypeGet())); - assert(simdSize == intrinsicSimdSize); + bool isScalar = false; + genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); - // When producing a SIMD result, we need for it to - // have a base type that is the same size or larger - // as what we expect. - // - // Consider for example us expecting `byte` and the - // intrinsic here produces `ushort`. In that case we - // expect every byte to be either `0x00` or `0xFF` - // and the intrinsic produces either `0x0000` or `0xFFFF` - // and so it meets this need. - // - // However, the inverse is not safe as we would expect - // `0x0000` or `0xFFFF`, but the intrinsic could produce - // `0x00FF` or `0xFF00` which fails the expectation. 
+ switch (oper) + { + case GT_AND: + case GT_AND_NOT: + case GT_OR: + case GT_OR_NOT: + case GT_XOR: + case GT_XOR_NOT: + { + // We are a binary bitwise operation where both inputs are per-element masks + // + // While some cases like OR could combine in ways that produce a usable mask + // there isn't any way to statically determine this for non-constants and + // the constant cases should've already been folded. - return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); - } + return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize) && + intrinsic->Op(2)->IsVectorPerElementMask(simdBaseType, simdSize); } - bool isScalar = false; - genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); - - switch (oper) + case GT_NOT: { - case GT_AND: - case GT_AND_NOT: - case GT_OR: - case GT_OR_NOT: - case GT_XOR: - case GT_XOR_NOT: - { - // We are a binary bitwise operation where both inputs are per-element masks - // - // While some cases like OR could combine in ways that produce a usable mask - // there isn't any way to statically determine this for non-constants and - // the constant cases should've already been folded. - - return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize) && - intrinsic->Op(2)->IsVectorPerElementMask(simdBaseType, simdSize); - } - - case GT_NOT: - { - // We are an unary bitwise operation where the input is a per-element mask - return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize); - } - - default: - { - assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); - break; - } + // We are an unary bitwise operation where the input is a per-element mask + return intrinsic->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize); } - return false; + default: + { + assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); + break; + } } #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 532cd22ed9b7ce..4cd3f9e35e9f3e 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -9258,129 +9258,80 @@ bool ValueNumStore::IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, { // This should be kept in sync with GenTree::IsVectorPerElementMask - unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); + var_types simdType = TypeOfVN(vn); + unsigned elementCount = GenTreeVecCon::ElementCount(simdSize, simdBaseType); + + assert(varTypeIsSIMD(simdType)); + assert(genTypeSize(simdType) == simdSize); if (IsVNConstant(vn)) { - var_types vnType = TypeOfVN(vn); - - if (varTypeIsSIMD(vnType)) - { - simd_t simdVal = GetConstantSimd(vn); - return ElementsAreAllBitsSetOrZero(&simdVal, simdBaseType, elementCount); - } -#if defined(FEATURE_MASKED_HW_INTRINSICS) - else if (varTypeIsMask(vnType)) - { - // For constant MASK we need to ensure that only the - // lowest elementCount bits are set. If there are more - // then we are likely in a scenario where the mask would - // get treated incorrectly. - // - // Consider for example something that expects 2 bits but - // finds 3 or more set. This ends up somewhat undefined - // behavior as we will ignore the upper bits and this - // could lead to incorrect results. 
+ simd_t simdVal = GetConstantSimd(vn); + return ElementsAreAllBitsSetOrZero(&simdVal, simdBaseType, elementCount); + } - simdmask_t simdMaskVal = GetConstantSimdMask(vn); - uint64_t mask = simdMaskVal.GetRawBits(); - return (mask & simdmask_t::GetBitMask(elementCount)) == mask; - } -#endif // FEATURE_MASKED_HW_INTRINSICS + VNFuncApp funcApp; + NamedIntrinsic intrinsicId; + unsigned intrinsicSimdSize; + var_types intrinsicSimdBaseType; + if (!IsVNHWIntrinsicFunc(vn, &funcApp, &intrinsicId, &intrinsicSimdSize, &intrinsicSimdBaseType)) + { return false; } - VNFuncApp funcApp; + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) + { + // When producing a SIMD result, we need for it to + // have a base type that is the same size or larger + // as what we expect. + // + // Consider for example us expecting `byte` and the + // intrinsic here produces `ushort`. In that case we + // expect every byte to be either `0x00` or `0xFF` + // and the intrinsic produces either `0x0000` or `0xFFFF` + // and so it meets this need. + // + // However, the inverse is not safe as we would expect + // `0x0000` or `0xFFFF`, but the intrinsic could produce + // `0x00FF` or `0xFF00` which fails the expectation. - NamedIntrinsic intrinsicId = NI_Illegal; - unsigned intrinsicSimdSize = 0; - var_types intrinsicSimdBaseType = TYP_UNKNOWN; + return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); + } - if (IsVNHWIntrinsicFunc(vn, &funcApp, &intrinsicId, &intrinsicSimdSize, &intrinsicSimdBaseType)) + bool isScalar = false; + genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); + + switch (oper) { - if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) + case GT_AND: + case GT_AND_NOT: + case GT_OR: + case GT_OR_NOT: + case GT_XOR: + case GT_XOR_NOT: { - if (varTypeIsMask(TypeOfVN(vn))) - { - // When producing a MASK result, we need to ensure the - // element counts line up as otherwise we end up with - // a bitmask that may be interpreted incorrectly and - // lead to bad codegen. - // - // Consider for example that we have V128 and - // so we expect 4 bits. However we are producing a - // mask for a V128 and so give 2 bits instead. - // If we produce all true we get 0b11 but the consumer - // needs 0b1111 for it to be used correctly. So the - // upper two elements are incorrectly treated as false - // - // Inversely we would expect 2-bits like 0b11 and would - // get four bits like 0b1011 instead. So we'd end up - // treating both elements as a match when the upper ulong - // only had half of it match and so shouldn't be counted. - - return elementCount == GenTreeVecCon::ElementCount(intrinsicSimdSize, intrinsicSimdBaseType); - } - else - { - assert(varTypeIsSIMD(TypeOfVN(vn))); - assert(simdSize == intrinsicSimdSize); - - // When producing a SIMD result, we need for it to - // have a base type that is the same size or larger - // as what we expect. - // - // Consider for example us expecting `byte` and the - // intrinsic here produces `ushort`. In that case we - // expect every byte to be either `0x00` or `0xFF` - // and the intrinsic produces either `0x0000` or `0xFFFF` - // and so it meets this need. - // - // However, the inverse is not safe as we would expect - // `0x0000` or `0xFFFF`, but the intrinsic could produce - // `0x00FF` or `0xFF00` which fails the expectation. 
+ // We are a binary bitwise operation where both inputs are per-element masks + // + // While some cases like OR could combine in ways that produce a usable mask + // there isn't any way to statically determine this for non-constants and + // the constant cases should've already been folded. - return genTypeSize(intrinsicSimdBaseType) >= genTypeSize(simdBaseType); - } + return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize) && + IsVectorPerElementMask(funcApp.m_args[1], simdBaseType, simdSize); } - bool isScalar = false; - genTreeOps oper = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(intrinsicId, simdBaseType, &isScalar); - - switch (oper) + case GT_NOT: { - case GT_AND: - case GT_AND_NOT: - case GT_OR: - case GT_OR_NOT: - case GT_XOR: - case GT_XOR_NOT: - { - // We are a binary bitwise operation where both inputs are per-element masks - // - // While some cases like OR could combine in ways that produce a usable mask - // there isn't any way to statically determine this for non-constants and - // the constant cases should've already been folded. - - return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize) && - IsVectorPerElementMask(funcApp.m_args[1], simdBaseType, simdSize); - } - - case GT_NOT: - { - // We are an unary bitwise operation where the input is a per-element mask - return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize); - } - - default: - { - assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); - break; - } + // We are an unary bitwise operation where the input is a per-element mask + return IsVectorPerElementMask(funcApp.m_args[0], simdBaseType, simdSize); } - return false; + default: + { + assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)); + break; + } } return false; From 1d62fabd9cc3d943565931287a66133a80fd26bc Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 14 Apr 2026 09:11:55 -0700 Subject: [PATCH 5/7] Don't regress codegen for ConditionalSelect when the condition is a PerElementMask --- src/coreclr/jit/lowerxarch.cpp | 172 +++++++++++++++++++-------------- 1 file changed, 100 insertions(+), 72 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 09d6f831bd309a..42af1c1c3c04d8 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2702,6 +2702,16 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) GenTree* op2 = node->Op(2); GenTree* op3 = node->Op(3); + if (varTypeIsIntegral(simdBaseType)) + { + // The integral forms are marked with NormalizeSmallTypeToInt + // but actually emit an instruction that operates bytewise. So + // fixup the base type so that the per element mask check finds + // the greatest number of valid matches. + + simdBaseType = TYP_BYTE; + } + // If either of the value operands is const zero and the mask is either all // zeros or all ones per-element, we can optimize down to AND or AND_NOT. 
if (op3->IsVectorPerElementMask(simdBaseType, simdSize) && (op1->IsVectorZero() || op2->IsVectorZero())) @@ -3490,53 +3500,95 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) // op1: the condition vector // op2: the left vector // op3: the right vector + GenTree* op1 = node->Op(1); GenTree* op2 = node->Op(2); GenTree* op3 = node->Op(3); - // If the condition vector comes from a hardware intrinsic that - // returns a per-element mask, we can optimize the entire - // conditional select to a single BlendVariable instruction - // (if supported by the architecture) + // When op1 isn't a mask, we want to directly optimize the case where op2 or op3 + // is zero as the operation being done is `(op2 & op1) | (op3 & ~op1)` and doing + // `x & 0` always produces `0`, so we can drop half the operation. + // + // However, if op1 is originally a mask, then we want to skip this as we'll convert + // to a BlendVariableMask instruction instead, which can then be embedded in most + // operations, improving code density and throughput. - // First, determine if the condition is a per-element mask - if (op1->IsVectorPerElementMask(simdBaseType, simdSize)) + if (op1->OperIsConvertMaskToVector()) { - // Next, determine if the target architecture supports BlendVariable - NamedIntrinsic blendVariableId = NI_Illegal; + // If op1 was originally a mask, then we want to prioritize BlendVariableMask + // as it not only avoids the conversion from mask to vector, but also allows + // embedded masking and other optimizations to kick in, improving code density - bool isOp1CvtMaskToVector = op1->OperIsConvertMaskToVector(); + NamedIntrinsic blendVariableId = NI_AVX512_BlendVariableMask; + GenTreeHWIntrinsic* cvtMaskToVector = op1->AsHWIntrinsic(); - if ((simdSize == 64) || isOp1CvtMaskToVector) - { - GenTree* maskNode; + GenTree* maskNode = cvtMaskToVector->Op(1); + BlockRange().Remove(op1); - if (isOp1CvtMaskToVector) - { - GenTreeHWIntrinsic* cvtMaskToVector = op1->AsHWIntrinsic(); + // We need to change the base type to match the underlying mask size to ensure + // the right instruction variant is picked. If the CndSel was for TYP_INT but + // the mask was for TYP_DOUBLE then we'd generate vpblendmd when we really want + // vpblendmq. Changing the size is fine since CndSel itself is bitwise and the + // the mask is just representing entire elements at a given size. - maskNode = cvtMaskToVector->Op(1); - BlockRange().Remove(op1); + simdBaseType = cvtMaskToVector->GetSimdBaseType(); - // We need to change the base type to match the underlying mask size to ensure - // the right instruction variant is picked. If the CndSel was for TYP_INT but - // the mask was for TYP_DOUBLE then we'd generate vpblendmd when we really want - // vpblendmq. Changing the size is fine since CndSel itself is bitwise and the - // the mask is just representing entire elements at a given size. 
+ resultNode = + m_compiler->gtNewSimdHWIntrinsicNode(simdType, op3, op2, op1, blendVariableId, simdBaseType, simdSize); + } + else if (op3->IsVectorZero()) + { + // The operation is (op2 & op1) | (zero & ~op1), so we can drop the second half + BlockRange().Remove(op3); + resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); + } + else if (op2->IsVectorZero()) + { + // The operation is (zero & op1) | (op3 & ~op1), so we can drop the first half + BlockRange().Remove(op2); + resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, op1, simdBaseType, simdSize); + } + else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512)) + { + // TernaryLogic is always single cycle with a lot of ports availalbe and while + // BlendVariable is often also a single cycle it can frequently be 2-3 instead. + // + // Additionally, for TYP_SIMD64 this avoids needing to convert from vector to mask + // which introduces 3-4 cycles of overhead on most hardware and would be strictly worse - simdBaseType = cvtMaskToVector->GetSimdBaseType(); - } - else - { - maskNode = m_compiler->gtNewSimdCvtVectorToMaskNode(TYP_MASK, op1, simdBaseType, simdSize); - newNodes.InsertAtEnd(maskNode); - } + GenTree* control = m_compiler->gtNewIconNode(0xCA); // (B & A) | (C & ~A) + newNodes.InsertAtEnd(control); - assert(maskNode->TypeIs(TYP_MASK)); - blendVariableId = NI_AVX512_BlendVariableMask; - op1 = maskNode; + resultNode = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op1, op2, op3, control, NI_AVX512_TernaryLogic, + simdBaseType, simdSize); + } + else if (op1->IsVectorPerElementMask(TYP_BYTE, simdSize)) + { + // If the condition vector comes from a hardware intrinsic that + // returns a per-element mask, we can optimize the entire + // conditional select to a single BlendVariable instruction + // (if supported by the architecture). + // + // We can check using `TYP_BYTE` as we can change the base type + // and still get correct codegen since the ConditionalSelect is + // itself bitwise. + + // Next, determine if the target architecture supports BlendVariable + NamedIntrinsic blendVariableId = NI_Illegal; + + bool isFloating = false; + + if (varTypeIsFloating(simdBaseType) && !op1->IsVectorPerElementMask(simdBaseType, simdSize)) + { + // For floating-point, we want to preserve the base type if the + // mask is also compatible with it, otherwise we need to fixup + // the base type to ensure the right instruction is selected for + // the mask. 
+ + simdBaseType = TYP_BYTE; } - else if (simdSize == 32) + + if (simdSize == 32) { // For Vector256 (simdSize == 32), BlendVariable for floats/doubles // is available on AVX, whereas other types (integrals) require AVX2 @@ -3568,50 +3620,26 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) if (resultNode == nullptr) { - if (op3->IsVectorZero()) - { - BlockRange().Remove(op3); + // We cannot optimize, so produce unoptimized instructions + assert(simdSize != 64); - resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); - } - else if (op2->IsVectorZero()) - { - BlockRange().Remove(op2); - - resultNode = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, op1, simdBaseType, simdSize); - } - else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512)) + // We'll need the mask twice + if (!op1->OperIs(GT_LCL_VAR)) { - // We can't use the mask, but we can emit a ternary logic node - GenTree* control = m_compiler->gtNewIconNode(0xCA); // (B & A) | (C & ~A) - newNodes.InsertAtEnd(control); - - resultNode = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op1, op2, op3, control, NI_AVX512_TernaryLogic, - simdBaseType, simdSize); + LIR::Use op1Use; + LIR::Use::MakeDummyUse(newNodes, op1, &op1Use); + op1Use.ReplaceWithLclVar(m_compiler); + op1 = op1Use.Def(); } - else - { - // We cannot optimize, so produce unoptimized instructions - assert(simdSize != 64); - - // We'll need the mask twice - if (!op1->OperIs(GT_LCL_VAR)) - { - LIR::Use op1Use; - LIR::Use::MakeDummyUse(newNodes, op1, &op1Use); - op1Use.ReplaceWithLclVar(m_compiler); - op1 = op1Use.Def(); - } - GenTree* tmp1 = m_compiler->gtClone(op1); - GenTree* tmp2 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); - GenTree* tmp3 = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, tmp1, simdBaseType, simdSize); - resultNode = m_compiler->gtNewSimdBinOpNode(GT_OR, simdType, tmp2, tmp3, simdBaseType, simdSize); + GenTree* tmp1 = m_compiler->gtClone(op1); + GenTree* tmp2 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, op2, simdBaseType, simdSize); + GenTree* tmp3 = m_compiler->gtNewSimdBinOpNode(GT_AND_NOT, simdType, op3, tmp1, simdBaseType, simdSize); + resultNode = m_compiler->gtNewSimdBinOpNode(GT_OR, simdType, tmp2, tmp3, simdBaseType, simdSize); - newNodes.InsertAtEnd(tmp1); - newNodes.InsertAtEnd(tmp2); - newNodes.InsertAtEnd(tmp3); - } + newNodes.InsertAtEnd(tmp1); + newNodes.InsertAtEnd(tmp2); + newNodes.InsertAtEnd(tmp3); } newNodes.InsertAtEnd(resultNode); From 73f53fc72d697b1f891282e505cf0911b1af1187 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 14 Apr 2026 09:46:29 -0700 Subject: [PATCH 6/7] Respond to PR feedback --- src/coreclr/jit/gentree.cpp | 5 +++++ src/coreclr/jit/lowerxarch.cpp | 9 +-------- src/coreclr/jit/valuenum.cpp | 5 +++++ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 5da82094d6440a..cd5a198ea57464 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -31598,6 +31598,11 @@ bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) unsigned intrinsicSimdSize = intrinsic->GetSimdSize(); var_types intrinsicSimdBaseType = intrinsic->GetSimdBaseType(); + if (intrinsicSimdSize != simdSize) + { + return false; + } + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) { // When producing a SIMD result, we need for it to diff --git a/src/coreclr/jit/lowerxarch.cpp 
b/src/coreclr/jit/lowerxarch.cpp index 42af1c1c3c04d8..8cd3a3c99fedeb 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3505,14 +3505,6 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) GenTree* op2 = node->Op(2); GenTree* op3 = node->Op(3); - // When op1 isn't a mask, we want to directly optimize the case where op2 or op3 - // is zero as the operation being done is `(op2 & op1) | (op3 & ~op1)` and doing - // `x & 0` always produces `0`, so we can drop half the operation. - // - // However, if op1 is originally a mask, then we want to skip this as we'll convert - // to a BlendVariableMask instruction instead, which can then be embedded in most - // operations, improving code density and throughput. - if (op1->OperIsConvertMaskToVector()) { // If op1 was originally a mask, then we want to prioritize BlendVariableMask @@ -3524,6 +3516,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) GenTree* maskNode = cvtMaskToVector->Op(1); BlockRange().Remove(op1); + op1 = maskNode; // We need to change the base type to match the underlying mask size to ensure // the right instruction variant is picked. If the CndSel was for TYP_INT but diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 4cd3f9e35e9f3e..92d75c08c76bef 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -9280,6 +9280,11 @@ bool ValueNumStore::IsVectorPerElementMask(ValueNum vn, var_types simdBaseType, return false; } + if (intrinsicSimdSize != simdSize) + { + return false; + } + if (HWIntrinsicInfo::ReturnsPerElementMask(intrinsicId)) { // When producing a SIMD result, we need for it to From c16ea95323b32831e7f941b36c5a38453936ae91 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 14 Apr 2026 11:23:29 -0700 Subject: [PATCH 7/7] Respond to PR feedback --- src/coreclr/jit/lowerxarch.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 8cd3a3c99fedeb..8fbed3c79d41d3 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2714,7 +2714,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) // If either of the value operands is const zero and the mask is either all // zeros or all ones per-element, we can optimize down to AND or AND_NOT. - if (op3->IsVectorPerElementMask(simdBaseType, simdSize) && (op1->IsVectorZero() || op2->IsVectorZero())) + if (op3->IsVectorPerElementMask(TYP_BYTE, simdSize) && (op1->IsVectorZero() || op2->IsVectorZero())) { var_types simdType = node->TypeGet(); GenTree* binOp = nullptr; @@ -3522,7 +3522,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) // the right instruction variant is picked. If the CndSel was for TYP_INT but // the mask was for TYP_DOUBLE then we'd generate vpblendmd when we really want // vpblendmq. Changing the size is fine since CndSel itself is bitwise and the - // the mask is just representing entire elements at a given size. + // mask is just representing entire elements at a given size. 
simdBaseType = cvtMaskToVector->GetSimdBaseType(); @@ -3543,7 +3543,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) } else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - // TernaryLogic is always single cycle with a lot of ports availalbe and while + // TernaryLogic is always single cycle with a lot of ports available and while // BlendVariable is often also a single cycle it can frequently be 2-3 instead. // // Additionally, for TYP_SIMD64 this avoids needing to convert from vector to mask @@ -3569,8 +3569,6 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) // Next, determine if the target architecture supports BlendVariable NamedIntrinsic blendVariableId = NI_Illegal; - bool isFloating = false; - if (varTypeIsFloating(simdBaseType) && !op1->IsVectorPerElementMask(simdBaseType, simdSize)) { // For floating-point, we want to preserve the base type if the
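Aside: the "same size or larger base type" rule that patches 1 and 4 settle on for SIMD-producing intrinsics can be checked mechanically. The standalone sketch below uses a simplified shape-alike of the ElementsAreAllBitsSetOrZero template from simd.h, written for this illustration. It shows why a ushort-lane mask reinterpreted as bytes still passes, while a byte-lane mask reinterpreted as ushorts can produce the mixed 0x00FF/0xFF00 lanes the comments warn about; because every lane is homogeneous, the outcome does not depend on host endianness:

    #include <cstdint>
    #include <cstring>

    template <typename T>
    static bool ElementsAreAllBitsSetOrZero(const T* elems, size_t count)
    {
        for (size_t i = 0; i < count; i++)
        {
            // Each lane must be all-bits-set or all-bits-clear.
            if ((elems[i] != static_cast<T>(0)) && (elems[i] != static_cast<T>(~T(0))))
            {
                return false;
            }
        }
        return true;
    }

    int main()
    {
        // ushort lanes of 0xFFFF/0x0000 viewed as bytes are still 0xFF/0x00 each.
        uint16_t wide[4] = {0xFFFF, 0x0000, 0xFFFF, 0xFFFF};
        uint8_t  asBytes[8];
        std::memcpy(asBytes, wide, sizeof(wide));
        bool widerIsOk = ElementsAreAllBitsSetOrZero(asBytes, 8); // true

        // byte lanes viewed as ushorts can yield a mixed lane like 0x00FF.
        uint8_t  narrow[8] = {0xFF, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF};
        uint16_t asShorts[4];
        std::memcpy(asShorts, narrow, sizeof(narrow));
        bool narrowerIsOk = ElementsAreAllBitsSetOrZero(asShorts, 4); // false

        return (widerIsOk && !narrowerIsOk) ? 0 : 1;
    }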