From 63447df7e01ae4ae7bb57d83983a463cc3e2f5cc Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Fri, 20 Mar 2026 01:40:26 -0700 Subject: [PATCH 1/3] remove dead vector2/3 lowering code --- src/coreclr/jit/gentree.cpp | 2 +- src/coreclr/jit/hwintrinsicarm64.cpp | 2 +- src/coreclr/jit/lowerarmarch.cpp | 66 +---------- src/coreclr/jit/lowerxarch.cpp | 168 ++------------------------- 4 files changed, 13 insertions(+), 225 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 3dfca0ba725a0a..457432ec1cb90b 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -30420,7 +30420,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp, else #endif // TARGET_XARCH { - assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16)); + assert((simdSize == 8) || (simdSize == 16)); #if defined(TARGET_ARM64) assert(!isScalar || (simdSize == 8)); diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index c930f30bc3c8cc..3c47fe9a34173c 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -1344,7 +1344,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdDotProdNode(simdType, op1, op2, simdBaseType, simdSize); - retNode = gtNewSimdGetElementNode(retType, retNode, gtNewIconNode(0), simdBaseType, simdSize); + retNode = gtNewSimdToScalarNode(retType, retNode, simdBaseType, simdSize); } break; } diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 2f7ec732acb297..cd21b6e13876f0 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -2093,7 +2093,7 @@ bool Lowering::IsValidConstForMovImm(GenTreeHWIntrinsic* node) } //---------------------------------------------------------------------------------------------- -// Lowering::LowerHWIntrinsicCmpOp: Lowers a Vector128 or Vector256 comparison intrinsic +// Lowering::LowerHWIntrinsicCmpOp: Lowers a Vector64 or Vector128 comparison intrinsic // // Arguments: // node - The hardware intrinsic node. @@ -2221,26 +2221,6 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm BlockRange().InsertBefore(node, cmp); LowerNode(cmp); - if ((simdBaseType == TYP_FLOAT) && (simdSize == 12)) - { - // For TYP_SIMD12 we don't want the upper bits to participate in the comparison. So, we will insert all ones - // into those bits of the result, "as if" the upper bits are equal. Then if all lower bits are equal, we get the - // expected all-ones result, and will get the expected 0's only where there are non-matching bits. - - GenTree* idxCns = m_compiler->gtNewIconNode(3, TYP_INT); - BlockRange().InsertAfter(cmp, idxCns); - - GenTree* insCns = m_compiler->gtNewIconNode(-1, TYP_INT); - BlockRange().InsertAfter(idxCns, insCns); - - GenTree* tmp = - m_compiler->gtNewSimdHWIntrinsicNode(simdType, cmp, idxCns, insCns, NI_AdvSimd_Insert, TYP_INT, simdSize); - BlockRange().InsertAfter(insCns, tmp); - LowerNode(tmp); - - cmp = tmp; - } - if (simdSize != 8) // we don't need compression for Vector64 { GenTree* msk; @@ -2339,7 +2319,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) if (isConstant) { - assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16)); + assert((simdSize == 8) || (simdSize == 16)); for (GenTree* arg : node->Operands()) { @@ -2497,46 +2477,6 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) GenTree* tmp1 = nullptr; GenTree* tmp2 = nullptr; - if (simdSize == 12) - { - assert(simdBaseType == TYP_FLOAT); - - // For 12 byte SIMD, we need to clear the upper 4 bytes: - // idx = CNS_INT int 0x03 - // tmp1 = * CNS_DBL float 0.0 - // /--* op1 simd16 - // +--* idx int - // +--* tmp1 simd16 - // op1 = * HWINTRINSIC simd16 T Insert - // ... - - // This is roughly the following managed code: - // op1 = AdvSimd.Insert(op1, 0x03, 0.0f); - // ... - - idx = m_compiler->gtNewIconNode(0x03, TYP_INT); - BlockRange().InsertAfter(op1, idx); - - tmp1 = m_compiler->gtNewZeroConNode(TYP_FLOAT); - BlockRange().InsertAfter(idx, tmp1); - LowerNode(tmp1); - - op1 = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op1, idx, tmp1, NI_AdvSimd_Insert, simdBaseType, simdSize); - BlockRange().InsertAfter(tmp1, op1); - LowerNode(op1); - - idx = m_compiler->gtNewIconNode(0x03, TYP_INT); - BlockRange().InsertAfter(op2, idx); - - tmp2 = m_compiler->gtNewZeroConNode(TYP_FLOAT); - BlockRange().InsertAfter(idx, tmp2); - LowerNode(tmp2); - - op2 = m_compiler->gtNewSimdHWIntrinsicNode(simdType, op2, idx, tmp2, NI_AdvSimd_Insert, simdBaseType, simdSize); - BlockRange().InsertAfter(tmp2, op2); - LowerNode(op2); - } - // We will be constructing the following parts: // ... // /--* op1 simd16 @@ -2615,7 +2555,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) } else { - assert((simdSize == 12) || (simdSize == 16)); + assert(simdSize == 16); // We will be constructing the following parts: // ... diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index a16db3113fc366..0ee6a0eaa3d30c 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3373,32 +3373,11 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm case TYP_USHORT: case TYP_INT: case TYP_UINT: - { - cmpType = simdBaseType; - mskType = TYP_UBYTE; - - if (simdSize == 32) - { - cmpIntrinsic = NI_AVX2_CompareEqual; - mskIntrinsic = NI_AVX2_MoveMask; - mskConstant = -1; - } - else - { - assert(simdSize == 16); - - cmpIntrinsic = NI_X86Base_CompareEqual; - mskIntrinsic = NI_X86Base_MoveMask; - mskConstant = 0xFFFF; - } - break; - } - case TYP_LONG: case TYP_ULONG: { - mskType = TYP_UBYTE; cmpType = simdBaseType; + mskType = TYP_UBYTE; if (simdSize == 32) { @@ -3430,22 +3409,11 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm } else { + assert(simdSize == 16); + cmpIntrinsic = NI_X86Base_CompareEqual; mskIntrinsic = NI_X86Base_MoveMask; - - if (simdSize == 16) - { - mskConstant = 0xF; - } - else if (simdSize == 12) - { - mskConstant = 0x7; - } - else - { - assert(simdSize == 8); - mskConstant = 0x3; - } + mskConstant = 0xF; } break; } @@ -3489,20 +3457,6 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm GenTree* mskCns = m_compiler->gtNewIconNode(mskConstant, TYP_INT); BlockRange().InsertAfter(msk, mskCns); - if ((simdBaseType == TYP_FLOAT) && (simdSize < 16)) - { - // For TYP_SIMD8 and TYP_SIMD12 we need to clear the upper bits and can't assume their value - - GenTree* tmp = m_compiler->gtNewOperNode(GT_AND, TYP_INT, msk, mskCns); - BlockRange().InsertAfter(mskCns, tmp); - LowerNode(tmp); - - msk = tmp; - - mskCns = m_compiler->gtNewIconNode(mskConstant, TYP_INT); - BlockRange().InsertAfter(msk, mskCns); - } - node->ChangeOper(cmpOp); node->ChangeType(TYP_INT); node->AsOp()->gtOp1 = msk; @@ -4185,7 +4139,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) if (isConstant) { - assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16) || (simdSize == 32) || (simdSize == 64)); + assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64)); for (GenTree* arg : node->Operands()) { @@ -5829,19 +5783,9 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) // var tmp3 = Avx.DotProduct(op1, op2, 0xFF); // return tmp3.ToScalar(); - if (simdSize == 8) - { - idx = m_compiler->gtNewIconNode(0x3F, TYP_INT); - } - else if (simdSize == 12) - { - idx = m_compiler->gtNewIconNode(0x7F, TYP_INT); - } - else - { - assert(simdSize == 16); - idx = m_compiler->gtNewIconNode(0xFF, TYP_INT); - } + assert(simdSize == 16); + + idx = m_compiler->gtNewIconNode(0xFF, TYP_INT); BlockRange().InsertBefore(node, idx); if (varTypeIsSIMD(node->gtType)) @@ -5913,91 +5857,6 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) unreached(); } } - - if (simdSize == 8) - { - assert(simdBaseType == TYP_FLOAT); - - // If simdSize == 8 then we have only two elements, not the 4 that we got from getSIMDVectorLength, - // which we gave a simdSize of 16. So, we set the simd16Count to 2 so that only 1 hadd will - // be emitted rather than 2, so that the upper two elements will be ignored. - - simd16Count = 2; - } - else if (simdSize == 12) - { - assert(simdBaseType == TYP_FLOAT); - - // We need to mask off the most significant element to avoid the shuffle + add - // from including it in the computed result. We need to do this for both op1 and - // op2 in case one of them is `NaN` (because Zero * NaN == NaN) - - simd16_t simd16Val = {}; - - simd16Val.i32[0] = -1; - simd16Val.i32[1] = -1; - simd16Val.i32[2] = -1; - simd16Val.i32[3] = +0; - - simdType = TYP_SIMD16; - simdSize = 16; - - // We will be constructing the following parts: - // ... - // +--* CNS_INT int -1 - // +--* CNS_INT int -1 - // +--* CNS_INT int -1 - // +--* CNS_INT int 0 - // tmp1 = * HWINTRINSIC simd16 T Create - // /--* op1 simd16 - // +--* tmp1 simd16 - // op1 = * HWINTRINSIC simd16 T And - // ... - - // This is roughly the following managed code: - // ... - // tmp1 = Vector128.Create(-1, -1, -1, 0); - // op1 = Sse.And(op1, tmp1); - // ... - - GenTreeVecCon* vecCon1 = m_compiler->gtNewVconNode(simdType); - memcpy(&vecCon1->gtSimdVal, &simd16Val, sizeof(simd16_t)); - BlockRange().InsertAfter(op1, vecCon1); - - op1 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op1, vecCon1, simdBaseType, simdSize); - BlockRange().InsertAfter(vecCon1, op1); - - LowerNode(vecCon1); - LowerNode(op1); - - // We will be constructing the following parts: - // ... - // +--* CNS_INT int -1 - // +--* CNS_INT int -1 - // +--* CNS_INT int -1 - // +--* CNS_INT int 0 - // tmp2 = * HWINTRINSIC simd16 T Create - // /--* op2 simd16 - // +--* tmp2 simd16 - // op2 = * HWINTRINSIC simd16 T And - // ... - - // This is roughly the following managed code: - // ... - // tmp2 = Vector128.Create(-1, -1, -1, 0); - // op2 = Sse.And(op2, tmp2); - // ... - - GenTreeVecCon* vecCon2 = m_compiler->gtNewVconNode(simdType); - memcpy(&vecCon2->gtSimdVal, &simd16Val, sizeof(simd16_t)); - BlockRange().InsertAfter(op2, vecCon2); - - op2 = m_compiler->gtNewSimdBinOpNode(GT_AND, simdType, op2, vecCon2, simdBaseType, simdSize); - BlockRange().InsertAfter(vecCon2, op2); - - LowerNode(vecCon2); - LowerNode(op2); - } } // We will be constructing the following parts: @@ -9284,17 +9143,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) } } - if ((simdSize == 8) || (simdSize == 12)) - { - // We want to handle GetElement/ToScalar still for Vector2/3 - if (!HWIntrinsicInfo::IsVectorToScalar(intrinsicId) && !HWIntrinsicInfo::IsVectorGetElement(intrinsicId)) - { - // TODO-XArch-CQ: Ideally we would key this off of the size the containing node - // expects vs the size node actually is or would be if spilled to the stack - return; - } - } - // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained const bool isCommutative = node->isCommutativeHWIntrinsic(); From 4c9f71bca044c2572a6292995c957fc0d726974e Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Sat, 21 Mar 2026 17:18:04 -0700 Subject: [PATCH 2/3] remove one more unreachable condition --- src/coreclr/jit/lowerarmarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index cd21b6e13876f0..ffdd8b9729d73b 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -2145,7 +2145,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm } // Special case: "vec ==/!= zero_vector" - if (!varTypeIsFloating(simdBaseType) && (op != nullptr) && (simdSize != 12)) + if (!varTypeIsFloating(simdBaseType) && (op != nullptr)) { GenTree* cmp = op; if (simdSize != 8) // we don't need compression for Vector64 From e6b533c56a7dc45e737f4d84d06af4edc72c2ea0 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Sat, 21 Mar 2026 17:30:32 -0700 Subject: [PATCH 3/3] and another --- src/coreclr/jit/lowerxarch.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 0ee6a0eaa3d30c..2b181486ee7d75 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2969,10 +2969,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm } } - // TODO-XARCH-AVX512: We should handle TYP_SIMD12 here under the EVEX path, but doing - // so will require us to account for the unused 4th element. - - if ((simdType != TYP_SIMD12) && m_compiler->canUseEvexEncoding()) + if (m_compiler->canUseEvexEncoding()) { // The EVEX encoded versions of the comparison instructions all return a kmask //