From d264c75c984ee91ad47decb99eada7fab53c6fc3 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 30 Aug 2023 17:43:47 -0700 Subject: [PATCH 1/8] Use `Broadcasti128` --- src/coreclr/jit/gentree.cpp | 3 +- src/coreclr/jit/hwintrinsic.cpp | 1 + src/coreclr/jit/hwintrinsiclistxarch.h | 2 +- src/coreclr/jit/lower.h | 1 + src/coreclr/jit/lowerxarch.cpp | 71 +++++++++++++++++++++++++- 5 files changed, 74 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 1d5ed98d0b4910..3383c57e7d6650 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -25514,6 +25514,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: + case NI_AVX512F_BroadcastVector128ToVector512: if (GetAuxiliaryJitType() == CORINFO_TYPE_PTR) { addr = Op(1); @@ -25557,7 +25558,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const if (addr != nullptr) { - assert(varTypeIsI(addr)); + assert((varTypeIsI(addr) || addr->IsVectorConst())); return true; } diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 287b214d0f40b8..221477e724a564 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -1426,6 +1426,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: + case NI_AVX512F_BroadcastVector128ToVector512: { // These intrinsics have both pointer and vector overloads // We want to be able to differentiate between them so lets diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 0300c6b7ef0e24..5c32d5890a90ed 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -835,7 +835,7 @@ HARDWARE_INTRINSIC(AVX512F, And, HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpblendmd, INS_vpblendmd, INS_vpblendmq, INS_vpblendmq, INS_vblendmps, INS_vblendmpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_MemoryLoad, HW_Flag_NoFlag) 
HARDWARE_INTRINSIC(AVX512F, CompareEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512F, CompareGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 157ed28e70fbb9..d4cb20b3ef1e7e 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -118,6 +118,7 @@ class Lowering final : public Phase void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node); #ifdef TARGET_XARCH void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode); + bool TryCompressConstVecData(GenTreeStoreInd* node); #endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 3daa96268d7213..bad90a6a1e00ff 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -96,12 +96,17 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) } } +#if defined(FEATURE_HW_INTRINSICS) + if(comp->IsBaselineVector512IsaSupportedOpportunistically() && node->Data()->OperIs(GT_CNS_VEC) && node->Data()->AsVecCon()->TypeIs(TYP_SIMD64)) + { + TryCompressConstVecData(node); + } +#endif // Optimization: do not unnecessarily zero-extend the result of setcc. if (varTypeIsByte(node) && (node->Data()->OperIsCompare() || node->Data()->OperIs(GT_SETCC))) { node->Data()->ChangeType(TYP_BYTE); } - ContainCheckStoreIndir(node); } @@ -7663,6 +7668,20 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { switch (parentIntrinsicId) { + case NI_AVX512F_BroadcastVector128ToVector512: + { + if(parentNode->OperIsMemoryLoad()) + { + supportsGeneralLoads = !childNode->OperIsHWIntrinsic(); + break; + } + else + { + supportsGeneralLoads = true; + break; + } + } + case NI_SSE41_ConvertToVector128Int16: case NI_SSE41_ConvertToVector128Int32: case NI_SSE41_ConvertToVector128Int64: @@ -8512,6 +8531,43 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, MakeSrcContained(parentNode, childNode); } +//---------------------------------------------------------------------------------------------- +// TryCompressConstVecData: +// Try to compress the constant vector input if it has duplicated parts and can be optimized by +// broadcast +// +// Arguments: +// node - the storeind node. +// +// Return: +// return true if compress success. 
+bool Lowering::TryCompressConstVecData(GenTreeStoreInd* node) +{ + assert(node->Data()->OperIs(GT_CNS_VEC)); + GenTreeVecCon* vecCon = node->Data()->AsVecCon(); + // Try use broadcasti128 + if(vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[1] && + vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[2] && + vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[3]) + { + simd16_t simd16Val = {}; + simd16Val.f64[0] = vecCon->gtSimd64Val.f64[0]; + simd16Val.f64[1] = vecCon->gtSimd64Val.f64[1]; + GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16); + memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); + // GenTreeIndir* compressedVecConIndir = comp->gtNewIndir(TYP_I_IMPL, compressedVecCon, GTF_IND_NONFAULTING | GTF_IND_INVARIANT); + BlockRange().InsertBefore(node->Data(), compressedVecCon); + // BlockRange().InsertBefore(node->Data(), compressedVecConIndir); + BlockRange().Remove(vecCon); + GenTreeHWIntrinsic* broadcast128 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, NI_AVX512F_BroadcastVector128ToVector512, CORINFO_TYPE_UINT, 64); + BlockRange().InsertBefore(node, broadcast128); + node->Data() = broadcast128; + LowerNode(broadcast128); + return true; + } + return false; +} + //---------------------------------------------------------------------------------------------- // ContainCheckHWIntrinsicAddr: Perform containment analysis for an address operand of a hardware // intrinsic node. @@ -8708,6 +8764,18 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_AVX512F_BroadcastVector128ToVector512: + { + if(node->OperIsMemoryLoad()) + { + ContainCheckHWIntrinsicAddr(node, op1); + return; + } + + assert(op1->OperIs(GT_CNS_VEC)); + break; + } + case NI_AVX512F_ConvertToVector256Int32: case NI_AVX512F_ConvertToVector256UInt32: case NI_AVX512F_VL_ConvertToVector128UInt32: @@ -8768,7 +8836,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) assert(!node->OperIsMemoryLoad()); bool supportsRegOptional = false; - if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) { MakeSrcContained(node, op1); From baf3bcc0f1663d27b46dbd9cb0fbc2485e33c1ba Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 31 Aug 2023 13:52:42 -0700 Subject: [PATCH 2/8] remove unneeded changes --- src/coreclr/jit/gentree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 3383c57e7d6650..74c6d496b5872c 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -25558,7 +25558,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const if (addr != nullptr) { - assert((varTypeIsI(addr) || addr->IsVectorConst())); + assert(varTypeIsI(addr)); return true; } From 698b72e21b2542ea8bfe8ef59bfe97d3d3e04475 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Mon, 11 Sep 2023 12:38:26 -0700 Subject: [PATCH 3/8] Nit: remove some unnecessary comments and a line deletion.
--- src/coreclr/jit/lowerxarch.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index bad90a6a1e00ff..f24b15e7d346b7 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -8555,9 +8555,7 @@ bool Lowering::TryCompressConstVecData(GenTreeStoreInd* node) simd16Val.f64[1] = vecCon->gtSimd64Val.f64[1]; GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16); memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); - // GenTreeIndir* compressedVecConIndir = comp->gtNewIndir(TYP_I_IMPL, compressedVecCon, GTF_IND_NONFAULTING | GTF_IND_INVARIANT); BlockRange().InsertBefore(node->Data(), compressedVecCon); - // BlockRange().InsertBefore(node->Data(), compressedVecConIndir); BlockRange().Remove(vecCon); GenTreeHWIntrinsic* broadcast128 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, NI_AVX512F_BroadcastVector128ToVector512, CORINFO_TYPE_UINT, 64); BlockRange().InsertBefore(node, broadcast128); @@ -8836,6 +8834,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) assert(!node->OperIsMemoryLoad()); bool supportsRegOptional = false; + if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) { MakeSrcContained(node, op1); From afda4de96d2cb50970a68f4e9cc0c285009756e6 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 12 Sep 2023 10:52:32 -0700 Subject: [PATCH 4/8] filter out the AllBitsSet and Zero vector from the opts --- src/coreclr/jit/lowerxarch.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index f24b15e7d346b7..881b7fcf58009c 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -99,7 +99,11 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) #if defined(FEATURE_HW_INTRINSICS) if(comp->IsBaselineVector512IsaSupportedOpportunistically() && node->Data()->OperIs(GT_CNS_VEC) && node->Data()->AsVecCon()->TypeIs(TYP_SIMD64)) { - TryCompressConstVecData(node); + if(!node->Data()->AsVecCon()->IsAllBitsSet() && !node->Data()->AsVecCon()->IsZero()) + { + // To avoid some unexpected regression, this optimization only applies to non-all 1/0 constant vectors. + TryCompressConstVecData(node); + } } #endif // Optimization: do not unnecessarily zero-extend the result of setcc. 
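The shape of the optimization after patches 1-4 is easiest to see outside the JIT. Below is a minimal standalone sketch in plain C++ (`Lane128`, `Simd64`, and `IsBroadcastable128` are hypothetical stand-ins for the JIT's `simd64_t` and the inline lane comparison, not real JIT types) of the condition under which `TryCompressConstVecData` rewrites a 64-byte constant store: all four 128-bit lanes must be identical, and, after patch 4, the constant must not be all-zeros or all-ones, since those values already have cheaper encodings.

    #include <cstdint>
    #include <cstring>

    // Hypothetical model of the JIT's simd64_t: four 128-bit lanes.
    struct Lane128
    {
        uint64_t lo;
        uint64_t hi;
    };

    struct Simd64
    {
        Lane128 v128[4];
    };

    // Mirrors the lane-equality test in Lowering::TryCompressConstVecData:
    // a 64-byte constant can be replaced by a 16-byte constant plus a
    // BroadcastVector128ToVector512 node when every lane equals lane 0.
    static bool IsBroadcastable128(const Simd64& val)
    {
        return std::memcmp(&val.v128[0], &val.v128[1], sizeof(Lane128)) == 0 &&
               std::memcmp(&val.v128[0], &val.v128[2], sizeof(Lane128)) == 0 &&
               std::memcmp(&val.v128[0], &val.v128[3], sizeof(Lane128)) == 0;
    }

When the test passes, the constant-pool entry backing the store shrinks from 64 bytes to 16: codegen can materialize the zmm value with a single vbroadcasti128/vbroadcastf128 from the 16-byte constant rather than loading a full 64-byte constant.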
From e9d6ca795bc1b777e8d86e7fc7a476fb42844837 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 13 Sep 2023 10:31:36 -0700 Subject: [PATCH 5/8] Apply format patch --- src/coreclr/jit/lowerxarch.cpp | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 881b7fcf58009c..dbe08b51a98fe3 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -97,9 +97,10 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) } #if defined(FEATURE_HW_INTRINSICS) - if(comp->IsBaselineVector512IsaSupportedOpportunistically() && node->Data()->OperIs(GT_CNS_VEC) && node->Data()->AsVecCon()->TypeIs(TYP_SIMD64)) + if (comp->IsBaselineVector512IsaSupportedOpportunistically() && node->Data()->OperIs(GT_CNS_VEC) && + node->Data()->AsVecCon()->TypeIs(TYP_SIMD64)) { - if(!node->Data()->AsVecCon()->IsAllBitsSet() && !node->Data()->AsVecCon()->IsZero()) + if (!node->Data()->AsVecCon()->IsAllBitsSet() && !node->Data()->AsVecCon()->IsZero()) { // To avoid some unexpected regression, this optimization only applies to non-all 1/0 constant vectors. TryCompressConstVecData(node); @@ -7674,7 +7675,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { case NI_AVX512F_BroadcastVector128ToVector512: { - if(parentNode->OperIsMemoryLoad()) + if (parentNode->OperIsMemoryLoad()) { supportsGeneralLoads = !childNode->OperIsHWIntrinsic(); break; @@ -8550,18 +8551,20 @@ bool Lowering::TryCompressConstVecData(GenTreeStoreInd* node) assert(node->Data()->OperIs(GT_CNS_VEC)); GenTreeVecCon* vecCon = node->Data()->AsVecCon(); // Try use broadcasti128 - if(vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[1] && - vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[2] && - vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[3]) + if (vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[1] && + vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[2] && + vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[3]) { - simd16_t simd16Val = {}; - simd16Val.f64[0] = vecCon->gtSimd64Val.f64[0]; - simd16Val.f64[1] = vecCon->gtSimd64Val.f64[1]; + simd16_t simd16Val = {}; + simd16Val.f64[0] = vecCon->gtSimd64Val.f64[0]; + simd16Val.f64[1] = vecCon->gtSimd64Val.f64[1]; GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16); memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); BlockRange().InsertBefore(node->Data(), compressedVecCon); BlockRange().Remove(vecCon); - GenTreeHWIntrinsic* broadcast128 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, NI_AVX512F_BroadcastVector128ToVector512, CORINFO_TYPE_UINT, 64); + GenTreeHWIntrinsic* broadcast128 = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, NI_AVX512F_BroadcastVector128ToVector512, + CORINFO_TYPE_UINT, 64); BlockRange().InsertBefore(node, broadcast128); node->Data() = broadcast128; LowerNode(broadcast128); @@ -8768,7 +8771,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) case NI_AVX512F_BroadcastVector128ToVector512: { - if(node->OperIsMemoryLoad()) + if (node->OperIsMemoryLoad()) { ContainCheckHWIntrinsicAddr(node, op1); return; @@ -8838,7 +8841,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) assert(!node->OperIsMemoryLoad()); bool supportsRegOptional = false; - + if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) { MakeSrcContained(node, op1); From 4a3bf08dfb03a17736053210a76303ba66af3dde Mon Sep 17 
00:00:00 2001 From: Ruihan-Yin Date: Wed, 20 Sep 2023 17:52:30 -0700 Subject: [PATCH 6/8] extend the coverage to V512->V256 and V256->V128 --- src/coreclr/jit/gentree.cpp | 2 + src/coreclr/jit/hwintrinsic.cpp | 2 + src/coreclr/jit/hwintrinsiclistxarch.h | 4 +- src/coreclr/jit/lower.h | 2 +- src/coreclr/jit/lowerxarch.cpp | 111 ++++++++++++++++++------- 5 files changed, 90 insertions(+), 31 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 74c6d496b5872c..4fda3845fa0891 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -25514,7 +25514,9 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: + case NI_AVX2_BroadcastVector128ToVector256: case NI_AVX512F_BroadcastVector128ToVector512: + case NI_AVX512F_BroadcastVector256ToVector512: if (GetAuxiliaryJitType() == CORINFO_TYPE_PTR) { addr = Op(1); diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 221477e724a564..06803b3a76894c 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -1426,7 +1426,9 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: + case NI_AVX2_BroadcastVector128ToVector256: case NI_AVX512F_BroadcastVector128ToVector512: + case NI_AVX512F_BroadcastVector256ToVector512: { // These intrinsics have both pointer and vector overloads // We want to be able to differentiate between them so lets diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 5c32d5890a90ed..bddcb05ad7fce4 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -766,7 +766,7 @@ HARDWARE_INTRINSIC(AVX2, Blend, HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, false, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector128, 16, 1, true, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector256, 32, 1, true, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, false, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, false, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, true, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, 
INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, true, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, true, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) @@ -836,7 +836,7 @@ HARDWARE_INTRINSIC(AVX512F, AndNot, HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpblendmd, INS_vpblendmd, INS_vpblendmq, INS_vpblendmq, INS_vblendmps, INS_vblendmpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX512F, CompareEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512F, CompareGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index d4cb20b3ef1e7e..fcde0e67292e8c 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -118,7 +118,7 @@ class Lowering final : public Phase void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node); #ifdef TARGET_XARCH void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode); - bool TryCompressConstVecData(GenTreeStoreInd* node); + void TryCompressConstVecData(GenTreeStoreInd* node); #endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lowerxarch.cpp 
b/src/coreclr/jit/lowerxarch.cpp index dbe08b51a98fe3..61efd510f04966 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -96,23 +96,34 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) } } + // Optimization: do not unnecessarily zero-extend the result of setcc. + if (varTypeIsByte(node) && (node->Data()->OperIsCompare() || node->Data()->OperIs(GT_SETCC))) + { + node->Data()->ChangeType(TYP_BYTE); + } + ContainCheckStoreIndir(node); + #if defined(FEATURE_HW_INTRINSICS) - if (comp->IsBaselineVector512IsaSupportedOpportunistically() && node->Data()->OperIs(GT_CNS_VEC) && - node->Data()->AsVecCon()->TypeIs(TYP_SIMD64)) + if (comp->IsBaselineVector512IsaSupportedOpportunistically() || comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) { - if (!node->Data()->AsVecCon()->IsAllBitsSet() && !node->Data()->AsVecCon()->IsZero()) + if (!node->Data()->OperIs(GT_CNS_VEC)) + { + return; + } + + if(!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32) && !node->Data()->AsVecCon()->TypeIs(TYP_SIMD64)) + { + return; + } + if (node->Data()->AsVecCon()->IsAllBitsSet() || node->Data()->AsVecCon()->IsZero()) { // To avoid some unexpected regression, this optimization only applies to non-all 1/0 constant vectors. - TryCompressConstVecData(node); + return; } + + TryCompressConstVecData(node); } #endif - // Optimization: do not unnecessarily zero-extend the result of setcc. - if (varTypeIsByte(node) && (node->Data()->OperIsCompare() || node->Data()->OperIs(GT_SETCC))) - { - node->Data()->ChangeType(TYP_BYTE); - } - ContainCheckStoreIndir(node); } //---------------------------------------------------------------------------------------------- @@ -7673,7 +7684,9 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { switch (parentIntrinsicId) { + case NI_AVX2_BroadcastVector128ToVector256: case NI_AVX512F_BroadcastVector128ToVector512: + case NI_AVX512F_BroadcastVector256ToVector512: { if (parentNode->OperIsMemoryLoad()) { @@ -8546,31 +8559,71 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, // // Return: // return true if compress success. 
-bool Lowering::TryCompressConstVecData(GenTreeStoreInd* node) +void Lowering::TryCompressConstVecData(GenTreeStoreInd* node) { assert(node->Data()->OperIs(GT_CNS_VEC)); GenTreeVecCon* vecCon = node->Data()->AsVecCon(); - // Try use broadcasti128 - if (vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[1] && + GenTreeHWIntrinsic* broadcast = nullptr; + + if(vecCon->TypeIs(TYP_SIMD32)) + { + assert(comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)); + if(vecCon->gtSimd32Val.v128[0] == vecCon->gtSimdVal.v128[1]) + { + simd16_t simd16Val = {}; + simd16Val.f64[0] = vecCon->gtSimd32Val.f64[0]; + simd16Val.f64[1] = vecCon->gtSimd32Val.f64[1]; + GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16); + memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); + BlockRange().InsertBefore(node->Data(), compressedVecCon); + BlockRange().Remove(vecCon); + broadcast = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, compressedVecCon, NI_AVX2_BroadcastVector128ToVector256, + CORINFO_TYPE_UINT, 32); + } + } + else + { + assert(vecCon->TypeIs(TYP_SIMD64)); + assert(comp->IsBaselineVector512IsaSupportedOpportunistically()); + if (vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[1] && vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[2] && vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[3]) + { + simd16_t simd16Val = {}; + simd16Val.f64[0] = vecCon->gtSimd64Val.f64[0]; + simd16Val.f64[1] = vecCon->gtSimd64Val.f64[1]; + GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16); + memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); + BlockRange().InsertBefore(node->Data(), compressedVecCon); + BlockRange().Remove(vecCon); + broadcast = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, NI_AVX512F_BroadcastVector128ToVector512, + CORINFO_TYPE_UINT, 64); + } + else if(vecCon->gtSimd64Val.v256[0] == vecCon->gtSimd64Val.v256[1]) + { + simd32_t simd32Val = {}; + simd32Val.v128[0] = vecCon->gtSimd32Val.v128[0]; + simd32Val.v128[1] = vecCon->gtSimd32Val.v128[1]; + GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD32); + memcpy(&compressedVecCon->gtSimdVal, &simd32Val, sizeof(simd32_t)); + BlockRange().InsertBefore(node->Data(), compressedVecCon); + BlockRange().Remove(vecCon); + broadcast = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, NI_AVX512F_BroadcastVector256ToVector512, + CORINFO_TYPE_UINT, 64); + } + } + + if(broadcast == nullptr) { - simd16_t simd16Val = {}; - simd16Val.f64[0] = vecCon->gtSimd64Val.f64[0]; - simd16Val.f64[1] = vecCon->gtSimd64Val.f64[1]; - GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD16); - memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); - BlockRange().InsertBefore(node->Data(), compressedVecCon); - BlockRange().Remove(vecCon); - GenTreeHWIntrinsic* broadcast128 = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, NI_AVX512F_BroadcastVector128ToVector512, - CORINFO_TYPE_UINT, 64); - BlockRange().InsertBefore(node, broadcast128); - node->Data() = broadcast128; - LowerNode(broadcast128); - return true; + return; } - return false; + + BlockRange().InsertBefore(node, broadcast); + node->Data() = broadcast; + LowerNode(broadcast); } //---------------------------------------------------------------------------------------------- @@ -8769,7 +8822,9 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_AVX2_BroadcastVector128ToVector256: case NI_AVX512F_BroadcastVector128ToVector512: + case 
NI_AVX512F_BroadcastVector256ToVector512: { if (node->OperIsMemoryLoad()) { From 53529986396618027c943985588f2d46182b75be Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 20 Sep 2023 18:02:55 -0700 Subject: [PATCH 7/8] apply format patch --- src/coreclr/jit/lowerxarch.cpp | 46 ++++++++++++++++------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 61efd510f04966..85d92cc298016e 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -104,14 +104,15 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) ContainCheckStoreIndir(node); #if defined(FEATURE_HW_INTRINSICS) - if (comp->IsBaselineVector512IsaSupportedOpportunistically() || comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + if (comp->IsBaselineVector512IsaSupportedOpportunistically() || + comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) { if (!node->Data()->OperIs(GT_CNS_VEC)) { return; } - if(!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32) && !node->Data()->AsVecCon()->TypeIs(TYP_SIMD64)) + if (!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32) && !node->Data()->AsVecCon()->TypeIs(TYP_SIMD64)) { return; } @@ -8562,13 +8563,13 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, void Lowering::TryCompressConstVecData(GenTreeStoreInd* node) { assert(node->Data()->OperIs(GT_CNS_VEC)); - GenTreeVecCon* vecCon = node->Data()->AsVecCon(); + GenTreeVecCon* vecCon = node->Data()->AsVecCon(); GenTreeHWIntrinsic* broadcast = nullptr; - if(vecCon->TypeIs(TYP_SIMD32)) - { + if (vecCon->TypeIs(TYP_SIMD32)) + { assert(comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)); - if(vecCon->gtSimd32Val.v128[0] == vecCon->gtSimdVal.v128[1]) + if (vecCon->gtSimd32Val.v128[0] == vecCon->gtSimdVal.v128[1]) { simd16_t simd16Val = {}; simd16Val.f64[0] = vecCon->gtSimd32Val.f64[0]; @@ -8577,18 +8578,17 @@ void Lowering::TryCompressConstVecData(GenTreeStoreInd* node) memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); BlockRange().InsertBefore(node->Data(), compressedVecCon); BlockRange().Remove(vecCon); - broadcast = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, compressedVecCon, NI_AVX2_BroadcastVector128ToVector256, - CORINFO_TYPE_UINT, 32); + broadcast = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, compressedVecCon, + NI_AVX2_BroadcastVector128ToVector256, CORINFO_TYPE_UINT, 32); } - } - else - { + } + else + { assert(vecCon->TypeIs(TYP_SIMD64)); assert(comp->IsBaselineVector512IsaSupportedOpportunistically()); if (vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[1] && - vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[2] && - vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[3]) + vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[2] && + vecCon->gtSimd64Val.v128[0] == vecCon->gtSimd64Val.v128[3]) { simd16_t simd16Val = {}; simd16Val.f64[0] = vecCon->gtSimd64Val.f64[0]; @@ -8597,26 +8597,24 @@ void Lowering::TryCompressConstVecData(GenTreeStoreInd* node) memcpy(&compressedVecCon->gtSimdVal, &simd16Val, sizeof(simd16_t)); BlockRange().InsertBefore(node->Data(), compressedVecCon); BlockRange().Remove(vecCon); - broadcast = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, NI_AVX512F_BroadcastVector128ToVector512, - CORINFO_TYPE_UINT, 64); + broadcast = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, + NI_AVX512F_BroadcastVector128ToVector512, CORINFO_TYPE_UINT, 64); } - else if(vecCon->gtSimd64Val.v256[0] == 
vecCon->gtSimd64Val.v256[1]) + else if (vecCon->gtSimd64Val.v256[0] == vecCon->gtSimd64Val.v256[1]) { simd32_t simd32Val = {}; - simd32Val.v128[0] = vecCon->gtSimd32Val.v128[0]; - simd32Val.v128[1] = vecCon->gtSimd32Val.v128[1]; + simd32Val.v128[0] = vecCon->gtSimd32Val.v128[0]; + simd32Val.v128[1] = vecCon->gtSimd32Val.v128[1]; GenTreeVecCon* compressedVecCon = comp->gtNewVconNode(TYP_SIMD32); memcpy(&compressedVecCon->gtSimdVal, &simd32Val, sizeof(simd32_t)); BlockRange().InsertBefore(node->Data(), compressedVecCon); BlockRange().Remove(vecCon); - broadcast = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, NI_AVX512F_BroadcastVector256ToVector512, - CORINFO_TYPE_UINT, 64); + broadcast = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, compressedVecCon, + NI_AVX512F_BroadcastVector256ToVector512, CORINFO_TYPE_UINT, 64); } } - if(broadcast == nullptr) + if (broadcast == nullptr) { return; } From a01fd58d6675f255d0e1754a835e232ea1719838 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 25 Oct 2023 09:57:32 -0700 Subject: [PATCH 8/8] Resolve comment --- src/coreclr/jit/lowerxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 85d92cc298016e..954f795c1f7553 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -112,7 +112,7 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) return; } - if (!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32) && !node->Data()->AsVecCon()->TypeIs(TYP_SIMD64)) + if (!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32, TYP_SIMD64)) { return; }
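Taken together, patches 6-8 generalize the compression beyond the original `Broadcasti128` case: a TYP_SIMD32 constant whose two 128-bit lanes match is compressed via `NI_AVX2_BroadcastVector128ToVector256`, and a TYP_SIMD64 constant via `NI_AVX512F_BroadcastVector128ToVector512` when all four 128-bit lanes match, or via `NI_AVX512F_BroadcastVector256ToVector512` when only the two 256-bit halves match. The sketch below summarizes that final dispatch (plain C++, not JIT code; `Broadcast`, `LanesEqual`, and `ClassifyConstant` are hypothetical names, and the all-zeros/all-bits-set filter from patch 4 is assumed to have run first in `LowerStoreIndir`):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    enum class Broadcast
    {
        None,       // no repeating lane; keep the full constant
        V128ToV256, // vbroadcasti128/vbroadcastf128 (AVX2, TYP_SIMD32)
        V128ToV512, // vbroadcasti128/vbroadcastf128 (AVX-512, TYP_SIMD64)
        V256ToV512  // vbroadcasti64x4/vbroadcastf64x4 (AVX-512, TYP_SIMD64)
    };

    // True when the constant is a whole-number repetition of its first
    // laneSize bytes.
    static bool LanesEqual(const uint8_t* bytes, size_t size, size_t laneSize)
    {
        for (size_t off = laneSize; off < size; off += laneSize)
        {
            if (std::memcmp(bytes, bytes + off, laneSize) != 0)
            {
                return false;
            }
        }
        return true;
    }

    static Broadcast ClassifyConstant(const uint8_t* bytes, size_t size)
    {
        if (size == 32 && LanesEqual(bytes, size, 16))
        {
            return Broadcast::V128ToV256; // requires AVX2
        }
        if (size == 64 && LanesEqual(bytes, size, 16))
        {
            return Broadcast::V128ToV512; // requires the AVX-512 baseline
        }
        if (size == 64 && LanesEqual(bytes, size, 32))
        {
            return Broadcast::V256ToV512; // requires the AVX-512 baseline
        }
        return Broadcast::None;
    }

For example, a 64-byte constant whose two 32-byte halves match but whose 16-byte lanes differ classifies as V256ToV512, corresponding to the `else if (vecCon->gtSimd64Val.v256[0] == vecCon->gtSimd64Val.v256[1])` branch added in patch 6.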