From 2733b543128d7d32c23d59b513e63c876aba741d Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Tue, 3 Sep 2024 11:22:59 +0100
Subject: [PATCH 01/31] Add BuildConditionalSelectWithEmbeddedOp()

---
 src/coreclr/jit/lsra.h        |   3 +
 src/coreclr/jit/lsraarm64.cpp | 358 +++++++++++++++++++---------------
 2 files changed, 209 insertions(+), 152 deletions(-)

diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h
index 6ae08910afc10f..9e1acae39cfc75 100644
--- a/src/coreclr/jit/lsra.h
+++ b/src/coreclr/jit/lsra.h
@@ -2065,6 +2065,9 @@ class LinearScan : public LinearScanInterface
 #ifdef TARGET_ARM64
     int  BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwNode = nullptr);
     void BuildConsecutiveRegistersForDef(GenTree* treeNode, int fieldCount);
+    int  BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrinsicTree,
+                                              const HWIntrinsic   intrin,
+                                              int*                pDstCount);
 #endif // TARGET_ARM64
 #endif // FEATURE_HW_INTRINSICS

diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index 79bf3c16778df6..b733086f6f5693 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -1369,6 +1369,198 @@ int LinearScan::BuildNode(GenTree* tree)

 #include "hwintrinsic.h"

+//------------------------------------------------------------------------
+// BuildConditionalSelectWithEmbeddedOp: Set the NodeInfo for a GT_HWINTRINSIC ConditionalSelect tree with an embedded op.
+//
+// These need special handling as the ConditionalSelect and embedded op will be generated as a single instruction,
+// and possibly prefixed with a MOVPRFX.
+//
+// Arguments:
+//    intrinsicTree - The GT_HWINTRINSIC node of interest
+//    intrin        - The GT_HWINTRINSIC node as a HWIntrinsic
+//    pDstCount     - OUT parameter - the number of registers defined for the given node
+//
+// Return Value:
+//    The number of sources consumed by this node.
+//
+int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrinsicTree,
+                                                     const HWIntrinsic   intrin,
+                                                     int*                pDstCount)
+{
+    assert(intrin.id == NI_Sve_ConditionalSelect);
+    assert(HWIntrinsicInfo::IsMaskedOperation(intrin.id));
+    assert(HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id));
+    assert(!HWIntrinsicInfo::IsLowMaskedOperation(intrin.id));
+    assert(!HWIntrinsicInfo::IsLowVectorOperation(intrin.id));
+    assert(!HWIntrinsicInfo::IsMultiReg(intrin.id));
+    assert(intrin.op1 != nullptr);
+    assert(varTypeIsMask(intrin.op1->TypeGet()));
+    assert(intrin.op2 != nullptr);
+    assert(intrin.op2->OperIs(GT_HWINTRINSIC));
+    assert(intrin.op2->IsEmbMaskOp());
+    assert(!intrin.op2->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask));
+    assert(intrin.op3 != nullptr);
+    assert(intrin.op4 == nullptr);
+
+    int srcCount = 0;
+
+    // Handle Op1
+
+    bool             tgtPrefEmbOp2 = false;
+    SingleTypeRegSet predMask      = RBM_ALLMASK.GetPredicateRegSet();
+    if (intrin.id == NI_Sve_ConditionalSelect)
+    {
+        // If this is conditional select, make sure to check the embedded
+        // operation to determine the predicate mask.
+        assert(intrinsicTree->GetOperandCount() == 3);
+        assert(!HWIntrinsicInfo::IsLowMaskedOperation(intrin.id));
+
+        GenTreeHWIntrinsic* embOp2Node = intrin.op2->AsHWIntrinsic();
+        const HWIntrinsic   intrinEmb(embOp2Node);
+        if (HWIntrinsicInfo::IsLowMaskedOperation(intrinEmb.id))
+        {
+            predMask = RBM_LOWMASK.GetPredicateRegSet();
+        }
+
+        // Special-case, CreateBreakPropagateMask's op2 is the RMW node.
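+        // The embedded instruction reads and writes its second operand, so it is op2
+        // (rather than op1) that must be preferenced to the target register.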
+ if (intrinEmb.id == NI_Sve_CreateBreakPropagateMask) + { + assert(embOp2Node->isRMWHWIntrinsic(compiler)); + tgtPrefEmbOp2 = true; + } + } + + if (tgtPrefEmbOp2) + { + srcCount += BuildDelayFreeUses(intrin.op1, nullptr, predMask); + } + else + { + srcCount += BuildOperandUses(intrin.op1, predMask); + } + + // Handle Op2 and Op3 + + if ((intrin.op2->isRMWHWIntrinsic(compiler))) + { + // If the embedded operation has RMW semantics then record delay-free for operands as well as the "merge" value + GenTreeHWIntrinsic* embOp2Node = intrin.op2->AsHWIntrinsic(); + size_t embNumArgs = embOp2Node->GetOperandCount(); + const HWIntrinsic intrinEmb(embOp2Node); + GenTree* prefUseNode = nullptr; + + if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmb.id)) + { + assert(embNumArgs == 3); + + LIR::Use use; + GenTree* user = nullptr; + + if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embOp2Node, &use)) + { + user = use.User(); + } + unsigned resultOpNum = + embOp2Node->GetResultOpNumForRmwIntrinsic(user, intrinEmb.op1, intrinEmb.op2, intrinEmb.op3); + + if (resultOpNum == 0) + { + prefUseNode = embOp2Node->Op(1); + } + else + { + assert(resultOpNum >= 1 && resultOpNum <= 3); + prefUseNode = embOp2Node->Op(resultOpNum); + } + } + else + { + const bool embHasImmediateOperand = HWIntrinsicInfo::HasImmediateOperand(intrinEmb.id); + assert((embNumArgs == 1) || (embNumArgs == 2) || (embNumArgs == 3) || + (embHasImmediateOperand && (embNumArgs == 4))); + + // Special handling for embedded intrinsics with immediates: + // We might need an additional register to hold branch targets into the switch table + // that encodes the immediate + bool needsInternalRegister; + switch (intrinEmb.id) + { + case NI_Sve_ShiftRightArithmeticForDivide: + assert(embHasImmediateOperand); + assert(embNumArgs == 2); + needsInternalRegister = !embOp2Node->Op(2)->isContainedIntOrIImmed(); + break; + + case NI_Sve_AddRotateComplex: + assert(embHasImmediateOperand); + assert(embNumArgs == 3); + needsInternalRegister = !embOp2Node->Op(3)->isContainedIntOrIImmed(); + break; + + case NI_Sve_MultiplyAddRotateComplex: + assert(embHasImmediateOperand); + assert(embNumArgs == 4); + needsInternalRegister = !embOp2Node->Op(4)->isContainedIntOrIImmed(); + break; + + default: + assert(!embHasImmediateOperand); + needsInternalRegister = false; + break; + } + + if (needsInternalRegister) + { + buildInternalIntRegisterDefForNode(embOp2Node); + } + + size_t prefUseOpNum = 1; + if (intrinEmb.id == NI_Sve_CreateBreakPropagateMask) + { + prefUseOpNum = 2; + } + prefUseNode = embOp2Node->Op(prefUseOpNum); + } + + for (size_t argNum = 1; argNum <= embNumArgs; argNum++) + { + GenTree* node = embOp2Node->Op(argNum); + + if (node == prefUseNode) + { + tgtPrefUse = BuildUse(node); + srcCount++; + } + else + { + RefPosition* useRefPosition = nullptr; + + int uses = BuildDelayFreeUses(node, nullptr, RBM_NONE, &useRefPosition); + srcCount += uses; + + // It is a hard requirement that these are not allocated to the same register as the destination, + // so verify no optimizations kicked in to skip setting the delay-free. 
+ assert((useRefPosition != nullptr && useRefPosition->delayRegFree) || (uses == 0)); + } + } + + srcCount += BuildDelayFreeUses(intrin.op3, prefUseNode); + } + else + { + srcCount += BuildOperandUses(intrin.op2); + srcCount += BuildOperandUses(intrin.op3); + } + + buildInternalRegisterUses(); + + BuildDef(intrinsicTree); + + *pDstCount = 1; + return srcCount; +} + //------------------------------------------------------------------------ // BuildHWIntrinsic: Set the NodeInfo for a GT_HWINTRINSIC tree. // @@ -1584,6 +1776,12 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } } + // ConditionalSelect with embedded masked operations require special handling + if ((intrin.id == NI_Sve_ConditionalSelect) && (intrin.op2->IsEmbMaskOp())) + { + return BuildConditionalSelectWithEmbeddedOp(intrinsicTree, intrin, pDstCount); + } + // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it // is not allocated the same register as the target. const bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); @@ -1680,40 +1878,14 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } else { - bool tgtPrefEmbOp2 = false; - SingleTypeRegSet predMask = RBM_ALLMASK.GetPredicateRegSet(); - if (intrin.id == NI_Sve_ConditionalSelect) - { - // If this is conditional select, make sure to check the embedded - // operation to determine the predicate mask. - assert(intrinsicTree->GetOperandCount() == 3); - assert(!HWIntrinsicInfo::IsLowMaskedOperation(intrin.id)); - - if (intrin.op2->OperIs(GT_HWINTRINSIC)) - { - GenTreeHWIntrinsic* embOp2Node = intrin.op2->AsHWIntrinsic(); - const HWIntrinsic intrinEmb(embOp2Node); - if (HWIntrinsicInfo::IsLowMaskedOperation(intrinEmb.id)) - { - predMask = RBM_LOWMASK.GetPredicateRegSet(); - } + SingleTypeRegSet predMask = RBM_ALLMASK.GetPredicateRegSet(); - // Special-case, CreateBreakPropagateMask's op2 is the RMW node. 
- if (intrinEmb.id == NI_Sve_CreateBreakPropagateMask) - { - assert(embOp2Node->isRMWHWIntrinsic(compiler)); - assert(!tgtPrefOp1); - assert(!tgtPrefOp2); - tgtPrefEmbOp2 = true; - } - } - } - else if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id)) + if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id)) { predMask = RBM_LOWMASK.GetPredicateRegSet(); } - if (tgtPrefOp2 || tgtPrefEmbOp2) + if (tgtPrefOp2) { assert(!tgtPrefOp1); srcCount += BuildDelayFreeUses(intrin.op1, nullptr, predMask); @@ -1733,13 +1905,13 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou tgtPrefUse = BuildUse(intrin.op1); srcCount++; } - else if ((intrin.id != NI_AdvSimd_VectorTableLookup) && (intrin.id != NI_AdvSimd_Arm64_VectorTableLookup)) + else if ((intrin.id == NI_AdvSimd_VectorTableLookup) || (intrin.id == NI_AdvSimd_Arm64_VectorTableLookup)) { - srcCount += BuildOperandUses(intrin.op1); + srcCount += BuildConsecutiveRegistersForUse(intrin.op1); } else { - srcCount += BuildConsecutiveRegistersForUse(intrin.op1); + srcCount += BuildOperandUses(intrin.op1); } } @@ -1934,116 +2106,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou return srcCount; } - else if ((intrin.id == NI_Sve_ConditionalSelect) && (intrin.op2->IsEmbMaskOp()) && - (intrin.op2->isRMWHWIntrinsic(compiler))) - { - assert(intrin.op3 != nullptr); - - // For ConditionalSelect, if there is an embedded operation, and the operation has RMW semantics - // then record delay-free for operands as well as the "merge" value - GenTreeHWIntrinsic* embOp2Node = intrin.op2->AsHWIntrinsic(); - size_t numArgs = embOp2Node->GetOperandCount(); - const HWIntrinsic intrinEmb(embOp2Node); - numArgs = embOp2Node->GetOperandCount(); - GenTree* prefUseNode = nullptr; - - if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmb.id)) - { - assert(embOp2Node->isRMWHWIntrinsic(compiler)); - assert(numArgs == 3); - - LIR::Use use; - GenTree* user = nullptr; - - if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embOp2Node, &use)) - { - user = use.User(); - } - unsigned resultOpNum = - embOp2Node->GetResultOpNumForRmwIntrinsic(user, intrinEmb.op1, intrinEmb.op2, intrinEmb.op3); - - if (resultOpNum == 0) - { - prefUseNode = embOp2Node->Op(1); - } - else - { - assert(resultOpNum >= 1 && resultOpNum <= 3); - prefUseNode = embOp2Node->Op(resultOpNum); - } - } - else - { - const bool embHasImmediateOperand = HWIntrinsicInfo::HasImmediateOperand(intrinEmb.id); - assert((numArgs == 1) || (numArgs == 2) || (numArgs == 3) || (embHasImmediateOperand && (numArgs == 4))); - - // Special handling for embedded intrinsics with immediates: - // We might need an additional register to hold branch targets into the switch table - // that encodes the immediate - bool needsInternalRegister; - switch (intrinEmb.id) - { - case NI_Sve_ShiftRightArithmeticForDivide: - assert(embHasImmediateOperand); - assert(numArgs == 2); - needsInternalRegister = !embOp2Node->Op(2)->isContainedIntOrIImmed(); - break; - - case NI_Sve_AddRotateComplex: - assert(embHasImmediateOperand); - assert(numArgs == 3); - needsInternalRegister = !embOp2Node->Op(3)->isContainedIntOrIImmed(); - break; - - case NI_Sve_MultiplyAddRotateComplex: - assert(embHasImmediateOperand); - assert(numArgs == 4); - needsInternalRegister = !embOp2Node->Op(4)->isContainedIntOrIImmed(); - break; - - default: - assert(!embHasImmediateOperand); - needsInternalRegister = false; - break; - } - - if (needsInternalRegister) - { - buildInternalIntRegisterDefForNode(embOp2Node); - } - - size_t 
prefUseOpNum = 1; - if (intrinEmb.id == NI_Sve_CreateBreakPropagateMask) - { - prefUseOpNum = 2; - } - prefUseNode = embOp2Node->Op(prefUseOpNum); - } - - for (size_t argNum = 1; argNum <= numArgs; argNum++) - { - GenTree* node = embOp2Node->Op(argNum); - - if (node == prefUseNode) - { - tgtPrefUse = BuildUse(node); - srcCount++; - } - else - { - RefPosition* useRefPosition = nullptr; - - int uses = BuildDelayFreeUses(node, nullptr, RBM_NONE, &useRefPosition); - srcCount += uses; - - // It is a hard requirement that these are not allocated to the same register as the destination, - // so verify no optimizations kicked in to skip setting the delay-free. - assert((useRefPosition != nullptr && useRefPosition->delayRegFree) || (uses == 0)); - } - } - - srcCount += BuildDelayFreeUses(intrin.op3, prefUseNode); - } else if (intrin.op2 != nullptr) { // RMW intrinsic operands doesn't have to be delayFree when they can be assigned the same register as op1Reg @@ -2051,7 +2113,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(intrin.op1 != nullptr); - bool forceOp2DelayFree = false; SingleTypeRegSet lowVectorCandidates = RBM_NONE; size_t lowVectorOperandNum = 0; @@ -2133,15 +2194,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou candidates = RBM_ALLMASK.GetPredicateRegSet(); } - if (forceOp2DelayFree) - { - srcCount += BuildDelayFreeUses(intrin.op2, nullptr, candidates); - } - else - { - srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, intrin.op1, candidates) - : BuildOperandUses(intrin.op2, candidates); - } + srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, intrin.op1, candidates) + : BuildOperandUses(intrin.op2, candidates); } break; } From 9c75bebde955aed921e2a974490694fa39c8f378 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 3 Sep 2024 12:10:26 +0100 Subject: [PATCH 02/31] Add GetRMWOp() --- src/coreclr/jit/gentree.cpp | 102 +++++++++++++++++++++++++++++++++- src/coreclr/jit/gentree.h | 4 ++ src/coreclr/jit/lsraarm64.cpp | 99 ++++++--------------------------- 3 files changed, 122 insertions(+), 83 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index c60bff8c10202b..0806d64f1667ac 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -30297,7 +30297,107 @@ unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree return 0; } -#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS + +#if defined(TARGET_ARM64) +//------------------------------------------------------------------------ +// GetDelayFreeOp: Get the delay free characteristics of the HWIntrinsic +// +// For a RMW intrinsic preference the RMW operand to the target. +// For a simple move semantic between two SIMD registers, then preference the source operand. 
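+// (For example, NI_Vector128_GetLower is a simple move between two SIMD registers, so its op1 is
+// preferenced to the destination.)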
+// +// Arguments: +// compiler - the compiler instance +// isRMW - Set to true if is a RMW node +// delayFreeMultiple - Set to true if there are multiple values in the delay free operand +// +// Return Value: +// The operand that needs to be delay freed +// +GenTree* GenTreeHWIntrinsic::GetDelayFreeOp(Compiler* compiler, bool* isRMW, bool* delayFreeMultiple) +{ + *isRMW = isRMWHWIntrinsic(compiler); + *delayFreeMultiple = false; + + const NamedIntrinsic intrinsicId = GetHWIntrinsicId(); + GenTree* delayFreeOp = nullptr; + + switch (intrinsicId) + { + case NI_Vector64_CreateScalarUnsafe: + case NI_Vector128_CreateScalarUnsafe: + if (varTypeIsFloating(Op(1))) + { + delayFreeOp = Op(1); + } + break; + + case NI_AdvSimd_Arm64_DuplicateToVector64: + if (Op(1)->TypeGet() == TYP_DOUBLE) + { + delayFreeOp = Op(1); + } + break; + + case NI_Vector64_ToScalar: + case NI_Vector128_ToScalar: + if (varTypeIsFloating(this)) + { + delayFreeOp = Op(1); + } + break; + + case NI_Vector64_ToVector128Unsafe: + case NI_Vector128_AsVector128Unsafe: + case NI_Vector128_AsVector3: + case NI_Vector128_GetLower: + delayFreeOp = Op(1); + break; + + case NI_AdvSimd_LoadAndInsertScalarVector64x2: + case NI_AdvSimd_LoadAndInsertScalarVector64x3: + case NI_AdvSimd_LoadAndInsertScalarVector64x4: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: + assert(*isRMW); + delayFreeOp = Op(1); + *delayFreeMultiple = true; + break; + + case NI_Sve_CreateBreakPropagateMask: + // RMW operates on the second op. + assert(*isRMW); + assert(GetOperandCount() >= 2); + delayFreeOp = Op(2); + break; + + default: + if (*isRMW) + { + if (HWIntrinsicInfo::IsExplicitMaskedOperation(intrinsicId)) + { + // First operand contains the mask, RMW op is the second one. + delayFreeOp = Op(2); + } + else + { + delayFreeOp = Op(1); + } + } + break; + } + + if (delayFreeOp != nullptr) + { + // Only preference the delay op if it is not contained. + delayFreeOp = delayFreeOp->isContained() ? nullptr : delayFreeOp; + } + + return delayFreeOp; +} +#endif // TARGET_ARM64 + +#endif // (TARGET_XARCH || TARGET_ARM64) && FEATURE_HW_INTRINSICS unsigned GenTreeLclFld::GetSize() const { diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index e791d2fd064d3c..6cbf12d2f4e7bb 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6547,6 +6547,10 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic unsigned GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3); +#if defined(TARGET_ARM64) + GenTree* GetDelayFreeOp(Compiler* compiler, bool* isRMW, bool* delayFreeMultiple); +#endif // defined(TARGET_ARM64) + ClassLayout* GetLayout(Compiler* compiler) const; NamedIntrinsic GetHWIntrinsicId() const; diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index b733086f6f5693..5e692db8c86cc7 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1782,84 +1782,16 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou return BuildConditionalSelectWithEmbeddedOp(intrinsicTree, intrin, pDstCount); } - // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it + // Determine whether this is an operation where an op must be marked delayFree so that it // is not allocated the same register as the target. 
- const bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); + bool delayFreeMultiple = false; + bool isRMW = false; + GenTree* delayFreeOp = intrinsicTree->GetDelayFreeOp(compiler, &isRMW, &delayFreeMultiple); - bool tgtPrefOp1 = false; - bool tgtPrefOp2 = false; - bool delayFreeMultiple = false; if (intrin.op1 != nullptr) { - bool simdRegToSimdRegMove = false; - - switch (intrin.id) - { - case NI_Vector64_CreateScalarUnsafe: - case NI_Vector128_CreateScalarUnsafe: - { - simdRegToSimdRegMove = varTypeIsFloating(intrin.op1); - break; - } - - case NI_AdvSimd_Arm64_DuplicateToVector64: - { - simdRegToSimdRegMove = (intrin.op1->TypeGet() == TYP_DOUBLE); - break; - } - - case NI_Vector64_ToScalar: - case NI_Vector128_ToScalar: - { - simdRegToSimdRegMove = varTypeIsFloating(intrinsicTree); - break; - } - - case NI_Vector64_ToVector128Unsafe: - case NI_Vector128_AsVector128Unsafe: - case NI_Vector128_AsVector3: - case NI_Vector128_GetLower: - { - simdRegToSimdRegMove = true; - break; - } - case NI_AdvSimd_LoadAndInsertScalarVector64x2: - case NI_AdvSimd_LoadAndInsertScalarVector64x3: - case NI_AdvSimd_LoadAndInsertScalarVector64x4: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: - { - delayFreeMultiple = true; - break; - } - - default: - { - break; - } - } - - // If we have an RMW intrinsic or an intrinsic with simple move semantic between two SIMD registers, - // we want to preference op1Reg to the target if op1 is not contained. - - if ((isRMW || simdRegToSimdRegMove)) - { - if (HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id)) - { - assert(!simdRegToSimdRegMove); - // Prefer op2Reg for the masked operation as mask would be the op1Reg - tgtPrefOp2 = !intrin.op1->isContained(); - } - else - { - tgtPrefOp1 = !intrin.op1->isContained(); - } - } - if (delayFreeMultiple) { - assert(isRMW); assert(intrin.op1->OperIs(GT_FIELD_LIST)); GenTreeFieldList* op1 = intrin.op1->AsFieldList(); assert(compiler->info.compNeedsConsecutiveRegisters); @@ -1885,9 +1817,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou predMask = RBM_LOWMASK.GetPredicateRegSet(); } - if (tgtPrefOp2) + if ((delayFreeOp == intrin.op2) && (intrin.op2 != nullptr)) { - assert(!tgtPrefOp1); srcCount += BuildDelayFreeUses(intrin.op1, nullptr, predMask); } else @@ -1900,7 +1831,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { srcCount += BuildAddrUses(intrin.op1); } - else if (tgtPrefOp1) + else if (delayFreeOp == intrin.op1) { tgtPrefUse = BuildUse(intrin.op1); srcCount++; @@ -1989,7 +1920,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(intrin.op3 != nullptr); assert(isRMW); srcCount += BuildConsecutiveRegistersForUse(intrin.op2, intrin.op1); - srcCount += BuildDelayFreeUses(intrin.op3, intrin.op1); + srcCount += BuildDelayFreeUses(intrin.op3, delayFreeOp); assert(dstCount == 1); buildInternalRegisterUses(); BuildDef(intrinsicTree); @@ -2121,7 +2052,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou getLowVectorOperandAndCandidates(intrin, &lowVectorOperandNum, &lowVectorCandidates); } - if (tgtPrefOp2) + if (delayFreeOp == intrin.op2) { if (!intrin.op2->isContained()) { @@ -2186,6 +2117,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou default: { + assert(delayFreeOp != intrin.op2); SingleTypeRegSet candidates = 
lowVectorOperandNum == 2 ? lowVectorCandidates : RBM_NONE; if (intrin.op2->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask)) @@ -2194,7 +2126,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou candidates = RBM_ALLMASK.GetPredicateRegSet(); } - srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, intrin.op1, candidates) + srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, delayFreeOp, candidates) : BuildOperandUses(intrin.op2, candidates); } break; @@ -2203,11 +2135,12 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou if (intrin.op3 != nullptr) { + assert(delayFreeOp != intrin.op3); SingleTypeRegSet candidates = lowVectorOperandNum == 3 ? lowVectorCandidates : RBM_NONE; if (isRMW) { - srcCount += BuildDelayFreeUses(intrin.op3, (tgtPrefOp2 ? intrin.op2 : intrin.op1), candidates); + srcCount += BuildDelayFreeUses(intrin.op3, delayFreeOp, candidates); } else { @@ -2217,13 +2150,15 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou if (intrin.op4 != nullptr) { assert(lowVectorOperandNum != 4); - assert(!tgtPrefOp2); - srcCount += isRMW ? BuildDelayFreeUses(intrin.op4, intrin.op1) : BuildOperandUses(intrin.op4); + assert(delayFreeOp != intrin.op4); + + srcCount += isRMW ? BuildDelayFreeUses(intrin.op4, delayFreeOp) : BuildOperandUses(intrin.op4); if (intrin.op5 != nullptr) { assert(isRMW); - srcCount += BuildDelayFreeUses(intrin.op5, intrin.op1); + assert(delayFreeOp != intrin.op5); + srcCount += BuildDelayFreeUses(intrin.op5, delayFreeOp); } } } From bb908a55cce450d78797925733a162fab6d4aad4 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 3 Sep 2024 15:08:52 +0100 Subject: [PATCH 03/31] Use GetDelayFreeOp() in BuildConditionalSelectWithEmbeddedOp() --- src/coreclr/jit/lsraarm64.cpp | 97 ++++++++++++----------------------- 1 file changed, 34 insertions(+), 63 deletions(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 5e692db8c86cc7..9828d63b6f4a91 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1405,33 +1405,25 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins int srcCount = 0; + GenTreeHWIntrinsic* embeddedOpNode = intrin.op2->AsHWIntrinsic(); + const HWIntrinsic intrinEmbedded(embeddedOpNode); + size_t embNumArgs = embeddedOpNode->GetOperandCount(); + + // Determine whether this the embedded operation requires delay free + bool embeddedIsRMW = false; + bool embeddedDelayFreeMultiple = false; + GenTree* embeddedDelayFreeOp = embeddedOpNode->GetDelayFreeOp(compiler, &embeddedIsRMW, &embeddedDelayFreeMultiple); + assert(!embeddedDelayFreeMultiple); + // Handle Op1 - bool tgtPrefEmbOp2 = false; - SingleTypeRegSet predMask = RBM_ALLMASK.GetPredicateRegSet(); - if (intrin.id == NI_Sve_ConditionalSelect) + SingleTypeRegSet predMask = RBM_ALLMASK.GetPredicateRegSet(); + if (HWIntrinsicInfo::IsLowMaskedOperation(intrinEmbedded.id)) { - // If this is conditional select, make sure to check the embedded - // operation to determine the predicate mask. - assert(intrinsicTree->GetOperandCount() == 3); - assert(!HWIntrinsicInfo::IsLowMaskedOperation(intrin.id)); - - GenTreeHWIntrinsic* embOp2Node = intrin.op2->AsHWIntrinsic(); - const HWIntrinsic intrinEmb(embOp2Node); - if (HWIntrinsicInfo::IsLowMaskedOperation(intrinEmb.id)) - { - predMask = RBM_LOWMASK.GetPredicateRegSet(); - } - - // Special-case, CreateBreakPropagateMask's op2 is the RMW node. 
- if (intrinEmb.id == NI_Sve_CreateBreakPropagateMask) - { - assert(embOp2Node->isRMWHWIntrinsic(compiler)); - tgtPrefEmbOp2 = true; - } + predMask = RBM_LOWMASK.GetPredicateRegSet(); } - if (tgtPrefEmbOp2) + if (embeddedDelayFreeOp != nullptr) { srcCount += BuildDelayFreeUses(intrin.op1, nullptr, predMask); } @@ -1442,92 +1434,71 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins // Handle Op2 and Op3 - if ((intrin.op2->isRMWHWIntrinsic(compiler))) + if (embeddedIsRMW) { - // If the embedded operation has RMW semantics then record delay-free for operands as well as the "merge" value - GenTreeHWIntrinsic* embOp2Node = intrin.op2->AsHWIntrinsic(); - size_t embNumArgs = embOp2Node->GetOperandCount(); - const HWIntrinsic intrinEmb(embOp2Node); - GenTree* prefUseNode = nullptr; + // If the embedded operation has RMW semantics then record delay-free for the "merge" value - if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmb.id)) + if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmbedded.id)) { assert(embNumArgs == 3); + // For FMA intrinsics, the arguments may get switched around during codegen. + // Figure out where the delay free op will be. + LIR::Use use; GenTree* user = nullptr; - if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embOp2Node, &use)) + if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embeddedOpNode, &use)) { user = use.User(); } unsigned resultOpNum = - embOp2Node->GetResultOpNumForRmwIntrinsic(user, intrinEmb.op1, intrinEmb.op2, intrinEmb.op3); + embeddedOpNode->GetResultOpNumForRmwIntrinsic(user, intrinEmbedded.op1, intrinEmbedded.op2, intrinEmbedded.op3); - if (resultOpNum == 0) - { - prefUseNode = embOp2Node->Op(1); - } - else + if (resultOpNum != 0) { - assert(resultOpNum >= 1 && resultOpNum <= 3); - prefUseNode = embOp2Node->Op(resultOpNum); + embeddedDelayFreeOp = embeddedOpNode->Op(resultOpNum); } } - else + else if (HWIntrinsicInfo::HasImmediateOperand(intrinEmbedded.id)) { - const bool embHasImmediateOperand = HWIntrinsicInfo::HasImmediateOperand(intrinEmb.id); - assert((embNumArgs == 1) || (embNumArgs == 2) || (embNumArgs == 3) || - (embHasImmediateOperand && (embNumArgs == 4))); - // Special handling for embedded intrinsics with immediates: // We might need an additional register to hold branch targets into the switch table // that encodes the immediate bool needsInternalRegister; - switch (intrinEmb.id) + switch (intrinEmbedded.id) { case NI_Sve_ShiftRightArithmeticForDivide: - assert(embHasImmediateOperand); assert(embNumArgs == 2); - needsInternalRegister = !embOp2Node->Op(2)->isContainedIntOrIImmed(); + needsInternalRegister = !embeddedOpNode->Op(2)->isContainedIntOrIImmed(); break; case NI_Sve_AddRotateComplex: - assert(embHasImmediateOperand); assert(embNumArgs == 3); - needsInternalRegister = !embOp2Node->Op(3)->isContainedIntOrIImmed(); + needsInternalRegister = !embeddedOpNode->Op(3)->isContainedIntOrIImmed(); break; case NI_Sve_MultiplyAddRotateComplex: - assert(embHasImmediateOperand); assert(embNumArgs == 4); - needsInternalRegister = !embOp2Node->Op(4)->isContainedIntOrIImmed(); + needsInternalRegister = !embeddedOpNode->Op(4)->isContainedIntOrIImmed(); break; default: - assert(!embHasImmediateOperand); needsInternalRegister = false; break; } if (needsInternalRegister) { - buildInternalIntRegisterDefForNode(embOp2Node); + buildInternalIntRegisterDefForNode(embeddedOpNode); } - - size_t prefUseOpNum = 1; - if (intrinEmb.id == NI_Sve_CreateBreakPropagateMask) - { - prefUseOpNum = 2; - } - prefUseNode = 
embOp2Node->Op(prefUseOpNum); } for (size_t argNum = 1; argNum <= embNumArgs; argNum++) { - GenTree* node = embOp2Node->Op(argNum); + GenTree* node = embeddedOpNode->Op(argNum); - if (node == prefUseNode) + if (node == embeddedDelayFreeOp) { tgtPrefUse = BuildUse(node); srcCount++; @@ -1545,7 +1516,7 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins } } - srcCount += BuildDelayFreeUses(intrin.op3, prefUseNode); + srcCount += BuildDelayFreeUses(intrin.op3, embeddedDelayFreeOp); } else { @@ -1784,8 +1755,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // Determine whether this is an operation where an op must be marked delayFree so that it // is not allocated the same register as the target. - bool delayFreeMultiple = false; bool isRMW = false; + bool delayFreeMultiple = false; GenTree* delayFreeOp = intrinsicTree->GetDelayFreeOp(compiler, &isRMW, &delayFreeMultiple); if (intrin.op1 != nullptr) From 23e08ff85222230744ed665ff17ffc820ea89388 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Sep 2024 10:59:51 +0100 Subject: [PATCH 04/31] simplify op2 handling --- src/coreclr/jit/lsraarm64.cpp | 147 +++++++++++++++++----------------- 1 file changed, 72 insertions(+), 75 deletions(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 9828d63b6f4a91..f5787f087c6ca8 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1788,7 +1788,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou predMask = RBM_LOWMASK.GetPredicateRegSet(); } - if ((delayFreeOp == intrin.op2) && (intrin.op2 != nullptr)) + if ((delayFreeOp != nullptr) && (delayFreeOp != intrin.op1)) { srcCount += BuildDelayFreeUses(intrin.op1, nullptr, predMask); } @@ -2023,99 +2023,96 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou getLowVectorOperandAndCandidates(intrin, &lowVectorOperandNum, &lowVectorCandidates); } - if (delayFreeOp == intrin.op2) + SingleTypeRegSet op2Candidates = lowVectorOperandNum == 2 ? 
lowVectorCandidates : RBM_NONE; + if (varTypeIsMask(intrin.op2->TypeGet())) { - if (!intrin.op2->isContained()) - { - assert(tgtPrefUse == nullptr); - tgtPrefUse2 = BuildUse(intrin.op2); - srcCount++; - } - else - { - srcCount += BuildOperandUses(intrin.op2); - } + assert(lowVectorOperandNum != 2); + op2Candidates = RBM_ALLMASK.GetPredicateRegSet(); } - else + + bool buildAddrUses = false; + switch (intrin.id) { - switch (intrin.id) - { - case NI_Sve_LoadVectorNonTemporal: - case NI_Sve_LoadVector128AndReplicateToVector: - case NI_Sve_StoreAndZip: - assert(intrinsicTree->OperIsMemoryLoadOrStore()); - srcCount += BuildAddrUses(intrin.op2); - break; + case NI_Sve_LoadVectorNonTemporal: + case NI_Sve_LoadVector128AndReplicateToVector: + case NI_Sve_StoreAndZip: + assert(intrinsicTree->OperIsMemoryLoadOrStore()); + buildAddrUses = true; + break; - case NI_Sve_GatherVector: - case NI_Sve_GatherVectorByteZeroExtend: - case NI_Sve_GatherVectorInt16SignExtend: - case NI_Sve_GatherVectorInt16SignExtendFirstFaulting: - case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend: - case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtendFirstFaulting: - case NI_Sve_GatherVectorInt32SignExtend: - case NI_Sve_GatherVectorInt32SignExtendFirstFaulting: - case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend: - case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtendFirstFaulting: - case NI_Sve_GatherVectorSByteSignExtend: - case NI_Sve_GatherVectorSByteSignExtendFirstFaulting: - case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend: - case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtendFirstFaulting: - case NI_Sve_GatherVectorUInt16ZeroExtend: - case NI_Sve_GatherVectorUInt16ZeroExtendFirstFaulting: - case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend: - case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtendFirstFaulting: - case NI_Sve_GatherVectorUInt32ZeroExtend: - case NI_Sve_GatherVectorUInt32ZeroExtendFirstFaulting: - case NI_Sve_GatherVectorWithByteOffsetFirstFaulting: - assert(intrinsicTree->OperIsMemoryLoadOrStore()); - FALLTHROUGH; - - case NI_Sve_PrefetchBytes: - case NI_Sve_PrefetchInt16: - case NI_Sve_PrefetchInt32: - case NI_Sve_PrefetchInt64: - case NI_Sve_GatherPrefetch8Bit: - case NI_Sve_GatherPrefetch16Bit: - case NI_Sve_GatherPrefetch32Bit: - case NI_Sve_GatherPrefetch64Bit: - if (!varTypeIsSIMD(intrin.op2->gtType)) - { - srcCount += BuildAddrUses(intrin.op2); - break; - } - FALLTHROUGH; + case NI_Sve_GatherVector: + case NI_Sve_GatherVectorByteZeroExtend: + case NI_Sve_GatherVectorInt16SignExtend: + case NI_Sve_GatherVectorInt16SignExtendFirstFaulting: + case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend: + case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtendFirstFaulting: + case NI_Sve_GatherVectorInt32SignExtend: + case NI_Sve_GatherVectorInt32SignExtendFirstFaulting: + case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend: + case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtendFirstFaulting: + case NI_Sve_GatherVectorSByteSignExtend: + case NI_Sve_GatherVectorSByteSignExtendFirstFaulting: + case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend: + case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtendFirstFaulting: + case NI_Sve_GatherVectorUInt16ZeroExtend: + case NI_Sve_GatherVectorUInt16ZeroExtendFirstFaulting: + case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend: + case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtendFirstFaulting: + case NI_Sve_GatherVectorUInt32ZeroExtend: + case NI_Sve_GatherVectorUInt32ZeroExtendFirstFaulting: + case 
NI_Sve_GatherVectorWithByteOffsetFirstFaulting: + assert(intrinsicTree->OperIsMemoryLoadOrStore()); + FALLTHROUGH; - default: + case NI_Sve_PrefetchBytes: + case NI_Sve_PrefetchInt16: + case NI_Sve_PrefetchInt32: + case NI_Sve_PrefetchInt64: + case NI_Sve_GatherPrefetch8Bit: + case NI_Sve_GatherPrefetch16Bit: + case NI_Sve_GatherPrefetch32Bit: + case NI_Sve_GatherPrefetch64Bit: + if (!varTypeIsSIMD(intrin.op2->gtType)) { - assert(delayFreeOp != intrin.op2); - SingleTypeRegSet candidates = lowVectorOperandNum == 2 ? lowVectorCandidates : RBM_NONE; - - if (intrin.op2->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask)) - { - assert(lowVectorOperandNum != 2); - candidates = RBM_ALLMASK.GetPredicateRegSet(); - } - - srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, delayFreeOp, candidates) - : BuildOperandUses(intrin.op2, candidates); + buildAddrUses = true; } break; - } + + default: + break; + } + + if (buildAddrUses) + { + assert(delayFreeOp != intrin.op2); + assert(op2Candidates == RBM_NONE); + srcCount += BuildAddrUses(intrin.op2); + } + else if (delayFreeOp == intrin.op2) + { + assert(!intrin.op2->isContained()); + assert(tgtPrefUse == nullptr); + tgtPrefUse2 = BuildUse(intrin.op2, op2Candidates); + srcCount++; + } + else + { + srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, delayFreeOp, op2Candidates) + : BuildOperandUses(intrin.op2, op2Candidates); } if (intrin.op3 != nullptr) { assert(delayFreeOp != intrin.op3); - SingleTypeRegSet candidates = lowVectorOperandNum == 3 ? lowVectorCandidates : RBM_NONE; + SingleTypeRegSet op3Candidates = lowVectorOperandNum == 3 ? lowVectorCandidates : RBM_NONE; if (isRMW) { - srcCount += BuildDelayFreeUses(intrin.op3, delayFreeOp, candidates); + srcCount += BuildDelayFreeUses(intrin.op3, delayFreeOp, op3Candidates); } else { - srcCount += BuildOperandUses(intrin.op3, candidates); + srcCount += BuildOperandUses(intrin.op3, op3Candidates); } if (intrin.op4 != nullptr) From 866bacdf78c7fbb1a98dfe8712de4b0c0c8b517b Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Sep 2024 11:41:08 +0100 Subject: [PATCH 05/31] Add getVectorAddrOperand() --- src/coreclr/jit/lsra.h | 1 + src/coreclr/jit/lsraarm64.cpp | 135 ++++++++++++++++++++-------------- 2 files changed, 79 insertions(+), 57 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 9e1acae39cfc75..155dced4d4e209 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1445,6 +1445,7 @@ class LinearScan : public LinearScanInterface } FORCEINLINE RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition); void getLowVectorOperandAndCandidates(HWIntrinsic intrin, size_t* operandNum, SingleTypeRegSet* candidates); + GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); #endif #ifdef DEBUG diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index f5787f087c6ca8..f720432f917053 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1759,6 +1759,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou bool delayFreeMultiple = false; GenTree* delayFreeOp = intrinsicTree->GetDelayFreeOp(compiler, &isRMW, &delayFreeMultiple); + // Determine whether this is an operation where one of the ops is an address + GenTree* addrOp = LinearScan::getVectorAddrOperand(intrinsicTree); + + if (intrin.op1 != nullptr) { if (delayFreeMultiple) @@ -1766,6 +1770,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(intrin.op1->OperIs(GT_FIELD_LIST)); 
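+            // The operand carries multiple values packed into a single FIELD_LIST; each field is
+            // built as its own use in the loop below so every value in the tuple stays delay-free.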
GenTreeFieldList* op1 = intrin.op1->AsFieldList(); assert(compiler->info.compNeedsConsecutiveRegisters); + assert(addrOp != intrin.op1); + assert(delayFreeOp != intrin.op1); for (GenTreeFieldList::Use& use : op1->Uses()) { @@ -1775,6 +1781,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } else if (HWIntrinsicInfo::IsMaskedOperation(intrin.id)) { + assert(addrOp != intrin.op1); + if (!varTypeIsMask(intrin.op1->TypeGet()) && !HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id)) { srcCount += BuildOperandUses(intrin.op1); @@ -1798,8 +1806,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } } } - else if (intrinsicTree->OperIsMemoryLoadOrStore()) + else if (addrOp == intrin.op1) { + assert(delayFreeOp != intrin.op1); + srcCount += BuildAddrUses(intrin.op1); } else if (delayFreeOp == intrin.op1) @@ -2030,83 +2040,43 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou op2Candidates = RBM_ALLMASK.GetPredicateRegSet(); } - bool buildAddrUses = false; - switch (intrin.id) - { - case NI_Sve_LoadVectorNonTemporal: - case NI_Sve_LoadVector128AndReplicateToVector: - case NI_Sve_StoreAndZip: - assert(intrinsicTree->OperIsMemoryLoadOrStore()); - buildAddrUses = true; - break; - - case NI_Sve_GatherVector: - case NI_Sve_GatherVectorByteZeroExtend: - case NI_Sve_GatherVectorInt16SignExtend: - case NI_Sve_GatherVectorInt16SignExtendFirstFaulting: - case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend: - case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtendFirstFaulting: - case NI_Sve_GatherVectorInt32SignExtend: - case NI_Sve_GatherVectorInt32SignExtendFirstFaulting: - case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend: - case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtendFirstFaulting: - case NI_Sve_GatherVectorSByteSignExtend: - case NI_Sve_GatherVectorSByteSignExtendFirstFaulting: - case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend: - case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtendFirstFaulting: - case NI_Sve_GatherVectorUInt16ZeroExtend: - case NI_Sve_GatherVectorUInt16ZeroExtendFirstFaulting: - case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend: - case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtendFirstFaulting: - case NI_Sve_GatherVectorUInt32ZeroExtend: - case NI_Sve_GatherVectorUInt32ZeroExtendFirstFaulting: - case NI_Sve_GatherVectorWithByteOffsetFirstFaulting: - assert(intrinsicTree->OperIsMemoryLoadOrStore()); - FALLTHROUGH; - - case NI_Sve_PrefetchBytes: - case NI_Sve_PrefetchInt16: - case NI_Sve_PrefetchInt32: - case NI_Sve_PrefetchInt64: - case NI_Sve_GatherPrefetch8Bit: - case NI_Sve_GatherPrefetch16Bit: - case NI_Sve_GatherPrefetch32Bit: - case NI_Sve_GatherPrefetch64Bit: - if (!varTypeIsSIMD(intrin.op2->gtType)) - { - buildAddrUses = true; - } - break; - - default: - break; - } - - if (buildAddrUses) + if (addrOp == intrin.op2) { assert(delayFreeOp != intrin.op2); assert(op2Candidates == RBM_NONE); + srcCount += BuildAddrUses(intrin.op2); } else if (delayFreeOp == intrin.op2) { assert(!intrin.op2->isContained()); assert(tgtPrefUse == nullptr); + tgtPrefUse2 = BuildUse(intrin.op2, op2Candidates); srcCount++; } + else if (isRMW) + { + srcCount += BuildDelayFreeUses(intrin.op2, delayFreeOp, op2Candidates); + } else { - srcCount += isRMW ? 
BuildDelayFreeUses(intrin.op2, delayFreeOp, op2Candidates) - : BuildOperandUses(intrin.op2, op2Candidates); + srcCount += BuildOperandUses(intrin.op2, op2Candidates); } if (intrin.op3 != nullptr) { assert(delayFreeOp != intrin.op3); + SingleTypeRegSet op3Candidates = lowVectorOperandNum == 3 ? lowVectorCandidates : RBM_NONE; - if (isRMW) + if (addrOp == intrin.op3) + { + assert(op3Candidates == RBM_NONE); + + srcCount += BuildAddrUses(intrin.op3); + } + else if (isRMW) { srcCount += BuildDelayFreeUses(intrin.op3, delayFreeOp, op3Candidates); } @@ -2119,6 +2089,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert(lowVectorOperandNum != 4); assert(delayFreeOp != intrin.op4); + assert(addrOp != intrin.op3); srcCount += isRMW ? BuildDelayFreeUses(intrin.op4, delayFreeOp) : BuildOperandUses(intrin.op4); @@ -2126,6 +2097,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert(isRMW); assert(delayFreeOp != intrin.op5); + assert(addrOp != intrin.op3); + srcCount += BuildDelayFreeUses(intrin.op5, delayFreeOp); } } @@ -2460,6 +2433,54 @@ void LinearScan::getLowVectorOperandAndCandidates(HWIntrinsic intrin, size_t* op } } +//------------------------------------------------------------------------ +// GetDelayFreeOp: Get the address operand of the HWIntrinsic, if any +// +// Arguments: +// intrinsicTree - Node to check +// +// Return Value: +// The operand that is an address +// +GenTree* LinearScan::getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree) +{ + GenTree* pAddr = nullptr; + + if (intrinsicTree->OperIsMemoryLoad(&pAddr)) + { + assert(pAddr != nullptr); + return pAddr; + } + if (intrinsicTree->OperIsMemoryStore(&pAddr)) + { + assert(pAddr != nullptr); + return pAddr; + } + + // Operands that are not loads or stores but do require an address + switch(intrinsicTree->GetHWIntrinsicId()) + { + case NI_Sve_PrefetchBytes: + case NI_Sve_PrefetchInt16: + case NI_Sve_PrefetchInt32: + case NI_Sve_PrefetchInt64: + case NI_Sve_GatherPrefetch8Bit: + case NI_Sve_GatherPrefetch16Bit: + case NI_Sve_GatherPrefetch32Bit: + case NI_Sve_GatherPrefetch64Bit: + if (!varTypeIsSIMD(intrinsicTree->Op(2)->gtType)) + { + return intrinsicTree->Op(2); + } + break; + + default: + break; + } + + return nullptr; +} + #endif // FEATURE_HW_INTRINSICS #endif // TARGET_ARM64 From 442fc1fd7880e38cf12b940ba79f5b4aa20c727d Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Sep 2024 13:48:51 +0100 Subject: [PATCH 06/31] Add getConsecutiveRegistersOperand --- src/coreclr/jit/lsra.h | 1 + src/coreclr/jit/lsraarm64.cpp | 292 ++++++++++++++++------------------ 2 files changed, 136 insertions(+), 157 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 155dced4d4e209..9292635ba2f749 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1446,6 +1446,7 @@ class LinearScan : public LinearScanInterface FORCEINLINE RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition); void getLowVectorOperandAndCandidates(HWIntrinsic intrin, size_t* operandNum, SingleTypeRegSet* candidates); GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); + GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool *destIsConsecutive); #endif #ifdef DEBUG diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index f720432f917053..a939e6e472c7c9 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1548,18 +1548,7 @@ int 
LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou const HWIntrinsic intrin(intrinsicTree); - int srcCount = 0; - int dstCount = 0; - regMaskTP dstCandidates = RBM_NONE; - - if (HWIntrinsicInfo::IsMultiReg(intrin.id)) - { - dstCount = intrinsicTree->GetMultiRegCount(compiler); - } - else if (intrinsicTree->IsValue()) - { - dstCount = 1; - } + int srcCount = 0; const bool hasImmediateOperand = HWIntrinsicInfo::HasImmediateOperand(intrin.id); @@ -1762,6 +1751,9 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // Determine whether this is an operation where one of the ops is an address GenTree* addrOp = LinearScan::getVectorAddrOperand(intrinsicTree); + // Determine whether this is an operation where one of the ops has consecutive registers + bool destIsConsecutive = false; + GenTree* consecutiveOp = getConsecutiveRegistersOperand(intrinsicTree, &destIsConsecutive); if (intrin.op1 != nullptr) { @@ -1812,15 +1804,15 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou srcCount += BuildAddrUses(intrin.op1); } + else if (consecutiveOp == intrin.op1) + { + srcCount += BuildConsecutiveRegistersForUse(intrin.op1, delayFreeOp); + } else if (delayFreeOp == intrin.op1) { tgtPrefUse = BuildUse(intrin.op1); srcCount++; } - else if ((intrin.id == NI_AdvSimd_VectorTableLookup) || (intrin.id == NI_AdvSimd_Arm64_VectorTableLookup)) - { - srcCount += BuildConsecutiveRegistersForUse(intrin.op1); - } else { srcCount += BuildOperandUses(intrin.op1); @@ -1878,145 +1870,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } } } - else if (HWIntrinsicInfo::NeedsConsecutiveRegisters(intrin.id)) - { - switch (intrin.id) - { - case NI_AdvSimd_VectorTableLookup: - case NI_AdvSimd_Arm64_VectorTableLookup: - { - assert(intrin.op2 != nullptr); - srcCount += BuildOperandUses(intrin.op2); - assert(dstCount == 1); - buildInternalRegisterUses(); - BuildDef(intrinsicTree); - *pDstCount = 1; - break; - } - - case NI_AdvSimd_VectorTableLookupExtension: - case NI_AdvSimd_Arm64_VectorTableLookupExtension: - { - assert(intrin.op2 != nullptr); - assert(intrin.op3 != nullptr); - assert(isRMW); - srcCount += BuildConsecutiveRegistersForUse(intrin.op2, intrin.op1); - srcCount += BuildDelayFreeUses(intrin.op3, delayFreeOp); - assert(dstCount == 1); - buildInternalRegisterUses(); - BuildDef(intrinsicTree); - *pDstCount = 1; - break; - } - - case NI_AdvSimd_StoreSelectedScalar: - case NI_AdvSimd_Arm64_StoreSelectedScalar: - { - assert(intrin.op1 != nullptr); - assert(intrin.op3 != nullptr); - srcCount += (intrin.op2->gtType == TYP_STRUCT) ? 
BuildConsecutiveRegistersForUse(intrin.op2) - : BuildOperandUses(intrin.op2); - if (!intrin.op3->isContainedIntOrIImmed()) - { - srcCount += BuildOperandUses(intrin.op3); - } - assert(dstCount == 0); - buildInternalRegisterUses(); - *pDstCount = 0; - break; - } - - case NI_AdvSimd_Store: - case NI_AdvSimd_Arm64_Store: - case NI_AdvSimd_StoreVectorAndZip: - case NI_AdvSimd_Arm64_StoreVectorAndZip: - { - assert(intrin.op1 != nullptr); - srcCount += BuildConsecutiveRegistersForUse(intrin.op2); - assert(dstCount == 0); - buildInternalRegisterUses(); - *pDstCount = 0; - break; - } - - case NI_AdvSimd_LoadAndInsertScalarVector64x2: - case NI_AdvSimd_LoadAndInsertScalarVector64x3: - case NI_AdvSimd_LoadAndInsertScalarVector64x4: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: - { - assert(intrin.op2 != nullptr); - assert(intrin.op3 != nullptr); - assert(isRMW); - if (!intrin.op2->isContainedIntOrIImmed()) - { - srcCount += BuildOperandUses(intrin.op2); - } - - assert(intrinsicTree->OperIsMemoryLoadOrStore()); - srcCount += BuildAddrUses(intrin.op3); - buildInternalRegisterUses(); - FALLTHROUGH; - } - - case NI_AdvSimd_Load2xVector64AndUnzip: - case NI_AdvSimd_Load3xVector64AndUnzip: - case NI_AdvSimd_Load4xVector64AndUnzip: - case NI_AdvSimd_Arm64_Load2xVector128AndUnzip: - case NI_AdvSimd_Arm64_Load3xVector128AndUnzip: - case NI_AdvSimd_Arm64_Load4xVector128AndUnzip: - case NI_AdvSimd_Load2xVector64: - case NI_AdvSimd_Load3xVector64: - case NI_AdvSimd_Load4xVector64: - case NI_AdvSimd_Arm64_Load2xVector128: - case NI_AdvSimd_Arm64_Load3xVector128: - case NI_AdvSimd_Arm64_Load4xVector128: - case NI_AdvSimd_LoadAndReplicateToVector64x2: - case NI_AdvSimd_LoadAndReplicateToVector64x3: - case NI_AdvSimd_LoadAndReplicateToVector64x4: - case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x2: - case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x3: - case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x4: - { - assert(intrin.op1 != nullptr); - BuildConsecutiveRegistersForDef(intrinsicTree, dstCount); - *pDstCount = dstCount; - break; - } - - case NI_Sve_Load2xVectorAndUnzip: - case NI_Sve_Load3xVectorAndUnzip: - case NI_Sve_Load4xVectorAndUnzip: - { - assert(intrin.op1 != nullptr); - assert(intrin.op2 != nullptr); - assert(intrinsicTree->OperIsMemoryLoadOrStore()); - srcCount += BuildAddrUses(intrin.op2); - BuildConsecutiveRegistersForDef(intrinsicTree, dstCount); - *pDstCount = dstCount; - break; - } - - case NI_Sve_StoreAndZipx2: - case NI_Sve_StoreAndZipx3: - case NI_Sve_StoreAndZipx4: - { - assert(intrin.op2 != nullptr); - assert(intrin.op3 != nullptr); - srcCount += BuildAddrUses(intrin.op2); - srcCount += BuildConsecutiveRegistersForUse(intrin.op3); - assert(dstCount == 0); - buildInternalRegisterUses(); - *pDstCount = 0; - break; - } - - default: - noway_assert(!"Not a supported as multiple consecutive register intrinsic"); - } - return srcCount; - } else if (intrin.op2 != nullptr) { @@ -2047,6 +1900,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou srcCount += BuildAddrUses(intrin.op2); } + else if (consecutiveOp == intrin.op2) + { + srcCount += BuildConsecutiveRegistersForUse(intrin.op2, delayFreeOp); + } else if (delayFreeOp == intrin.op2) { assert(!intrin.op2->isContained()); @@ -2076,6 +1933,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou srcCount += BuildAddrUses(intrin.op3); } + else if (consecutiveOp == 
intrin.op3) + { + srcCount += BuildConsecutiveRegistersForUse(intrin.op3, delayFreeOp); + } else if (isRMW) { srcCount += BuildDelayFreeUses(intrin.op3, delayFreeOp, op3Candidates); @@ -2089,7 +1950,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert(lowVectorOperandNum != 4); assert(delayFreeOp != intrin.op4); - assert(addrOp != intrin.op3); + assert(addrOp != intrin.op4); + assert(consecutiveOp != intrin.op4); srcCount += isRMW ? BuildDelayFreeUses(intrin.op4, delayFreeOp) : BuildOperandUses(intrin.op4); @@ -2098,6 +1960,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(isRMW); assert(delayFreeOp != intrin.op5); assert(addrOp != intrin.op3); + assert(consecutiveOp != intrin.op5); srcCount += BuildDelayFreeUses(intrin.op5, delayFreeOp); } @@ -2107,7 +1970,25 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou buildInternalRegisterUses(); - if ((dstCount == 1) || (dstCount == 2)) + + // Build Destination + + int dstCount = 0; + + if (HWIntrinsicInfo::IsMultiReg(intrin.id)) + { + dstCount = intrinsicTree->GetMultiRegCount(compiler); + } + else if (intrinsicTree->IsValue()) + { + dstCount = 1; + } + + if (destIsConsecutive) + { + BuildConsecutiveRegistersForDef(intrinsicTree, dstCount); + } + else if ((dstCount == 1) || (dstCount == 2)) { BuildDef(intrinsicTree); @@ -2481,6 +2362,103 @@ GenTree* LinearScan::getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree) return nullptr; } +//------------------------------------------------------------------------ +// GetDelayFreeOp: Get the consecutive operand of the HWIntrinsic, if any +// +// Arguments: +// intrinsicTree - Intrin to check +// destIsConsecutive (out) - if the destination requires consective registers +// +// Return Value: +// The operand that requires consecutive registers +// +GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool *destIsConsecutive) +{ + *destIsConsecutive = false; + GenTree* consecutiveOp = nullptr; + + if (!HWIntrinsicInfo::NeedsConsecutiveRegisters(intrin.id)) + { + return nullptr; + } + + switch (intrin.id) + { + case NI_AdvSimd_Arm64_VectorTableLookup: + case NI_AdvSimd_VectorTableLookup: + consecutiveOp = intrin.op1; + assert(consecutiveOp != nullptr); + break; + + case NI_AdvSimd_Arm64_Store: + case NI_AdvSimd_Arm64_StoreVectorAndZip: + case NI_AdvSimd_Arm64_VectorTableLookupExtension: + case NI_AdvSimd_Store: + case NI_AdvSimd_StoreVectorAndZip: + case NI_AdvSimd_VectorTableLookupExtension: + consecutiveOp = intrin.op2; + assert(consecutiveOp != nullptr); + break; + + case NI_AdvSimd_StoreSelectedScalar: + case NI_AdvSimd_Arm64_StoreSelectedScalar: + if (intrin.op2->gtType == TYP_STRUCT) + { + consecutiveOp = intrin.op2; + assert(consecutiveOp != nullptr); + } + break; + + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: + case NI_AdvSimd_LoadAndInsertScalarVector64x2: + case NI_AdvSimd_LoadAndInsertScalarVector64x3: + case NI_AdvSimd_LoadAndInsertScalarVector64x4: + consecutiveOp = intrin.op2; + assert(consecutiveOp != nullptr); + *destIsConsecutive = true; + break; + + case NI_Sve_StoreAndZipx2: + case NI_Sve_StoreAndZipx3: + case NI_Sve_StoreAndZipx4: + consecutiveOp = intrin.op3; + assert(consecutiveOp != nullptr); + break; + + case NI_AdvSimd_Load2xVector64AndUnzip: + case NI_AdvSimd_Load3xVector64AndUnzip: + case 
NI_AdvSimd_Load4xVector64AndUnzip: + case NI_AdvSimd_Arm64_Load2xVector128AndUnzip: + case NI_AdvSimd_Arm64_Load3xVector128AndUnzip: + case NI_AdvSimd_Arm64_Load4xVector128AndUnzip: + case NI_AdvSimd_Load2xVector64: + case NI_AdvSimd_Load3xVector64: + case NI_AdvSimd_Load4xVector64: + case NI_AdvSimd_Arm64_Load2xVector128: + case NI_AdvSimd_Arm64_Load3xVector128: + case NI_AdvSimd_Arm64_Load4xVector128: + case NI_AdvSimd_LoadAndReplicateToVector64x2: + case NI_AdvSimd_LoadAndReplicateToVector64x3: + case NI_AdvSimd_LoadAndReplicateToVector64x4: + case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x2: + case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x3: + case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x4: + case NI_Sve_Load2xVectorAndUnzip: + case NI_Sve_Load3xVectorAndUnzip: + case NI_Sve_Load4xVectorAndUnzip: + *destIsConsecutive = true; + break; + + default: + noway_assert(!"Not a supported as multiple consecutive register intrinsic"); + } + + return consecutiveOp; +} + + #endif // FEATURE_HW_INTRINSICS #endif // TARGET_ARM64 From 3ce31aaea9074d9519e0df71b64f106371789cc0 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Sep 2024 14:26:15 +0100 Subject: [PATCH 07/31] Add BuildOperand() --- src/coreclr/jit/lsra.h | 1 + src/coreclr/jit/lsraarm64.cpp | 128 ++++++++++++++-------------------- 2 files changed, 53 insertions(+), 76 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 9292635ba2f749..2e8afd6cab44cc 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1447,6 +1447,7 @@ class LinearScan : public LinearScanInterface void getLowVectorOperandAndCandidates(HWIntrinsic intrin, size_t* operandNum, SingleTypeRegSet* candidates); GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool *destIsConsecutive); + int BuildOperand(GenTree* operand, GenTree* addrOp, GenTree* consecutiveOp, GenTree* delayFreeOp, bool isRMW, RefPosition** use, SingleTypeRegSet candidates); #endif #ifdef DEBUG diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index a939e6e472c7c9..9657393e9b3bfc 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1873,97 +1873,37 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou else if (intrin.op2 != nullptr) { - // RMW intrinsic operands doesn't have to be delayFree when they can be assigned the same register as op1Reg - // (i.e. a register that corresponds to read-modify-write operand) and one of them is the last use. - assert(intrin.op1 != nullptr); + // Get candidate information + SingleTypeRegSet lowVectorCandidates = RBM_NONE; size_t lowVectorOperandNum = 0; - if (HWIntrinsicInfo::IsLowVectorOperation(intrin.id)) { getLowVectorOperandAndCandidates(intrin, &lowVectorOperandNum, &lowVectorCandidates); } - SingleTypeRegSet op2Candidates = lowVectorOperandNum == 2 ? 
lowVectorCandidates : RBM_NONE; - if (varTypeIsMask(intrin.op2->TypeGet())) - { - assert(lowVectorOperandNum != 2); - op2Candidates = RBM_ALLMASK.GetPredicateRegSet(); - } - - if (addrOp == intrin.op2) - { - assert(delayFreeOp != intrin.op2); - assert(op2Candidates == RBM_NONE); - - srcCount += BuildAddrUses(intrin.op2); - } - else if (consecutiveOp == intrin.op2) - { - srcCount += BuildConsecutiveRegistersForUse(intrin.op2, delayFreeOp); - } - else if (delayFreeOp == intrin.op2) - { - assert(!intrin.op2->isContained()); - assert(tgtPrefUse == nullptr); - - tgtPrefUse2 = BuildUse(intrin.op2, op2Candidates); - srcCount++; - } - else if (isRMW) - { - srcCount += BuildDelayFreeUses(intrin.op2, delayFreeOp, op2Candidates); - } - else - { - srcCount += BuildOperandUses(intrin.op2, op2Candidates); - } + int numOperands = intrinsicTree->GetOperandCount(); - if (intrin.op3 != nullptr) + for (int opNum = 2; opNum <= numOperands; opNum++) { - assert(delayFreeOp != intrin.op3); - - SingleTypeRegSet op3Candidates = lowVectorOperandNum == 3 ? lowVectorCandidates : RBM_NONE; - if (addrOp == intrin.op3) + SingleTypeRegSet opCandidates = (lowVectorOperandNum == opNum) ? lowVectorCandidates : RBM_NONE; + if (varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet())) { - assert(op3Candidates == RBM_NONE); - - srcCount += BuildAddrUses(intrin.op3); - } - else if (consecutiveOp == intrin.op3) - { - srcCount += BuildConsecutiveRegistersForUse(intrin.op3, delayFreeOp); - } - else if (isRMW) - { - srcCount += BuildDelayFreeUses(intrin.op3, delayFreeOp, op3Candidates); - } - else - { - srcCount += BuildOperandUses(intrin.op3, op3Candidates); + assert(lowVectorOperandNum != opNum); + opCandidates = RBM_ALLMASK.GetPredicateRegSet(); } - if (intrin.op4 != nullptr) - { - assert(lowVectorOperandNum != 4); - assert(delayFreeOp != intrin.op4); - assert(addrOp != intrin.op4); - assert(consecutiveOp != intrin.op4); + RefPosition* delayUse = nullptr; - srcCount += isRMW ? 
BuildDelayFreeUses(intrin.op4, delayFreeOp) : BuildOperandUses(intrin.op4); + srcCount += BuildOperand(intrinsicTree->Op(opNum), addrOp, consecutiveOp, delayFreeOp, isRMW, &delayUse, opCandidates); - if (intrin.op5 != nullptr) - { - assert(isRMW); - assert(delayFreeOp != intrin.op5); - assert(addrOp != intrin.op3); - assert(consecutiveOp != intrin.op5); - - srcCount += BuildDelayFreeUses(intrin.op5, delayFreeOp); - } + if (delayUse != nullptr) + { + assert(opNum == 2); + tgtPrefUse2 = delayUse; } } } @@ -2315,7 +2255,7 @@ void LinearScan::getLowVectorOperandAndCandidates(HWIntrinsic intrin, size_t* op } //------------------------------------------------------------------------ -// GetDelayFreeOp: Get the address operand of the HWIntrinsic, if any +// getVectorAddrOperand: Get the address operand of the HWIntrinsic, if any // // Arguments: // intrinsicTree - Node to check @@ -2363,7 +2303,7 @@ GenTree* LinearScan::getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree) } //------------------------------------------------------------------------ -// GetDelayFreeOp: Get the consecutive operand of the HWIntrinsic, if any +// getConsecutiveRegistersOperand: Get the consecutive operand of the HWIntrinsic, if any // // Arguments: // intrinsicTree - Intrin to check @@ -2459,6 +2399,42 @@ GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bo } +int LinearScan::BuildOperand(GenTree* operand, GenTree* addrOp, GenTree* consecutiveOp, GenTree* delayFreeOp, bool isRMW, RefPosition** delayUse, SingleTypeRegSet candidates) +{ + int srcCount = 0; + + if (addrOp == operand) + { + assert(delayFreeOp != operand); + assert(consecutiveOp != operand); + + srcCount = BuildAddrUses(operand, candidates); + } + else if (consecutiveOp == operand) + { + assert(delayFreeOp != operand); + assert(candidates == RBM_NONE); + + srcCount = BuildConsecutiveRegistersForUse(operand, delayFreeOp); + } + else if (delayFreeOp == operand) + { + assert(delayUse != nullptr); + *delayUse = BuildUse(operand, candidates); + srcCount=1; + } + else if (isRMW) + { + srcCount = BuildDelayFreeUses(operand, delayFreeOp, candidates); + } + else + { + srcCount = BuildOperandUses(operand, candidates); + } + + return srcCount; +} + #endif // FEATURE_HW_INTRINSICS #endif // TARGET_ARM64 From 323ce4b3c60eba0817d95e4948880758202d48a0 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Sep 2024 14:40:57 +0100 Subject: [PATCH 08/31] Use BuildOperand for op1 --- src/coreclr/jit/gentree.cpp | 102 +------------------- src/coreclr/jit/gentree.h | 4 - src/coreclr/jit/lsra.h | 1 + src/coreclr/jit/lsraarm64.cpp | 172 +++++++++++++++++++++++++--------- 4 files changed, 130 insertions(+), 149 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 0806d64f1667ac..c60bff8c10202b 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -30297,107 +30297,7 @@ unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree return 0; } - -#if defined(TARGET_ARM64) -//------------------------------------------------------------------------ -// GetDelayFreeOp: Get the delay free characteristics of the HWIntrinsic -// -// For a RMW intrinsic preference the RMW operand to the target. -// For a simple move semantic between two SIMD registers, then preference the source operand. 
-// -// Arguments: -// compiler - the compiler instance -// isRMW - Set to true if is a RMW node -// delayFreeMultiple - Set to true if there are multiple values in the delay free operand -// -// Return Value: -// The operand that needs to be delay freed -// -GenTree* GenTreeHWIntrinsic::GetDelayFreeOp(Compiler* compiler, bool* isRMW, bool* delayFreeMultiple) -{ - *isRMW = isRMWHWIntrinsic(compiler); - *delayFreeMultiple = false; - - const NamedIntrinsic intrinsicId = GetHWIntrinsicId(); - GenTree* delayFreeOp = nullptr; - - switch (intrinsicId) - { - case NI_Vector64_CreateScalarUnsafe: - case NI_Vector128_CreateScalarUnsafe: - if (varTypeIsFloating(Op(1))) - { - delayFreeOp = Op(1); - } - break; - - case NI_AdvSimd_Arm64_DuplicateToVector64: - if (Op(1)->TypeGet() == TYP_DOUBLE) - { - delayFreeOp = Op(1); - } - break; - - case NI_Vector64_ToScalar: - case NI_Vector128_ToScalar: - if (varTypeIsFloating(this)) - { - delayFreeOp = Op(1); - } - break; - - case NI_Vector64_ToVector128Unsafe: - case NI_Vector128_AsVector128Unsafe: - case NI_Vector128_AsVector3: - case NI_Vector128_GetLower: - delayFreeOp = Op(1); - break; - - case NI_AdvSimd_LoadAndInsertScalarVector64x2: - case NI_AdvSimd_LoadAndInsertScalarVector64x3: - case NI_AdvSimd_LoadAndInsertScalarVector64x4: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: - assert(*isRMW); - delayFreeOp = Op(1); - *delayFreeMultiple = true; - break; - - case NI_Sve_CreateBreakPropagateMask: - // RMW operates on the second op. - assert(*isRMW); - assert(GetOperandCount() >= 2); - delayFreeOp = Op(2); - break; - - default: - if (*isRMW) - { - if (HWIntrinsicInfo::IsExplicitMaskedOperation(intrinsicId)) - { - // First operand contains the mask, RMW op is the second one. - delayFreeOp = Op(2); - } - else - { - delayFreeOp = Op(1); - } - } - break; - } - - if (delayFreeOp != nullptr) - { - // Only preference the delay op if it is not contained. - delayFreeOp = delayFreeOp->isContained() ? 
nullptr : delayFreeOp; - } - - return delayFreeOp; -} -#endif // TARGET_ARM64 - -#endif // (TARGET_XARCH || TARGET_ARM64) && FEATURE_HW_INTRINSICS +#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS unsigned GenTreeLclFld::GetSize() const { diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 6cbf12d2f4e7bb..e791d2fd064d3c 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6547,10 +6547,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic unsigned GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3); -#if defined(TARGET_ARM64) - GenTree* GetDelayFreeOp(Compiler* compiler, bool* isRMW, bool* delayFreeMultiple); -#endif // defined(TARGET_ARM64) - ClassLayout* GetLayout(Compiler* compiler) const; NamedIntrinsic GetHWIntrinsicId() const; diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 2e8afd6cab44cc..56ee5b96852431 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1445,6 +1445,7 @@ class LinearScan : public LinearScanInterface } FORCEINLINE RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition); void getLowVectorOperandAndCandidates(HWIntrinsic intrin, size_t* operandNum, SingleTypeRegSet* candidates); + GenTree* getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW, bool* delayFreeMultiple); GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool *destIsConsecutive); int BuildOperand(GenTree* operand, GenTree* addrOp, GenTree* consecutiveOp, GenTree* delayFreeOp, bool isRMW, RefPosition** use, SingleTypeRegSet candidates); diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 9657393e9b3bfc..c8c40f8c51dc0e 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1412,7 +1412,7 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins // Determine whether this the embedded operation requires delay free bool embeddedIsRMW = false; bool embeddedDelayFreeMultiple = false; - GenTree* embeddedDelayFreeOp = embeddedOpNode->GetDelayFreeOp(compiler, &embeddedIsRMW, &embeddedDelayFreeMultiple); + GenTree* embeddedDelayFreeOp = getDelayFreeOp(embeddedOpNode, &embeddedIsRMW, &embeddedDelayFreeMultiple); assert(!embeddedDelayFreeMultiple); // Handle Op1 @@ -1746,7 +1746,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // is not allocated the same register as the target. 
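+    // For example, codegen for a RMW intrinsic typically emits "mov target, op1"
+    // before the instruction itself, so a delayFree source that shared the
+    // target register would be clobbered by that mov before it is read.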
bool isRMW = false; bool delayFreeMultiple = false; - GenTree* delayFreeOp = intrinsicTree->GetDelayFreeOp(compiler, &isRMW, &delayFreeMultiple); + GenTree* delayFreeOp = getDelayFreeOp(intrinsicTree, &isRMW, &delayFreeMultiple); // Determine whether this is an operation where one of the ops is an address GenTree* addrOp = LinearScan::getVectorAddrOperand(intrinsicTree); @@ -1771,52 +1771,20 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou srcCount++; } } - else if (HWIntrinsicInfo::IsMaskedOperation(intrin.id)) - { - assert(addrOp != intrin.op1); - if (!varTypeIsMask(intrin.op1->TypeGet()) && !HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id)) - { - srcCount += BuildOperandUses(intrin.op1); - } - else - { - SingleTypeRegSet predMask = RBM_ALLMASK.GetPredicateRegSet(); + SingleTypeRegSet opCandidates = RBM_NONE; - if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id)) - { - predMask = RBM_LOWMASK.GetPredicateRegSet(); - } + if (varTypeIsMask(intrin.op1->TypeGet()) || HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id)) + { + opCandidates = RBM_ALLMASK.GetPredicateRegSet(); - if ((delayFreeOp != nullptr) && (delayFreeOp != intrin.op1)) - { - srcCount += BuildDelayFreeUses(intrin.op1, nullptr, predMask); - } - else - { - srcCount += BuildOperandUses(intrin.op1, predMask); - } + if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id)) + { + opCandidates = RBM_LOWMASK.GetPredicateRegSet(); } } - else if (addrOp == intrin.op1) - { - assert(delayFreeOp != intrin.op1); - srcCount += BuildAddrUses(intrin.op1); - } - else if (consecutiveOp == intrin.op1) - { - srcCount += BuildConsecutiveRegistersForUse(intrin.op1, delayFreeOp); - } - else if (delayFreeOp == intrin.op1) - { - tgtPrefUse = BuildUse(intrin.op1); - srcCount++; - } - else - { - srcCount += BuildOperandUses(intrin.op1); - } + srcCount += BuildOperand(intrinsicTree->Op(1), addrOp, consecutiveOp, delayFreeOp, isRMW, &tgtPrefUse, opCandidates); } if ((intrin.category == HW_Category_SIMDByIndexedElement) && (genTypeSize(intrin.baseType) == 2)) @@ -1876,7 +1844,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(intrin.op1 != nullptr); // Get candidate information - SingleTypeRegSet lowVectorCandidates = RBM_NONE; size_t lowVectorOperandNum = 0; if (HWIntrinsicInfo::IsLowVectorOperation(intrin.id)) @@ -2254,6 +2221,109 @@ void LinearScan::getLowVectorOperandAndCandidates(HWIntrinsic intrin, size_t* op } } +//------------------------------------------------------------------------ +// getDelayFreeOp: Get the delay free characteristics of the HWIntrinsic +// +// For a RMW intrinsic preference the RMW operand to the target. +// For a simple move semantic between two SIMD registers, then preference the source operand. 
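+// For example, a RMW node is typically emitted as "mov target, op1" followed by
+// the instruction itself, so preferencing the RMW operand allows that mov to be
+// skipped; NI_Vector128_GetLower is a simple move, so preferencing its source
+// lets the move be elided when both end up in the same register.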
+// +// Arguments: +// intrinsicTree - Node to check +// isRMW (out) - Set to true if is a RMW node +// delayFreeMultiple (out) - Set to true if there are multiple values in the delay free operand +// +// Return Value: +// The operand that needs to be delay freed +// +GenTree* LinearScan::getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW, bool* delayFreeMultiple) +{ + *isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); + *delayFreeMultiple = false; + + const NamedIntrinsic intrinsicId = intrinsicTree->GetHWIntrinsicId(); + GenTree* delayFreeOp = nullptr; + + switch (intrinsicId) + { + case NI_Vector64_CreateScalarUnsafe: + case NI_Vector128_CreateScalarUnsafe: + if (varTypeIsFloating(intrinsicTree->Op(1))) + { + delayFreeOp = intrinsicTree->Op(1); + assert(delayFreeOp != nullptr); + } + break; + + case NI_AdvSimd_Arm64_DuplicateToVector64: + if (intrinsicTree->Op(1)->TypeGet() == TYP_DOUBLE) + { + delayFreeOp = intrinsicTree->Op(1); + assert(delayFreeOp != nullptr); + } + break; + + case NI_Vector64_ToScalar: + case NI_Vector128_ToScalar: + if (varTypeIsFloating(intrinsicTree)) + { + delayFreeOp = intrinsicTree->Op(1); + assert(delayFreeOp != nullptr); + } + break; + + case NI_Vector64_ToVector128Unsafe: + case NI_Vector128_AsVector128Unsafe: + case NI_Vector128_AsVector3: + case NI_Vector128_GetLower: + delayFreeOp = intrinsicTree->Op(1); + assert(delayFreeOp != nullptr); + break; + + case NI_AdvSimd_LoadAndInsertScalarVector64x2: + case NI_AdvSimd_LoadAndInsertScalarVector64x3: + case NI_AdvSimd_LoadAndInsertScalarVector64x4: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: + assert(*isRMW); + delayFreeOp = intrinsicTree->Op(1); + assert(delayFreeOp != nullptr); + *delayFreeMultiple = true; + break; + + case NI_Sve_CreateBreakPropagateMask: + // RMW operates on the second op. + assert(*isRMW); + delayFreeOp = intrinsicTree->Op(2); + assert(delayFreeOp != nullptr); + break; + + default: + if (*isRMW) + { + if (HWIntrinsicInfo::IsExplicitMaskedOperation(intrinsicId)) + { + // First operand contains the mask, RMW op is the second one. + delayFreeOp = intrinsicTree->Op(2); + assert(delayFreeOp != nullptr); + } + else + { + delayFreeOp = intrinsicTree->Op(1); + assert(delayFreeOp != nullptr); + } + } + break; + } + + if (delayFreeOp != nullptr) + { + // Only preference the delay op if it is not contained. + delayFreeOp = delayFreeOp->isContained() ? nullptr : delayFreeOp; + } + + return delayFreeOp; +} //------------------------------------------------------------------------ // getVectorAddrOperand: Get the address operand of the HWIntrinsic, if any // @@ -2398,7 +2468,21 @@ GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bo return consecutiveOp; } - +//------------------------------------------------------------------------ +// BuildOperand: Build an operand in the correct way +// +// Arguments: +// operand - Operand to build +// addrOp - The operand that is an address operand (if any) +// consecutiveOp - The operand that requires consecutive registers (if any) +// consecutiveOp - The operand that delay free (if any) +// isRMW - if this operand requires RMW +// delayUse (out) - if this operand is delayfree, then set to the use +// candidates - The set of candidates for the uses +// +// Return Value: +// The number of source registers used by the *parent* of this node. 
+// int LinearScan::BuildOperand(GenTree* operand, GenTree* addrOp, GenTree* consecutiveOp, GenTree* delayFreeOp, bool isRMW, RefPosition** delayUse, SingleTypeRegSet candidates) { int srcCount = 0; From eaa535c37bbe0e481b77ae259f2fae75e653531f Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Sep 2024 18:29:11 +0100 Subject: [PATCH 09/31] Add buildHWIntrinsicImmediate --- src/coreclr/jit/lsra.h | 1 + src/coreclr/jit/lsraarm64.cpp | 391 ++++++++++++++++++---------------- 2 files changed, 207 insertions(+), 185 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 56ee5b96852431..695df974998f15 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1448,6 +1448,7 @@ class LinearScan : public LinearScanInterface GenTree* getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW, bool* delayFreeMultiple); GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool *destIsConsecutive); + bool buildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin); int BuildOperand(GenTree* operand, GenTree* addrOp, GenTree* consecutiveOp, GenTree* delayFreeOp, bool isRMW, RefPosition** use, SingleTypeRegSet candidates); #endif diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index c8c40f8c51dc0e..ec813b91bd6b9d 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1550,191 +1550,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou int srcCount = 0; - const bool hasImmediateOperand = HWIntrinsicInfo::HasImmediateOperand(intrin.id); - - if (hasImmediateOperand && !HWIntrinsicInfo::NoJmpTableImm(intrin.id)) - { - // We may need to allocate an additional general-purpose register when an intrinsic has a non-const immediate - // operand and the intrinsic does not have an alternative non-const fallback form. - // However, for a case when the operand can take only two possible values - zero and one - // the codegen can use cbnz to do conditional branch, so such register is not needed. 
- - bool needBranchTargetReg = false; - - int immLowerBound = 0; - int immUpperBound = 0; - - if (intrin.category == HW_Category_SIMDByIndexedElement) - { - var_types indexedElementOpType; - - if (intrin.numOperands == 2) - { - indexedElementOpType = intrin.op1->TypeGet(); - } - else if (intrin.numOperands == 3) - { - indexedElementOpType = intrin.op2->TypeGet(); - } - else - { - assert(intrin.numOperands == 4); - indexedElementOpType = intrin.op3->TypeGet(); - } - - assert(varTypeIsSIMD(indexedElementOpType)); - - const unsigned int indexedElementSimdSize = genTypeSize(indexedElementOpType); - HWIntrinsicInfo::lookupImmBounds(intrin.id, indexedElementSimdSize, intrin.baseType, 1, &immLowerBound, - &immUpperBound); - } - else - { - HWIntrinsicInfo::lookupImmBounds(intrin.id, intrinsicTree->GetSimdSize(), intrin.baseType, 1, - &immLowerBound, &immUpperBound); - } - - if ((immLowerBound != 0) || (immUpperBound != 1)) - { - if ((intrin.category == HW_Category_SIMDByIndexedElement) || - (intrin.category == HW_Category_ShiftLeftByImmediate) || - (intrin.category == HW_Category_ShiftRightByImmediate)) - { - switch (intrin.numOperands) - { - case 4: - needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); - break; - - case 3: - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - break; - - case 2: - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - break; - - default: - unreached(); - } - } - else - { - switch (intrin.id) - { - case NI_AdvSimd_DuplicateSelectedScalarToVector64: - case NI_AdvSimd_DuplicateSelectedScalarToVector128: - case NI_AdvSimd_Extract: - case NI_AdvSimd_Insert: - case NI_AdvSimd_InsertScalar: - case NI_AdvSimd_LoadAndInsertScalar: - case NI_AdvSimd_LoadAndInsertScalarVector64x2: - case NI_AdvSimd_LoadAndInsertScalarVector64x3: - case NI_AdvSimd_LoadAndInsertScalarVector64x4: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: - case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128: - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - break; - - case NI_AdvSimd_ExtractVector64: - case NI_AdvSimd_ExtractVector128: - case NI_AdvSimd_StoreSelectedScalar: - case NI_AdvSimd_Arm64_StoreSelectedScalar: - case NI_Sve_PrefetchBytes: - case NI_Sve_PrefetchInt16: - case NI_Sve_PrefetchInt32: - case NI_Sve_PrefetchInt64: - case NI_Sve_ExtractVector: - case NI_Sve_TrigonometricMultiplyAddCoefficient: - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - break; - - case NI_AdvSimd_Arm64_InsertSelectedScalar: - assert(intrin.op2->isContainedIntOrIImmed()); - assert(intrin.op4->isContainedIntOrIImmed()); - break; - - case NI_Sve_CreateTrueMaskByte: - case NI_Sve_CreateTrueMaskDouble: - case NI_Sve_CreateTrueMaskInt16: - case NI_Sve_CreateTrueMaskInt32: - case NI_Sve_CreateTrueMaskInt64: - case NI_Sve_CreateTrueMaskSByte: - case NI_Sve_CreateTrueMaskSingle: - case NI_Sve_CreateTrueMaskUInt16: - case NI_Sve_CreateTrueMaskUInt32: - case NI_Sve_CreateTrueMaskUInt64: - case NI_Sve_Count16BitElements: - case NI_Sve_Count32BitElements: - case NI_Sve_Count64BitElements: - case NI_Sve_Count8BitElements: - needBranchTargetReg = !intrin.op1->isContainedIntOrIImmed(); - break; - - case NI_Sve_GatherPrefetch8Bit: - case NI_Sve_GatherPrefetch16Bit: - case NI_Sve_GatherPrefetch32Bit: - case NI_Sve_GatherPrefetch64Bit: - if (!varTypeIsSIMD(intrin.op2->gtType)) - { - needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); - } - else - 
{ - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - } - break; - - case NI_Sve_SaturatingDecrementBy16BitElementCount: - case NI_Sve_SaturatingDecrementBy32BitElementCount: - case NI_Sve_SaturatingDecrementBy64BitElementCount: - case NI_Sve_SaturatingDecrementBy8BitElementCount: - case NI_Sve_SaturatingIncrementBy16BitElementCount: - case NI_Sve_SaturatingIncrementBy32BitElementCount: - case NI_Sve_SaturatingIncrementBy64BitElementCount: - case NI_Sve_SaturatingIncrementBy8BitElementCount: - case NI_Sve_SaturatingDecrementBy16BitElementCountScalar: - case NI_Sve_SaturatingDecrementBy32BitElementCountScalar: - case NI_Sve_SaturatingDecrementBy64BitElementCountScalar: - case NI_Sve_SaturatingIncrementBy16BitElementCountScalar: - case NI_Sve_SaturatingIncrementBy32BitElementCountScalar: - case NI_Sve_SaturatingIncrementBy64BitElementCountScalar: - // Can only avoid generating a table if both immediates are constant. - assert(intrin.op2->isContainedIntOrIImmed() == intrin.op3->isContainedIntOrIImmed()); - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - // Ensure that internal does not collide with destination. - setInternalRegsDelayFree = true; - break; - - case NI_Sve_MultiplyAddRotateComplexBySelectedScalar: - // This API has two immediates, one of which is used to index pairs of floats in a vector. - // For a vector width of 128 bits, this means the index's range is [0, 1], - // which means we will skip the above jump table register check, - // even though we might need a jump table for the second immediate. - // Thus, this API is special-cased, and does not use the HW_Category_SIMDByIndexedElement path. - // Also, only one internal register is needed for the jump table; - // we will combine the two immediates into one jump table. - - // Can only avoid generating a table if both immediates are constant. - assert(intrin.op4->isContainedIntOrIImmed() == intrin.op5->isContainedIntOrIImmed()); - needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); - // Ensure that internal does not collide with destination. 
- setInternalRegsDelayFree = true; - break; - - default: - unreached(); - } - } - } - - if (needBranchTargetReg) - { - buildInternalIntRegisterDefForNode(intrinsicTree); - } - } // ConditionalSelect with embedded masked operations require special handling if ((intrin.id == NI_Sve_ConditionalSelect) && (intrin.op2->IsEmbMaskOp())) @@ -1755,6 +1570,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou bool destIsConsecutive = false; GenTree* consecutiveOp = getConsecutiveRegistersOperand(intrinsicTree, &destIsConsecutive); + // Build any immediates + bool hasImmediateOperand = buildHWIntrinsicImmediate(intrinsicTree, intrin); + + if (intrin.op1 != nullptr) { if (delayFreeMultiple) @@ -2468,6 +2287,208 @@ GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bo return consecutiveOp; } +//------------------------------------------------------------------------ +// buildHWIntrinsicImmediate: Build immediate values +// +// Arguments: +// intrinsicTree - Intrinsic to check +// intrin - Intrin to check +// +// Return Value: +// True if the intrinsic contains an immediate +// +bool LinearScan::buildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin) +{ + const bool hasImmediateOperand = HWIntrinsicInfo::HasImmediateOperand(intrin.id); + + if (hasImmediateOperand && !HWIntrinsicInfo::NoJmpTableImm(intrin.id)) + { + // We may need to allocate an additional general-purpose register when an intrinsic has a non-const immediate + // operand and the intrinsic does not have an alternative non-const fallback form. + // However, for a case when the operand can take only two possible values - zero and one + // the codegen can use cbnz to do conditional branch, so such register is not needed. 
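+        // For example, a non-constant immediate for Sve.ExtractVector falls back
+        // to a switch over each legal immediate value, and the additional register
+        // holds the computed branch target while dispatching into that table.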
+ + bool needBranchTargetReg = false; + + int immLowerBound = 0; + int immUpperBound = 0; + + if (intrin.category == HW_Category_SIMDByIndexedElement) + { + var_types indexedElementOpType; + + if (intrin.numOperands == 2) + { + indexedElementOpType = intrin.op1->TypeGet(); + } + else if (intrin.numOperands == 3) + { + indexedElementOpType = intrin.op2->TypeGet(); + } + else + { + assert(intrin.numOperands == 4); + indexedElementOpType = intrin.op3->TypeGet(); + } + + assert(varTypeIsSIMD(indexedElementOpType)); + + const unsigned int indexedElementSimdSize = genTypeSize(indexedElementOpType); + HWIntrinsicInfo::lookupImmBounds(intrin.id, indexedElementSimdSize, intrin.baseType, 1, &immLowerBound, + &immUpperBound); + } + else + { + HWIntrinsicInfo::lookupImmBounds(intrin.id, intrinsicTree->GetSimdSize(), intrin.baseType, 1, + &immLowerBound, &immUpperBound); + } + + if ((immLowerBound != 0) || (immUpperBound != 1)) + { + if ((intrin.category == HW_Category_SIMDByIndexedElement) || + (intrin.category == HW_Category_ShiftLeftByImmediate) || + (intrin.category == HW_Category_ShiftRightByImmediate)) + { + switch (intrin.numOperands) + { + case 4: + needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); + break; + + case 3: + needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); + break; + + case 2: + needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); + break; + + default: + unreached(); + } + } + else + { + switch (intrin.id) + { + case NI_AdvSimd_DuplicateSelectedScalarToVector64: + case NI_AdvSimd_DuplicateSelectedScalarToVector128: + case NI_AdvSimd_Extract: + case NI_AdvSimd_Insert: + case NI_AdvSimd_InsertScalar: + case NI_AdvSimd_LoadAndInsertScalar: + case NI_AdvSimd_LoadAndInsertScalarVector64x2: + case NI_AdvSimd_LoadAndInsertScalarVector64x3: + case NI_AdvSimd_LoadAndInsertScalarVector64x4: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: + case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128: + needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); + break; + + case NI_AdvSimd_ExtractVector64: + case NI_AdvSimd_ExtractVector128: + case NI_AdvSimd_StoreSelectedScalar: + case NI_AdvSimd_Arm64_StoreSelectedScalar: + case NI_Sve_PrefetchBytes: + case NI_Sve_PrefetchInt16: + case NI_Sve_PrefetchInt32: + case NI_Sve_PrefetchInt64: + case NI_Sve_ExtractVector: + case NI_Sve_TrigonometricMultiplyAddCoefficient: + needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); + break; + + case NI_AdvSimd_Arm64_InsertSelectedScalar: + assert(intrin.op2->isContainedIntOrIImmed()); + assert(intrin.op4->isContainedIntOrIImmed()); + break; + + case NI_Sve_CreateTrueMaskByte: + case NI_Sve_CreateTrueMaskDouble: + case NI_Sve_CreateTrueMaskInt16: + case NI_Sve_CreateTrueMaskInt32: + case NI_Sve_CreateTrueMaskInt64: + case NI_Sve_CreateTrueMaskSByte: + case NI_Sve_CreateTrueMaskSingle: + case NI_Sve_CreateTrueMaskUInt16: + case NI_Sve_CreateTrueMaskUInt32: + case NI_Sve_CreateTrueMaskUInt64: + case NI_Sve_Count16BitElements: + case NI_Sve_Count32BitElements: + case NI_Sve_Count64BitElements: + case NI_Sve_Count8BitElements: + needBranchTargetReg = !intrin.op1->isContainedIntOrIImmed(); + break; + + case NI_Sve_GatherPrefetch8Bit: + case NI_Sve_GatherPrefetch16Bit: + case NI_Sve_GatherPrefetch32Bit: + case NI_Sve_GatherPrefetch64Bit: + if (!varTypeIsSIMD(intrin.op2->gtType)) + { + needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); + } + else + 
{ + needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); + } + break; + + case NI_Sve_SaturatingDecrementBy16BitElementCount: + case NI_Sve_SaturatingDecrementBy32BitElementCount: + case NI_Sve_SaturatingDecrementBy64BitElementCount: + case NI_Sve_SaturatingDecrementBy8BitElementCount: + case NI_Sve_SaturatingIncrementBy16BitElementCount: + case NI_Sve_SaturatingIncrementBy32BitElementCount: + case NI_Sve_SaturatingIncrementBy64BitElementCount: + case NI_Sve_SaturatingIncrementBy8BitElementCount: + case NI_Sve_SaturatingDecrementBy16BitElementCountScalar: + case NI_Sve_SaturatingDecrementBy32BitElementCountScalar: + case NI_Sve_SaturatingDecrementBy64BitElementCountScalar: + case NI_Sve_SaturatingIncrementBy16BitElementCountScalar: + case NI_Sve_SaturatingIncrementBy32BitElementCountScalar: + case NI_Sve_SaturatingIncrementBy64BitElementCountScalar: + // Can only avoid generating a table if both immediates are constant. + assert(intrin.op2->isContainedIntOrIImmed() == intrin.op3->isContainedIntOrIImmed()); + needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); + // Ensure that internal does not collide with destination. + setInternalRegsDelayFree = true; + break; + + case NI_Sve_MultiplyAddRotateComplexBySelectedScalar: + // This API has two immediates, one of which is used to index pairs of floats in a vector. + // For a vector width of 128 bits, this means the index's range is [0, 1], + // which means we will skip the above jump table register check, + // even though we might need a jump table for the second immediate. + // Thus, this API is special-cased, and does not use the HW_Category_SIMDByIndexedElement path. + // Also, only one internal register is needed for the jump table; + // we will combine the two immediates into one jump table. + + // Can only avoid generating a table if both immediates are constant. + assert(intrin.op4->isContainedIntOrIImmed() == intrin.op5->isContainedIntOrIImmed()); + needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); + // Ensure that internal does not collide with destination. 
+ setInternalRegsDelayFree = true; + break; + + default: + unreached(); + } + } + } + + if (needBranchTargetReg) + { + buildInternalIntRegisterDefForNode(intrinsicTree); + } + } + + return hasImmediateOperand; +} + + //------------------------------------------------------------------------ // BuildOperand: Build an operand in the correct way // From ec4efdb74b2b9ee5256de8bbcdcba6fbb58aaeab Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 5 Sep 2024 08:34:42 +0100 Subject: [PATCH 10/31] Add getOperandCandidates() --- src/coreclr/jit/lsra.h | 2 +- src/coreclr/jit/lsraarm64.cpp | 192 ++++++++++++++-------------------- 2 files changed, 82 insertions(+), 112 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 695df974998f15..6cf1ec2a9cac42 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1444,7 +1444,7 @@ class LinearScan : public LinearScanInterface return nextConsecutiveRefPositionMap; } FORCEINLINE RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition); - void getLowVectorOperandAndCandidates(HWIntrinsic intrin, size_t* operandNum, SingleTypeRegSet* candidates); + SingleTypeRegSet getOperandCandidates(GenTreeHWIntrinsic* intrinsicTree, HWIntrinsic intrin, size_t opNum); GenTree* getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW, bool* delayFreeMultiple); GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool *destIsConsecutive); diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index ec813b91bd6b9d..3fa4990c63599f 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1591,96 +1591,18 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } } - SingleTypeRegSet opCandidates = RBM_NONE; - - if (varTypeIsMask(intrin.op1->TypeGet()) || HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id)) - { - opCandidates = RBM_ALLMASK.GetPredicateRegSet(); - - if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id)) - { - opCandidates = RBM_LOWMASK.GetPredicateRegSet(); - } - } + SingleTypeRegSet opCandidates = getOperandCandidates(intrinsicTree, intrin, 1); srcCount += BuildOperand(intrinsicTree->Op(1), addrOp, consecutiveOp, delayFreeOp, isRMW, &tgtPrefUse, opCandidates); } - if ((intrin.category == HW_Category_SIMDByIndexedElement) && (genTypeSize(intrin.baseType) == 2)) - { - // Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g. - // "MLA (by element)") have encoding that restricts what registers that can be used for the indexed element when - // the element size is H (i.e. 2 bytes). 
- assert(intrin.op2 != nullptr); - - if ((intrin.op4 != nullptr) || ((intrin.op3 != nullptr) && !hasImmediateOperand)) - { - if (isRMW) - { - srcCount += BuildDelayFreeUses(intrin.op2, nullptr); - srcCount += - BuildDelayFreeUses(intrin.op3, nullptr, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet()); - } - else - { - srcCount += BuildOperandUses(intrin.op2); - srcCount += BuildOperandUses(intrin.op3, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet()); - } - - if (intrin.op4 != nullptr) - { - assert(hasImmediateOperand); - assert(varTypeIsIntegral(intrin.op4)); - - srcCount += BuildOperandUses(intrin.op4); - } - } - else - { - assert(!isRMW); - - if (intrin.id == NI_Sve_DuplicateSelectedScalarToVector) - { - srcCount += BuildOperandUses(intrin.op2); - } - else - { - srcCount += BuildOperandUses(intrin.op2, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet()); - } - - if (intrin.op3 != nullptr) - { - assert(hasImmediateOperand); - assert(varTypeIsIntegral(intrin.op3)); - - srcCount += BuildOperandUses(intrin.op3); - } - } - } - - else if (intrin.op2 != nullptr) + if (intrin.op2 != nullptr) { assert(intrin.op1 != nullptr); - // Get candidate information - SingleTypeRegSet lowVectorCandidates = RBM_NONE; - size_t lowVectorOperandNum = 0; - if (HWIntrinsicInfo::IsLowVectorOperation(intrin.id)) + for (int opNum = 2; opNum <= intrin.numOperands; opNum++) { - getLowVectorOperandAndCandidates(intrin, &lowVectorOperandNum, &lowVectorCandidates); - } - - int numOperands = intrinsicTree->GetOperandCount(); - - for (int opNum = 2; opNum <= numOperands; opNum++) - { - - SingleTypeRegSet opCandidates = (lowVectorOperandNum == opNum) ? lowVectorCandidates : RBM_NONE; - if (varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet())) - { - assert(lowVectorOperandNum != opNum); - opCandidates = RBM_ALLMASK.GetPredicateRegSet(); - } + SingleTypeRegSet opCandidates = getOperandCandidates(intrinsicTree, intrin, opNum); RefPosition* delayUse = nullptr; @@ -2000,44 +1922,92 @@ bool RefPosition::isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegist #endif // DEBUG //------------------------------------------------------------------------ -// getLowVectorOperandAndCandidates: Instructions for certain intrinsics operate on low vector registers -// depending on the size of the element. The method returns the candidates based on that size and -// the operand number of the intrinsics that has the restriction. 
+// getOperandCandidates: Get the register candidates for a given operand number
 //
 // Arguments:
-//    intrin           - Intrinsics
-//    operandNum (out) - The operand number having the low vector register restriction
-//    candidates (out) - The restricted low vector registers
+//    intrinsicTree    - Tree to check
+//    intrin           - Intrin to check
+//    opNum            - The operand number in the intrinsic tree
+//
+// Return Value:
+//    The candidates for the operand number
 //
-void LinearScan::getLowVectorOperandAndCandidates(HWIntrinsic intrin, size_t* operandNum, SingleTypeRegSet* candidates)
+SingleTypeRegSet LinearScan::getOperandCandidates(GenTreeHWIntrinsic* intrinsicTree, HWIntrinsic intrin, size_t opNum)
 {
-    assert(HWIntrinsicInfo::IsLowVectorOperation(intrin.id));
-    unsigned baseElementSize = genTypeSize(intrin.baseType);
+    SingleTypeRegSet opCandidates = RBM_NONE;
 
-    if (baseElementSize == 8)
+    if (HWIntrinsicInfo::IsLowVectorOperation(intrin.id))
     {
-        *candidates = RBM_SVE_INDEXED_D_ELEMENT_ALLOWED_REGS.GetFloatRegSet();
+        assert(!varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet()));
+
+        bool isLowVectorOpNum = false;
+
+        switch (intrin.id)
+        {
+            case NI_Sve_DotProductBySelectedScalar:
+            case NI_Sve_FusedMultiplyAddBySelectedScalar:
+            case NI_Sve_FusedMultiplySubtractBySelectedScalar:
+            case NI_Sve_MultiplyAddRotateComplexBySelectedScalar:
+                isLowVectorOpNum = (opNum == 3);
+                break;
+            case NI_Sve_MultiplyBySelectedScalar:
+                isLowVectorOpNum = (opNum == 2);
+                break;
+            default:
+                unreached();
+        }
+
+        if (isLowVectorOpNum)
+        {
+            unsigned baseElementSize = genTypeSize(intrin.baseType);
+
+            if (baseElementSize == 8)
+            {
+                opCandidates = RBM_SVE_INDEXED_D_ELEMENT_ALLOWED_REGS.GetFloatRegSet();
+            }
+            else
+            {
+                assert(baseElementSize == 4);
+                opCandidates = RBM_SVE_INDEXED_S_ELEMENT_ALLOWED_REGS.GetFloatRegSet();
+            }
+        }
     }
-    else
+    else if ((intrin.category == HW_Category_SIMDByIndexedElement) && (genTypeSize(intrin.baseType) == 2))
     {
-        assert(baseElementSize == 4);
-        *candidates = RBM_SVE_INDEXED_S_ELEMENT_ALLOWED_REGS.GetFloatRegSet();
+        // Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g.
+        // "MLA (by element)") have encoding that restricts what registers that can be used for the indexed element when
+        // the element size is H (i.e. 2 bytes).
+
+        if ((intrin.numOperands == 4) || (intrin.numOperands == 3 && !HWIntrinsicInfo::HasImmediateOperand(intrin.id)))
+        {
+            opCandidates = (opNum == 3) ? RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet() : opCandidates;
+        }
+        else if (intrin.id == NI_Sve_DuplicateSelectedScalarToVector)
+        {
+            // Do nothing
+        }
+        else
+        {
+            opCandidates = (opNum == 2) ? RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet() : opCandidates;
+        }
     }
+    else if (opNum == 1 && HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id))
+    {
+        assert(varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet()));
 
-    switch (intrin.id)
+        opCandidates = RBM_ALLMASK.GetPredicateRegSet();
+
+        if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id))
+        {
+            opCandidates = RBM_LOWMASK.GetPredicateRegSet();
+        }
+    }
+    else if (varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet()))
     {
-        case NI_Sve_DotProductBySelectedScalar:
-        case NI_Sve_FusedMultiplyAddBySelectedScalar:
-        case NI_Sve_FusedMultiplySubtractBySelectedScalar:
-        case NI_Sve_MultiplyAddRotateComplexBySelectedScalar:
-            *operandNum = 3;
-            break;
-        case NI_Sve_MultiplyBySelectedScalar:
-            *operandNum = 2;
-            break;
-        default:
-            unreached();
+        opCandidates = RBM_ALLMASK.GetPredicateRegSet();
     }
+
+    return opCandidates;
 }
 
 //------------------------------------------------------------------------
@@ -2047,7 +2017,7 @@ void LinearScan::getLowVectorOperandAndCandidates(HWIntrinsic intrin, size_t* op
 // For a simple move semantic between two SIMD registers, then preference the source operand.
 // For example, a RMW node is typically emitted as "mov target, op1" followed by
 // the instruction itself, so preferencing the RMW operand allows that mov to be
 // skipped; NI_Vector128_GetLower is a simple move, so preferencing its source
 // lets the move be elided when both end up in the same register.
 //
 // Arguments:
-//    intrinsicTree - Node to check
+//    intrinsicTree - Tree to check
 //    isRMW (out)   - Set to true if is a RMW node
 //    delayFreeMultiple (out) - Set to true if there are multiple values in the delay free operand
@@ -2195,7 +2165,7 @@ GenTree* LinearScan::getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree)
 // getConsecutiveRegistersOperand: Get the consecutive operand of the HWIntrinsic, if any
 //
 // Arguments:
-//    intrinsicTree - Intrin to check
+//    intrinsicTree - Tree to check
 //    destIsConsecutive (out) - if the destination requires consecutive registers
 //
 // Return Value:
@@ -2291,7 +2261,7 @@ GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bo
 // buildHWIntrinsicImmediate: Build immediate values
 //
 // Arguments:
-//    intrinsicTree - Intrinsic to check
+//    intrinsicTree - Tree to check
 //    intrin        - Intrin to check
 //
 // Return Value:
 
From 8caedea8900fab4a47346921cf03686f58fcb238 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Thu, 5 Sep 2024 12:27:28 +0100
Subject: [PATCH 11/31] Remove BuildOperand()

---
 src/coreclr/jit/lsra.h        |   1 -
 src/coreclr/jit/lsraarm64.cpp | 111 +++++++++++++---------------------
 2 files changed, 41 insertions(+), 71 deletions(-)

diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h
index 6cf1ec2a9cac42..336544ed40aa4a 100644
--- a/src/coreclr/jit/lsra.h
+++ b/src/coreclr/jit/lsra.h
@@ -1449,7 +1449,6 @@ class LinearScan : public LinearScanInterface
     GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree);
     GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool *destIsConsecutive);
     bool buildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin);
-    int BuildOperand(GenTree* operand, GenTree* addrOp, GenTree* consecutiveOp, GenTree* delayFreeOp, bool isRMW, RefPosition** use, SingleTypeRegSet candidates);
 #endif
 
 #ifdef DEBUG
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index 3fa4990c63599f..76c430e9673713 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -1573,10 +1573,16 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
     // Build any immediates
     bool hasImmediateOperand = buildHWIntrinsicImmediate(intrinsicTree, intrin);
 
-
-    if (intrin.op1 != nullptr)
+    // Build all Operands
+    for (int opNum = 1; opNum <= intrin.numOperands; 
opNum++) { - if (delayFreeMultiple) + GenTree* operand = intrinsicTree->Op(opNum); + + assert(operand != nullptr); + + SingleTypeRegSet candidates = getOperandCandidates(intrinsicTree, intrin, opNum); + + if (delayFreeMultiple && opNum == 1) { assert(intrin.op1->OperIs(GT_FIELD_LIST)); GenTreeFieldList* op1 = intrin.op1->AsFieldList(); @@ -1590,30 +1596,47 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou srcCount++; } } - - SingleTypeRegSet opCandidates = getOperandCandidates(intrinsicTree, intrin, 1); - - srcCount += BuildOperand(intrinsicTree->Op(1), addrOp, consecutiveOp, delayFreeOp, isRMW, &tgtPrefUse, opCandidates); - } - - if (intrin.op2 != nullptr) - { - assert(intrin.op1 != nullptr); - - for (int opNum = 2; opNum <= intrin.numOperands; opNum++) + else if (addrOp == operand) { - SingleTypeRegSet opCandidates = getOperandCandidates(intrinsicTree, intrin, opNum); + assert(delayFreeOp != operand); + assert(consecutiveOp != operand); - RefPosition* delayUse = nullptr; + srcCount += BuildAddrUses(operand, candidates); + } + else if (consecutiveOp == operand) + { + assert(delayFreeOp != operand); + assert(candidates == RBM_NONE); - srcCount += BuildOperand(intrinsicTree->Op(opNum), addrOp, consecutiveOp, delayFreeOp, isRMW, &delayUse, opCandidates); + srcCount += BuildConsecutiveRegistersForUse(operand, delayFreeOp); + } + else if (delayFreeOp == operand) + { + RefPosition* delayUse = BuildUse(operand, candidates); + srcCount+=1; - if (delayUse != nullptr) + if (opNum == 1) + { + assert(tgtPrefUse == nullptr); + assert(tgtPrefUse2 == nullptr); + tgtPrefUse = delayUse; + } + else { assert(opNum == 2); + assert(tgtPrefUse == nullptr); + assert(tgtPrefUse2 == nullptr); tgtPrefUse2 = delayUse; } } + else if (isRMW) + { + srcCount += BuildDelayFreeUses(operand, delayFreeOp, candidates); + } + else + { + srcCount += BuildOperandUses(operand, candidates); + } } buildInternalRegisterUses(); @@ -2458,58 +2481,6 @@ bool LinearScan::buildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, co return hasImmediateOperand; } - -//------------------------------------------------------------------------ -// BuildOperand: Build an operand in the correct way -// -// Arguments: -// operand - Operand to build -// addrOp - The operand that is an address operand (if any) -// consecutiveOp - The operand that requires consecutive registers (if any) -// consecutiveOp - The operand that delay free (if any) -// isRMW - if this operand requires RMW -// delayUse (out) - if this operand is delayfree, then set to the use -// candidates - The set of candidates for the uses -// -// Return Value: -// The number of source registers used by the *parent* of this node. 
-// -int LinearScan::BuildOperand(GenTree* operand, GenTree* addrOp, GenTree* consecutiveOp, GenTree* delayFreeOp, bool isRMW, RefPosition** delayUse, SingleTypeRegSet candidates) -{ - int srcCount = 0; - - if (addrOp == operand) - { - assert(delayFreeOp != operand); - assert(consecutiveOp != operand); - - srcCount = BuildAddrUses(operand, candidates); - } - else if (consecutiveOp == operand) - { - assert(delayFreeOp != operand); - assert(candidates == RBM_NONE); - - srcCount = BuildConsecutiveRegistersForUse(operand, delayFreeOp); - } - else if (delayFreeOp == operand) - { - assert(delayUse != nullptr); - *delayUse = BuildUse(operand, candidates); - srcCount=1; - } - else if (isRMW) - { - srcCount = BuildDelayFreeUses(operand, delayFreeOp, candidates); - } - else - { - srcCount = BuildOperandUses(operand, candidates); - } - - return srcCount; -} - #endif // FEATURE_HW_INTRINSICS #endif // TARGET_ARM64 From 99c53ebd59c4e46ab336758a049f37f5e4a8dabd Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 5 Sep 2024 13:35:01 +0100 Subject: [PATCH 12/31] remove delayFreeMultiple --- src/coreclr/jit/lsra.h | 2 +- src/coreclr/jit/lsraarm64.cpp | 69 +++++++++++++---------------------- 2 files changed, 26 insertions(+), 45 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 336544ed40aa4a..1476a29189b5ea 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1445,7 +1445,7 @@ class LinearScan : public LinearScanInterface } FORCEINLINE RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition); SingleTypeRegSet getOperandCandidates(GenTreeHWIntrinsic* intrinsicTree, HWIntrinsic intrin, size_t opNum); - GenTree* getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW, bool* delayFreeMultiple); + GenTree* getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW); GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool *destIsConsecutive); bool buildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin); diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 76c430e9673713..b9bdbf38b098b4 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1411,9 +1411,7 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins // Determine whether this the embedded operation requires delay free bool embeddedIsRMW = false; - bool embeddedDelayFreeMultiple = false; - GenTree* embeddedDelayFreeOp = getDelayFreeOp(embeddedOpNode, &embeddedIsRMW, &embeddedDelayFreeMultiple); - assert(!embeddedDelayFreeMultiple); + GenTree* embeddedDelayFreeOp = getDelayFreeOp(embeddedOpNode, &embeddedIsRMW); // Handle Op1 @@ -1559,9 +1557,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // Determine whether this is an operation where an op must be marked delayFree so that it // is not allocated the same register as the target. 
     // For example, codegen for a RMW intrinsic typically emits "mov target, op1"
     // before the instruction itself, so a delayFree source that shared the
     // target register would be clobbered by that mov before it is read.
-    bool isRMW = false;
-    bool delayFreeMultiple = false;
-    GenTree* delayFreeOp = getDelayFreeOp(intrinsicTree, &isRMW, &delayFreeMultiple);
+    bool isRMW = false;
+    GenTree* delayFreeOp = getDelayFreeOp(intrinsicTree, &isRMW);
 
     // Determine whether this is an operation where one of the ops is an address
     GenTree* addrOp = LinearScan::getVectorAddrOperand(intrinsicTree);
@@ -1582,21 +1579,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
 
         SingleTypeRegSet candidates = getOperandCandidates(intrinsicTree, intrin, opNum);
 
-        if (delayFreeMultiple && opNum == 1)
-        {
-            assert(intrin.op1->OperIs(GT_FIELD_LIST));
-            GenTreeFieldList* op1 = intrin.op1->AsFieldList();
-            assert(compiler->info.compNeedsConsecutiveRegisters);
-            assert(addrOp != intrin.op1);
-            assert(delayFreeOp != intrin.op1);
-
-            for (GenTreeFieldList::Use& use : op1->Uses())
-            {
-                BuildDelayFreeUses(use.GetNode(), intrinsicTree);
-                srcCount++;
-            }
-        }
-        else if (addrOp == operand)
+        if (addrOp == operand)
         {
             assert(delayFreeOp != operand);
             assert(consecutiveOp != operand);
@@ -1605,28 +1588,35 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
         }
         else if (consecutiveOp == operand)
         {
-            assert(delayFreeOp != operand);
             assert(candidates == RBM_NONE);
 
+            // Some operands have a consecutive op which is also a delay free op
             srcCount += BuildConsecutiveRegistersForUse(operand, delayFreeOp);
         }
         else if (delayFreeOp == operand)
        {
-            RefPosition* delayUse = BuildUse(operand, candidates);
-            srcCount+=1;
-
-            if (opNum == 1)
+            if (delayFreeOp->isContained())
             {
-                assert(tgtPrefUse == nullptr);
-                assert(tgtPrefUse2 == nullptr);
-                tgtPrefUse = delayUse;
+                srcCount += BuildOperandUses(operand);
             }
             else
             {
-                assert(opNum == 2);
-                assert(tgtPrefUse == nullptr);
-                assert(tgtPrefUse2 == nullptr);
-                tgtPrefUse2 = delayUse;
+                RefPosition* delayUse = BuildUse(operand, candidates);
+                srcCount+=1;
+
+                if (opNum == 1)
+                {
+                    assert(tgtPrefUse == nullptr);
+                    assert(tgtPrefUse2 == nullptr);
+                    tgtPrefUse = delayUse;
+                }
+                else
+                {
+                    assert(opNum == 2);
+                    assert(tgtPrefUse == nullptr);
+                    assert(tgtPrefUse2 == nullptr);
+                    tgtPrefUse2 = delayUse;
+                }
             }
         }
         else if (isRMW)
@@ -2042,15 +2032,13 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
 // Arguments:
 //    intrinsicTree - Tree to check
 //    isRMW (out)   - Set to true if is a RMW node
-//    delayFreeMultiple (out) - Set to true if there are multiple values in the delay free operand
 //
 // Return Value:
 //    The operand that needs to be delay freed
 //
-GenTree* LinearScan::getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW, bool* delayFreeMultiple)
+GenTree* LinearScan::getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW)
 {
     *isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);
-    *delayFreeMultiple = false;
 
     const NamedIntrinsic intrinsicId = intrinsicTree->GetHWIntrinsicId();
     GenTree*             delayFreeOp = nullptr;
@@ -2100,7 +2088,6 @@ GenTree* LinearScan::getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isR
             assert(*isRMW);
             delayFreeOp = intrinsicTree->Op(1);
             assert(delayFreeOp != nullptr);
-            *delayFreeMultiple = true;
             break;
 
         case NI_Sve_CreateBreakPropagateMask:
@@ -2128,12 +2115,6 @@ GenTree* LinearScan::getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isR
             break;
     }
 
-    if (delayFreeOp != nullptr)
-    {
-        // Only preference the delay op if it is not contained.
-        delayFreeOp = delayFreeOp->isContained() ? 
nullptr : delayFreeOp; - } - return delayFreeOp; } //------------------------------------------------------------------------ @@ -2237,7 +2218,7 @@ GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bo case NI_AdvSimd_LoadAndInsertScalarVector64x2: case NI_AdvSimd_LoadAndInsertScalarVector64x3: case NI_AdvSimd_LoadAndInsertScalarVector64x4: - consecutiveOp = intrin.op2; + consecutiveOp = intrin.op1; assert(consecutiveOp != nullptr); *destIsConsecutive = true; break; From 1b30f920fd753a38fc5bc9454d0ff9b15a742906 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 27 Aug 2024 16:43:12 +0100 Subject: [PATCH 13/31] Fixes from other PRs to be removed --- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 7 -- src/coreclr/jit/lowerarmarch.cpp | 68 ++++++++++--------- .../Shared/_SveBinaryOpTestTemplate.template | 29 ++++++++ .../Shared/_SveTernOpTestTemplate.template | 30 ++++++++ .../JitBlue/Runtime_106869/Runtime_106869.cs | 63 +++++++++++++++++ .../Runtime_106869/Runtime_106869.csproj | 9 +++ 6 files changed, 166 insertions(+), 40 deletions(-) create mode 100644 src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.cs create mode 100644 src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.csproj diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index bd5027cd4e02d5..aa2f0845507724 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -1106,7 +1106,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) } else if (isRMW) { - assert((targetReg == op1Reg) || (targetReg != op2Reg)); GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op1Reg, /* canSkip */ true); GetEmitter()->emitIns_R_R(ins, emitSize, targetReg, op2Reg, opt); @@ -1124,18 +1123,12 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { if (HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id)) { - assert((targetReg == op1Reg) || (targetReg != op1Reg)); - assert((targetReg == op1Reg) || (targetReg != op3Reg)); - GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op2Reg, /* canSkip */ true); GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, op1Reg, op3Reg, opt); } else { - assert((targetReg == op1Reg) || (targetReg != op2Reg)); - assert((targetReg == op1Reg) || (targetReg != op3Reg)); - GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op1Reg, /* canSkip */ true); GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, op2Reg, op3Reg, opt); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index a74bb3651c88f9..6b96564af8668b 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -4050,47 +4050,49 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode) GenTree* nestedOp1 = nestedCndSel->Op(1); GenTree* nestedOp2 = nestedCndSel->Op(2); assert(varTypeIsMask(nestedOp1)); - assert(nestedOp2->OperIsHWIntrinsic()); - NamedIntrinsic nestedOp2Id = nestedOp2->AsHWIntrinsic()->GetHWIntrinsicId(); - - // If the nested op uses Pg/Z, then inactive lanes will result in zeros, so can only transform if - // op3 is all zeros. Such a Csel operation is absorbed into the instruction when emitted. Skip this optimisation - // when the nestedOp is a reduce operation. 
- - if (nestedOp1->IsMaskAllBitsSet() && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) && - (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsVectorZero())) + if (nestedOp2->OperIsHWIntrinsic()) { - GenTree* nestedOp2 = nestedCndSel->Op(2); - GenTree* nestedOp3 = nestedCndSel->Op(3); + NamedIntrinsic nestedOp2Id = nestedOp2->AsHWIntrinsic()->GetHWIntrinsicId(); - JITDUMP("lowering nested ConditionalSelect HWIntrinisic (before):\n"); - DISPTREERANGE(BlockRange(), cndSelNode); - JITDUMP("\n"); + // If the nested op uses Pg/Z, then inactive lanes will result in zeros, so can only transform if + // op3 is all zeros. Such a Csel operation is absorbed into the instruction when emitted. Skip this + // optimisation when the nestedOp is a reduce operation. - // Transform: - // - // CndSel(mask, CndSel(AllTrue, embeddedMask(trueValOp2), trueValOp3), op3) to - // CndSel(mask, embedded(trueValOp2), op3) - // - cndSelNode->Op(2) = nestedCndSel->Op(2); - if (nestedOp3->IsMaskZero()) - { - BlockRange().Remove(nestedOp3); - } - else + if (nestedOp1->IsMaskAllBitsSet() && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) && + (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsVectorZero())) { - nestedOp3->SetUnusedValue(); - } + GenTree* nestedOp2 = nestedCndSel->Op(2); + GenTree* nestedOp3 = nestedCndSel->Op(3); + + JITDUMP("lowering nested ConditionalSelect HWIntrinisic (before):\n"); + DISPTREERANGE(BlockRange(), cndSelNode); + JITDUMP("\n"); + + // Transform: + // + // CndSel(mask, CndSel(AllTrue, embeddedMask(trueValOp2), trueValOp3), op3) to + // CndSel(mask, embedded(trueValOp2), op3) + // + cndSelNode->Op(2) = nestedCndSel->Op(2); + if (nestedOp3->IsMaskZero()) + { + BlockRange().Remove(nestedOp3); + } + else + { + nestedOp3->SetUnusedValue(); + } - BlockRange().Remove(nestedOp1); - BlockRange().Remove(nestedCndSel); + BlockRange().Remove(nestedOp1); + BlockRange().Remove(nestedCndSel); - JITDUMP("lowering nested ConditionalSelect HWIntrinisic (after):\n"); - DISPTREERANGE(BlockRange(), cndSelNode); - JITDUMP("\n"); + JITDUMP("lowering nested ConditionalSelect HWIntrinisic (after):\n"); + DISPTREERANGE(BlockRange(), cndSelNode); + JITDUMP("\n"); - return cndSelNode; + return cndSelNode; + } } } else if (op1->IsMaskAllBitsSet()) diff --git a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveBinaryOpTestTemplate.template b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveBinaryOpTestTemplate.template index 3ffe8ca273b954..06133b661e58d3 100644 --- a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveBinaryOpTestTemplate.template +++ b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveBinaryOpTestTemplate.template @@ -43,9 +43,15 @@ namespace JIT.HardwareIntrinsics.Arm // Validates passing a local works, using Unsafe.Read test.RunLclVarScenario_UnsafeRead(); + // Validates using the same local multiple times works, using Unsafe.Read + test.RunSameLclVarScenario_UnsafeRead(); + // Validates passing an instance member of a class works test.RunClassFldScenario(); + // Validates using the same instance member of a class multiple times works + test.RunSameClassFldScenario(); + // Validates passing the field of a local struct works test.RunStructLclFldScenario(); @@ -250,6 +256,19 @@ namespace JIT.HardwareIntrinsics.Arm ValidateResult(op1, op2, _dataTable.outArrayPtr); } + public void RunSameLclVarScenario_UnsafeRead() + { + TestLibrary.TestFramework.BeginScenario(nameof(RunSameLclVarScenario_UnsafeRead)); + + var op = 
Unsafe.Read<{Op1VectorType}<{Op1BaseType}>>(_dataTable.inArray1Ptr); + var op1 = op; + var op2 = op; + var result = {Isa}.{Method}(op1, op2); + + Unsafe.Write(_dataTable.outArrayPtr, result); + ValidateResult(op1, op2, _dataTable.outArrayPtr); + } + public void RunClassFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario)); @@ -260,6 +279,16 @@ namespace JIT.HardwareIntrinsics.Arm ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr); } + public void RunSameClassFldScenario() + { + TestLibrary.TestFramework.BeginScenario(nameof(RunSameClassFldScenario)); + + var result = {Isa}.{Method}(_fld1, _fld1); + + Unsafe.Write(_dataTable.outArrayPtr, result); + ValidateResult(_fld1, _fld1, _dataTable.outArrayPtr); + } + public void RunStructLclFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario)); diff --git a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveTernOpTestTemplate.template b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveTernOpTestTemplate.template index 4c3334cb1475c8..13343926763ced 100644 --- a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveTernOpTestTemplate.template +++ b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveTernOpTestTemplate.template @@ -43,9 +43,15 @@ namespace JIT.HardwareIntrinsics.Arm // Validates passing a local works, using Unsafe.Read test.RunLclVarScenario_UnsafeRead(); + // Validates using the same local multiple times works, using Unsafe.Read + test.RunSameLclVarScenario_UnsafeRead(); + // Validates passing an instance member of a class works test.RunClassFldScenario(); + // Validates using the same instance member of a class multiple times works + test.RunSameClassFldScenario(); + // Validates passing the field of a local struct works test.RunStructLclFldScenario(); @@ -277,6 +283,20 @@ namespace JIT.HardwareIntrinsics.Arm ValidateResult(op1, op2, op3, _dataTable.outArrayPtr); } + public void RunSameLclVarScenario_UnsafeRead() + { + TestLibrary.TestFramework.BeginScenario(nameof(RunSameLclVarScenario_UnsafeRead)); + + var op = Unsafe.Read<{Op1VectorType}<{Op1BaseType}>>(_dataTable.inArray1Ptr); + var op1 = op; + var op2 = op; + var op3 = op; + var result = {Isa}.{Method}(op1, op2, op3); + + Unsafe.Write(_dataTable.outArrayPtr, result); + ValidateResult(op1, op2, op3, _dataTable.outArrayPtr); + } + public void RunClassFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario)); @@ -287,6 +307,16 @@ namespace JIT.HardwareIntrinsics.Arm ValidateResult(_fld1, _fld2, _fld3, _dataTable.outArrayPtr); } + public void RunSameClassFldScenario() + { + TestLibrary.TestFramework.BeginScenario(nameof(RunSameClassFldScenario)); + + var result = {Isa}.{Method}(_fld1, _fld1, _fld1); + + Unsafe.Write(_dataTable.outArrayPtr, result); + ValidateResult(_fld1, _fld1, _fld1, _dataTable.outArrayPtr); + } + public void RunStructLclFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario)); diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.cs b/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.cs new file mode 100644 index 00000000000000..006d14a1dde9e9 --- /dev/null +++ b/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.cs @@ -0,0 +1,63 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
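// Note on the shape below: the nested ConditionalSelect is built entirely
// from constant vectors, so by the time lowering sees the outer select the
// inner op2 need not be a GT_HWINTRINSIC node at all (it can already have
// been folded to a constant). This appears to be exactly the case that the
// OperIsHWIntrinsic() guard added in lowerarmarch.cpp above now tolerates.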
+ +using Xunit; +using System.Runtime.CompilerServices; + +// Generated by Fuzzlyn v2.3 on 2024-08-23 10:25:51 +// Run on Arm64 Windows +// Seed: 13938901376337307772-vectort,vector64,vector128,armsve +// Reduced from 210.5 KiB to 1.1 KiB in 00:02:19 +// Hits JIT assert in Release: +// Assertion failed 'nestedOp2->OperIsHWIntrinsic()' in 'Program:Main(Fuzzlyn.ExecutionServer.IRuntime)' during 'Lowering nodeinfo' (IL size 119; hash 0xade6b36b; FullOpts) +// +// File: C:\dev\dotnet\runtime2\src\coreclr\jit\lowerarmarch.cpp Line: 4062 +// +using System; +using System.Numerics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; + +public struct S0 +{ + public ulong F5; +} + +public class C0 +{ + public int F1; +} + +public class Runtime_1068867 +{ + public static S0 s_7; + public static byte s_14; + + [Fact] + public static void TestEntryPoint() + { + var vr12 = new C0(); + var vr14 = vr12.F1; + var vr15 = Vector128.CreateScalar(vr14).AsVector(); + var vr16 = Vector128.CreateScalar(0).AsVector(); + var vr17 = Vector128.CreateScalar(0).AsVector(); + var vr18 = Vector128.CreateScalar(0).AsVector(); + var vr19 = Vector128.CreateScalar(1).AsVector(); + var vr20 = Sve.ConditionalSelect(vr17, vr18, vr19); + var vr21 = Vector128.CreateScalar(0).AsVector(); + var vr22 = Sve.ConditionalSelect(vr16, vr20, vr21); + // var vr23 = (uint)Sve.GetActiveElementCount(vr15, vr22); + // M17(s_7, vr20 + Consume(vr22); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Consume(T val) + { + } + // public static byte M17(S0 argThis, uint arg0) + // { + // var vr0 = argThis.F5; + // return (byte)(4294967295U | (sbyte)Sve.SaturatingDecrementByActiveElementCount(vr0, Vector.Create(s_14))); + // } +} diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.csproj b/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.csproj new file mode 100644 index 00000000000000..1352ebe3277bc7 --- /dev/null +++ b/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.csproj @@ -0,0 +1,9 @@ + + + True + $(NoWarn),SYSLIB5003 + + + + + From 861831d310e38f4d8029551cdca3f6539c7e92fb Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Fri, 6 Sep 2024 10:40:28 +0100 Subject: [PATCH 14/31] Fix formatting --- src/coreclr/jit/lsra.h | 10 +++++----- src/coreclr/jit/lsraarm64.cpp | 29 ++++++++++++++--------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 1476a29189b5ea..339d3fa2eaf0b2 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1444,11 +1444,11 @@ class LinearScan : public LinearScanInterface return nextConsecutiveRefPositionMap; } FORCEINLINE RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition); - SingleTypeRegSet getOperandCandidates(GenTreeHWIntrinsic* intrinsicTree, HWIntrinsic intrin, size_t opNum); - GenTree* getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW); - GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); - GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool *destIsConsecutive); - bool buildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin); + SingleTypeRegSet getOperandCandidates(GenTreeHWIntrinsic* intrinsicTree, HWIntrinsic intrin, size_t opNum); + GenTree* getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW); + GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); + GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool* 
destIsConsecutive); + bool buildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin); #endif #ifdef DEBUG diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index b9bdbf38b098b4..d98063ef80692d 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1410,8 +1410,8 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins size_t embNumArgs = embeddedOpNode->GetOperandCount(); // Determine whether this the embedded operation requires delay free - bool embeddedIsRMW = false; - GenTree* embeddedDelayFreeOp = getDelayFreeOp(embeddedOpNode, &embeddedIsRMW); + bool embeddedIsRMW = false; + GenTree* embeddedDelayFreeOp = getDelayFreeOp(embeddedOpNode, &embeddedIsRMW); // Handle Op1 @@ -1451,7 +1451,8 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins user = use.User(); } unsigned resultOpNum = - embeddedOpNode->GetResultOpNumForRmwIntrinsic(user, intrinEmbedded.op1, intrinEmbedded.op2, intrinEmbedded.op3); + embeddedOpNode->GetResultOpNumForRmwIntrinsic(user, intrinEmbedded.op1, intrinEmbedded.op2, + intrinEmbedded.op3); if (resultOpNum != 0) { @@ -1548,7 +1549,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou int srcCount = 0; - // ConditionalSelect with embedded masked operations require special handling if ((intrin.id == NI_Sve_ConditionalSelect) && (intrin.op2->IsEmbMaskOp())) { @@ -1564,8 +1564,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou GenTree* addrOp = LinearScan::getVectorAddrOperand(intrinsicTree); // Determine whether this is an operation where one of the ops has consecutive registers - bool destIsConsecutive = false; - GenTree* consecutiveOp = getConsecutiveRegistersOperand(intrinsicTree, &destIsConsecutive); + bool destIsConsecutive = false; + GenTree* consecutiveOp = getConsecutiveRegistersOperand(intrinsicTree, &destIsConsecutive); // Build any immediates bool hasImmediateOperand = buildHWIntrinsicImmediate(intrinsicTree, intrin); @@ -1602,7 +1602,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou else { RefPosition* delayUse = BuildUse(operand, candidates); - srcCount+=1; + srcCount += 1; if (opNum == 1) { @@ -1631,7 +1631,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou buildInternalRegisterUses(); - // Build Destination int dstCount = 0; @@ -1976,7 +1975,7 @@ SingleTypeRegSet LinearScan::getOperandCandidates(GenTreeHWIntrinsic* intrinsicT if (baseElementSize == 8) { - opCandidates= RBM_SVE_INDEXED_D_ELEMENT_ALLOWED_REGS.GetFloatRegSet(); + opCandidates = RBM_SVE_INDEXED_D_ELEMENT_ALLOWED_REGS.GetFloatRegSet(); } else { @@ -2038,7 +2037,7 @@ SingleTypeRegSet LinearScan::getOperandCandidates(GenTreeHWIntrinsic* intrinsicT // GenTree* LinearScan::getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW) { - *isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); + *isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); const NamedIntrinsic intrinsicId = intrinsicTree->GetHWIntrinsicId(); GenTree* delayFreeOp = nullptr; @@ -2086,7 +2085,7 @@ GenTree* LinearScan::getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isR case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: assert(*isRMW); - delayFreeOp = intrinsicTree->Op(1); + delayFreeOp = intrinsicTree->Op(1); assert(delayFreeOp != nullptr); break; @@ -2142,7 +2141,7 @@ GenTree* 
LinearScan::getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree) } // Operands that are not loads or stores but do require an address - switch(intrinsicTree->GetHWIntrinsicId()) + switch (intrinsicTree->GetHWIntrinsicId()) { case NI_Sve_PrefetchBytes: case NI_Sve_PrefetchInt16: @@ -2156,7 +2155,7 @@ GenTree* LinearScan::getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree) { return intrinsicTree->Op(2); } - break; + break; default: break; @@ -2175,9 +2174,9 @@ GenTree* LinearScan::getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree) // Return Value: // The operand that requires consecutive registers // -GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool *destIsConsecutive) +GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool* destIsConsecutive) { - *destIsConsecutive = false; + *destIsConsecutive = false; GenTree* consecutiveOp = nullptr; if (!HWIntrinsicInfo::NeedsConsecutiveRegisters(intrin.id)) From 686749e4c41dbdba28ddc97eb42127525469137c Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 9 Sep 2024 11:03:38 +0100 Subject: [PATCH 15/31] Use BuildHWIntrinsicImmediate for conditional select --- src/coreclr/jit/lsra.h | 2 +- src/coreclr/jit/lsraarm64.cpp | 67 +++++++++++------------------------ 2 files changed, 22 insertions(+), 47 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 339d3fa2eaf0b2..87b409ef74616c 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1448,7 +1448,6 @@ class LinearScan : public LinearScanInterface GenTree* getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW); GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool* destIsConsecutive); - bool buildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin); #endif #ifdef DEBUG @@ -2072,6 +2071,7 @@ class LinearScan : public LinearScanInterface int BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin, int* pDstCount); + void BuildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin); #endif // TARGET_ARM64 #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index d98063ef80692d..ea0c38a2ce3055 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1413,7 +1413,10 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins bool embeddedIsRMW = false; GenTree* embeddedDelayFreeOp = getDelayFreeOp(embeddedOpNode, &embeddedIsRMW); - // Handle Op1 + // Build any immediates + BuildHWIntrinsicImmediate(embeddedOpNode, intrinEmbedded); + + // Build Op1 SingleTypeRegSet predMask = RBM_ALLMASK.GetPredicateRegSet(); if (HWIntrinsicInfo::IsLowMaskedOperation(intrinEmbedded.id)) @@ -1430,7 +1433,7 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins srcCount += BuildOperandUses(intrin.op1, predMask); } - // Handle Op2 and Op3 + // Build Op2 and Op3 if (embeddedIsRMW) { @@ -1459,39 +1462,6 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins embeddedDelayFreeOp = embeddedOpNode->Op(resultOpNum); } } - else if (HWIntrinsicInfo::HasImmediateOperand(intrinEmbedded.id)) - { - // Special handling for embedded intrinsics with immediates: - // We might need an additional register to hold branch targets into the switch table - // that encodes the immediate - bool 
needsInternalRegister; - switch (intrinEmbedded.id) - { - case NI_Sve_ShiftRightArithmeticForDivide: - assert(embNumArgs == 2); - needsInternalRegister = !embeddedOpNode->Op(2)->isContainedIntOrIImmed(); - break; - - case NI_Sve_AddRotateComplex: - assert(embNumArgs == 3); - needsInternalRegister = !embeddedOpNode->Op(3)->isContainedIntOrIImmed(); - break; - - case NI_Sve_MultiplyAddRotateComplex: - assert(embNumArgs == 4); - needsInternalRegister = !embeddedOpNode->Op(4)->isContainedIntOrIImmed(); - break; - - default: - needsInternalRegister = false; - break; - } - - if (needsInternalRegister) - { - buildInternalIntRegisterDefForNode(embeddedOpNode); - } - } for (size_t argNum = 1; argNum <= embNumArgs; argNum++) { @@ -1568,7 +1538,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou GenTree* consecutiveOp = getConsecutiveRegistersOperand(intrinsicTree, &destIsConsecutive); // Build any immediates - bool hasImmediateOperand = buildHWIntrinsicImmediate(intrinsicTree, intrin); + BuildHWIntrinsicImmediate(intrinsicTree, intrin); // Build all Operands for (int opNum = 1; opNum <= intrin.numOperands; opNum++) @@ -2261,20 +2231,15 @@ GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bo } //------------------------------------------------------------------------ -// buildHWIntrinsicImmediate: Build immediate values +// BuildHWIntrinsicImmediate: Build immediate values // // Arguments: // intrinsicTree - Tree to check // intrin - Intrin to check // -// Return Value: -// True if the intrinsic contains an immediate -// -bool LinearScan::buildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin) +void LinearScan::BuildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin) { - const bool hasImmediateOperand = HWIntrinsicInfo::HasImmediateOperand(intrin.id); - - if (hasImmediateOperand && !HWIntrinsicInfo::NoJmpTableImm(intrin.id)) + if (HWIntrinsicInfo::HasImmediateOperand(intrin.id) && !HWIntrinsicInfo::NoJmpTableImm(intrin.id)) { // We may need to allocate an additional general-purpose register when an intrinsic has a non-const immediate // operand and the intrinsic does not have an alternative non-const fallback form. 
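That comment is the heart of BuildHWIntrinsicImmediate: when the immediate is only known at run time, codegen must expand the immediate-form instruction into a dispatch over every legal immediate value, and the computed branch target occupies a scratch general-purpose register. The idea in standalone form (a sketch with invented names, not the JIT's emitter API):

#include <cstdint>

// The hardware form encodes only compile-time-constant immediates, modelled
// here by the constant shift amounts. With a run-time immediate the emitted
// code is morally this switch: one case per legal value, dispatched through
// a jump table whose target address lives in the internal register that
// buildInternalIntRegisterDefForNode reserves.
int64_t shift_right_dynamic(int64_t value, unsigned imm)
{
    switch (imm)
    {
        case 1:  return value >> 1;
        case 2:  return value >> 2;
        case 3:  return value >> 3;
        // ... one case per value in [immLowerBound, immUpperBound]
        default: return value >> imm; // stand-in for the remaining cases
    }
}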
@@ -2446,6 +2411,18 @@ bool LinearScan::buildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, co
                     setInternalRegsDelayFree = true;
                     break;

+                case NI_Sve_ShiftRightArithmeticForDivide:
+                    needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed();
+                    break;
+
+                case NI_Sve_AddRotateComplex:
+                    needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed();
+                    break;
+
+                case NI_Sve_MultiplyAddRotateComplex:
+                    needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed();
+                    break;
+
                 default:
                     unreached();
             }
@@ -2457,8 +2434,6 @@ bool LinearScan::buildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, co
             buildInternalIntRegisterDefForNode(intrinsicTree);
         }
     }
-
-    return hasImmediateOperand;
 }

 #endif // FEATURE_HW_INTRINSICS

From 1f8ed584eba94fd48bd238bdf9a55483f613cc13 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Mon, 9 Sep 2024 11:47:20 +0100
Subject: [PATCH 16/31] Remove IsRMW

---
 src/coreclr/jit/lsra.h | 2 +-
 src/coreclr/jit/lsraarm64.cpp | 21 +++++++++------------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h
index 87b409ef74616c..8a1cc4c56e4991 100644
--- a/src/coreclr/jit/lsra.h
+++ b/src/coreclr/jit/lsra.h
@@ -1445,7 +1445,7 @@ class LinearScan : public LinearScanInterface
     }
     FORCEINLINE RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition);
     SingleTypeRegSet getOperandCandidates(GenTreeHWIntrinsic* intrinsicTree, HWIntrinsic intrin, size_t opNum);
-    GenTree*         getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW);
+    GenTree*         getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree);
     GenTree*         getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree);
     GenTree*         getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool* destIsConsecutive);
 #endif
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index ea0c38a2ce3055..b22fe4676e62b3 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -1410,8 +1410,7 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins
     size_t embNumArgs = embeddedOpNode->GetOperandCount();

     // Determine whether the embedded operation requires delay free
-    bool     embeddedIsRMW       = false;
-    GenTree* embeddedDelayFreeOp = getDelayFreeOp(embeddedOpNode, &embeddedIsRMW);
+    GenTree* embeddedDelayFreeOp = getDelayFreeOp(embeddedOpNode);

     // Build any immediates
     BuildHWIntrinsicImmediate(embeddedOpNode, intrinEmbedded);
@@ -1435,7 +1434,7 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins

     // Build Op2 and Op3

-    if (embeddedIsRMW)
+    if (embeddedDelayFreeOp != nullptr)
     {
         // If the embedded operation has RMW semantics then record delay-free for the "merge" value

@@ -1527,8 +1526,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou

     // Determine whether this is an operation where an op must be marked delayFree so that it
     // is not allocated the same register as the target.
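The hazard this guards against is easiest to see in the two-instruction RMW sequence codegen emits (compare the asserts restored in hwintrinsiccodegenarm64.cpp later in the series); the following is an illustrative pseudo-sequence, not the exact emitted code:

//     mov  vTgt, vOp1         ; copy the RMW source into the destination
//     <op> vTgt, vTgt, vOp2   ; the operation then overwrites vTgt in place
//
// If the allocator had placed op2 in vTgt, the mov would clobber op2 before
// the second instruction reads it. Marking op2 delay-free keeps its register
// live across the definition of the target, so either the target shares
// op1's register (harmless) or op2 is given a different one.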
- bool isRMW = false; - GenTree* delayFreeOp = getDelayFreeOp(intrinsicTree, &isRMW); + GenTree* delayFreeOp = getDelayFreeOp(intrinsicTree); // Determine whether this is an operation where one of the ops is an address GenTree* addrOp = LinearScan::getVectorAddrOperand(intrinsicTree); @@ -1589,7 +1587,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } } } - else if (isRMW) + else if (delayFreeOp != nullptr) { srcCount += BuildDelayFreeUses(operand, delayFreeOp, candidates); } @@ -2000,14 +1998,13 @@ SingleTypeRegSet LinearScan::getOperandCandidates(GenTreeHWIntrinsic* intrinsicT // // Arguments: // intrinsicTree - Tree to check -// isRMW (out) - Set to true if is a RMW node // // Return Value: // The operand that needs to be delay freed // -GenTree* LinearScan::getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isRMW) +GenTree* LinearScan::getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree) { - *isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); + bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); const NamedIntrinsic intrinsicId = intrinsicTree->GetHWIntrinsicId(); GenTree* delayFreeOp = nullptr; @@ -2054,20 +2051,20 @@ GenTree* LinearScan::getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree, bool* isR case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: - assert(*isRMW); + assert(isRMW); delayFreeOp = intrinsicTree->Op(1); assert(delayFreeOp != nullptr); break; case NI_Sve_CreateBreakPropagateMask: // RMW operates on the second op. - assert(*isRMW); + assert(isRMW); delayFreeOp = intrinsicTree->Op(2); assert(delayFreeOp != nullptr); break; default: - if (*isRMW) + if (isRMW) { if (HWIntrinsicInfo::IsExplicitMaskedOperation(intrinsicId)) { From 2041f6fc27a1f6f439cd07338eaaf2221f8232f0 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Fri, 6 Sep 2024 16:33:03 +0100 Subject: [PATCH 17/31] Replace BuildConditionalSelectWithEmbeddedOp() with BuildEmbeddedOperandUses() --- src/coreclr/jit/lsra.h | 5 +- src/coreclr/jit/lsraarm64.cpp | 114 ++++++++++++++-------------------- 2 files changed, 50 insertions(+), 69 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 8a1cc4c56e4991..ae80718b33bf8f 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1448,6 +1448,7 @@ class LinearScan : public LinearScanInterface GenTree* getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree); GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool* destIsConsecutive); + GenTreeHWIntrinsic* getEmbeddedMaskOperand(const HWIntrinsic intrin); #endif #ifdef DEBUG @@ -2068,10 +2069,8 @@ class LinearScan : public LinearScanInterface #ifdef TARGET_ARM64 int BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwNode = nullptr); void BuildConsecutiveRegistersForDef(GenTree* treeNode, int fieldCount); - int BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrinsicTree, - const HWIntrinsic intrin, - int* pDstCount); void BuildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin); + int BuildEmbeddedOperandUses(GenTreeHWIntrinsic* embeddedOpNode, GenTree* embeddedDelayFreeOp); #endif // TARGET_ARM64 #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index b22fe4676e62b3..92de332c0a41bf 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp 
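The rewrite below leans on the invariant stated in the function header that follows: a wrapped embedded op and its ConditionalSelect are emitted as a single predicated instruction, at most preceded by a MOVPRFX that establishes the inactive-lane value. For the common zero-merge case the sequence is roughly (illustrative register choices, not emitter output):

//   ConditionalSelect(Pg, Add(x, y), zero)  ~>
//     movprfx z0.s, p0/z, z1.s        ; z0 = x in active lanes, 0 elsewhere
//     add     z0.s, p0/m, z0.s, z2.s  ; active lanes become x + y
//
// The destination z0 is defined by the movprfx before the add has read z2,
// which is why the non-preferenced sources of the embedded op are built
// delay-free.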
@@ -1370,77 +1370,37 @@ int LinearScan::BuildNode(GenTree* tree) #include "hwintrinsic.h" //------------------------------------------------------------------------ -// BuildHWIntrinsic: Set the NodeInfo for a GT_HWINTRINSIC ConditionalSelect tree with an embedded op. +// BuildEmbeddedOperandUses: Build the uses for an Embedded Mask operand // // These need special handling as the ConditionalSelect and embedded op will be generated as a single instruction, // and possibly prefixed with a MOVPRFX // // // Arguments: -// tree - The GT_HWINTRINSIC node of interest -// intrin - The GT_HWINTRINSIC node as a HWIntrinsic -// pDstCount - OUT parameter - the number of registers defined for the given node +// embeddedOpNode - The embdedded node of interest +// intrin - The embedded node as a HWIntrinsic // // Return Value: // The number of sources consumed by this node. // -int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrinsicTree, - const HWIntrinsic intrin, - int* pDstCount) +int LinearScan::BuildEmbeddedOperandUses(GenTreeHWIntrinsic* embeddedOpNode, GenTree* embeddedDelayFreeOp) { - assert(intrin.id == NI_Sve_ConditionalSelect); - assert(HWIntrinsicInfo::IsMaskedOperation(intrin.id)); - assert(HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id)); - assert(!HWIntrinsicInfo::IsLowMaskedOperation(intrin.id)); - assert(!HWIntrinsicInfo::IsLowVectorOperation(intrin.id)); - assert(!HWIntrinsicInfo::IsMultiReg(intrin.id)); - assert(intrin.op1 != nullptr); - assert(varTypeIsMask(intrin.op1->TypeGet())); - assert(intrin.op2 != nullptr); - assert(intrin.op2->OperIs(GT_HWINTRINSIC)); - assert(intrin.op2->IsEmbMaskOp()); - assert(!intrin.op2->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask)); - assert(intrin.op3 != nullptr); - assert(intrin.op4 == nullptr); - - int srcCount = 0; - - GenTreeHWIntrinsic* embeddedOpNode = intrin.op2->AsHWIntrinsic(); - const HWIntrinsic intrinEmbedded(embeddedOpNode); - size_t embNumArgs = embeddedOpNode->GetOperandCount(); + assert(embeddedOpNode->IsEmbMaskOp()); + assert(!embeddedOpNode->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask)); - // Determine whether this the embedded operation requires delay free - GenTree* embeddedDelayFreeOp = getDelayFreeOp(embeddedOpNode); + int srcCount = 0; + const HWIntrinsic intrinEmbedded(embeddedOpNode); // Build any immediates BuildHWIntrinsicImmediate(embeddedOpNode, intrinEmbedded); - // Build Op1 - - SingleTypeRegSet predMask = RBM_ALLMASK.GetPredicateRegSet(); - if (HWIntrinsicInfo::IsLowMaskedOperation(intrinEmbedded.id)) - { - predMask = RBM_LOWMASK.GetPredicateRegSet(); - } - - if (embeddedDelayFreeOp != nullptr) - { - srcCount += BuildDelayFreeUses(intrin.op1, nullptr, predMask); - } - else - { - srcCount += BuildOperandUses(intrin.op1, predMask); - } - - // Build Op2 and Op3 - if (embeddedDelayFreeOp != nullptr) { // If the embedded operation has RMW semantics then record delay-free for the "merge" value if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmbedded.id)) { - assert(embNumArgs == 3); + assert(intrinEmbedded.numOperands == 3); // For FMA intrinsics, the arguments may get switched around during codegen. // Figure out where the delay free op will be. 
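The extra work for FMA exists because SVE offers destructive encodings that overwrite different operands, and codegen chooses between them based on which operand the result may clobber. Illustrative forms (the actual selection lives in codegen, and the register names are invented):

//   FusedMultiplyAdd(a, b, c), i.e. a + b * c, has two destructive forms:
//     fmla z_a, p/m, z_b, z_c   ; overwrites the addend a
//     fmad z_b, p/m, z_c, z_a   ; overwrites the multiplicand b
//
// GetResultOpNumForRmwIntrinsic reports which operand the chosen encoding
// will overwrite; that operand is preferenced to the target (tgtPrefUse)
// and the remaining operands are built delay-free so they survive the
// overwrite.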
@@ -1462,7 +1422,7 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins } } - for (size_t argNum = 1; argNum <= embNumArgs; argNum++) + for (size_t argNum = 1; argNum <= intrinEmbedded.numOperands; argNum++) { GenTree* node = embeddedOpNode->Op(argNum); @@ -1483,20 +1443,12 @@ int LinearScan::BuildConditionalSelectWithEmbeddedOp(GenTreeHWIntrinsic* intrins assert((useRefPosition != nullptr && useRefPosition->delayRegFree) || (uses == 0)); } } - - srcCount += BuildDelayFreeUses(intrin.op3, embeddedDelayFreeOp); } else { - srcCount += BuildOperandUses(intrin.op2); - srcCount += BuildOperandUses(intrin.op3); + srcCount += BuildOperandUses(embeddedOpNode); } - buildInternalRegisterUses(); - - BuildDef(intrinsicTree); - - *pDstCount = 1; return srcCount; } @@ -1518,12 +1470,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou int srcCount = 0; - // ConditionalSelect with embedded masked operations require special handling - if ((intrin.id == NI_Sve_ConditionalSelect) && (intrin.op2->IsEmbMaskOp())) - { - return BuildConditionalSelectWithEmbeddedOp(intrinsicTree, intrin, pDstCount); - } - // Determine whether this is an operation where an op must be marked delayFree so that it // is not allocated the same register as the target. GenTree* delayFreeOp = getDelayFreeOp(intrinsicTree); @@ -1535,6 +1481,16 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou bool destIsConsecutive = false; GenTree* consecutiveOp = getConsecutiveRegistersOperand(intrinsicTree, &destIsConsecutive); + // Determine whether this is an operation where one of the ops is an embedded mask + GenTreeHWIntrinsic* embeddedOp = getEmbeddedMaskOperand(intrin); + + // If there is an embedded op, then determine if that has an op that must be marked delayFree + if (embeddedOp != nullptr) + { + assert(delayFreeOp == nullptr); + delayFreeOp = getDelayFreeOp(embeddedOp); + } + // Build any immediates BuildHWIntrinsicImmediate(intrinsicTree, intrin); @@ -1547,10 +1503,18 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou SingleTypeRegSet candidates = getOperandCandidates(intrinsicTree, intrin, opNum); - if (addrOp == operand) + if (embeddedOp == operand) { + assert(addrOp != operand); + assert(consecutiveOp != operand); assert(delayFreeOp != operand); + + srcCount += BuildEmbeddedOperandUses(embeddedOp, delayFreeOp); + } + else if (addrOp == operand) + { assert(consecutiveOp != operand); + assert(delayFreeOp != operand); srcCount += BuildAddrUses(operand, candidates); } @@ -2227,6 +2191,24 @@ GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bo return consecutiveOp; } +//------------------------------------------------------------------------ +// +// Arguments: +// intrinsicTree - Tree to check +// +// Return Value: +// The operand that is an embedded mask +// +GenTreeHWIntrinsic* LinearScan::getEmbeddedMaskOperand(const HWIntrinsic intrin) +{ + if ((intrin.id == NI_Sve_ConditionalSelect) && (intrin.op2->IsEmbMaskOp())) + { + assert(intrin.op2->OperIsHWIntrinsic()); + return intrin.op2->AsHWIntrinsic(); + } + return nullptr; +} + //------------------------------------------------------------------------ // BuildHWIntrinsicImmediate: Build immediate values // From c27b446c10faf02cac1a03799865a501bd464082 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 9 Sep 2024 13:34:32 +0100 Subject: [PATCH 18/31] Revert "Fixes from other PRs to be removed" --- 
src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 7 ++ src/coreclr/jit/lowerarmarch.cpp | 68 +++++++++---------- .../Shared/_SveBinaryOpTestTemplate.template | 29 -------- .../Shared/_SveTernOpTestTemplate.template | 30 -------- .../JitBlue/Runtime_106869/Runtime_106869.cs | 63 ----------------- .../Runtime_106869/Runtime_106869.csproj | 9 --- 6 files changed, 40 insertions(+), 166 deletions(-) delete mode 100644 src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.cs delete mode 100644 src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.csproj diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index aa2f0845507724..bd5027cd4e02d5 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -1106,6 +1106,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) } else if (isRMW) { + assert((targetReg == op1Reg) || (targetReg != op2Reg)); GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op1Reg, /* canSkip */ true); GetEmitter()->emitIns_R_R(ins, emitSize, targetReg, op2Reg, opt); @@ -1123,12 +1124,18 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { if (HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id)) { + assert((targetReg == op1Reg) || (targetReg != op1Reg)); + assert((targetReg == op1Reg) || (targetReg != op3Reg)); + GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op2Reg, /* canSkip */ true); GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, op1Reg, op3Reg, opt); } else { + assert((targetReg == op1Reg) || (targetReg != op2Reg)); + assert((targetReg == op1Reg) || (targetReg != op3Reg)); + GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op1Reg, /* canSkip */ true); GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, op2Reg, op3Reg, opt); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 6b96564af8668b..a74bb3651c88f9 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -4050,49 +4050,47 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode) GenTree* nestedOp1 = nestedCndSel->Op(1); GenTree* nestedOp2 = nestedCndSel->Op(2); assert(varTypeIsMask(nestedOp1)); + assert(nestedOp2->OperIsHWIntrinsic()); - if (nestedOp2->OperIsHWIntrinsic()) + NamedIntrinsic nestedOp2Id = nestedOp2->AsHWIntrinsic()->GetHWIntrinsicId(); + + // If the nested op uses Pg/Z, then inactive lanes will result in zeros, so can only transform if + // op3 is all zeros. Such a Csel operation is absorbed into the instruction when emitted. Skip this optimisation + // when the nestedOp is a reduce operation. + + if (nestedOp1->IsMaskAllBitsSet() && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) && + (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsVectorZero())) { - NamedIntrinsic nestedOp2Id = nestedOp2->AsHWIntrinsic()->GetHWIntrinsicId(); + GenTree* nestedOp2 = nestedCndSel->Op(2); + GenTree* nestedOp3 = nestedCndSel->Op(3); - // If the nested op uses Pg/Z, then inactive lanes will result in zeros, so can only transform if - // op3 is all zeros. Such a Csel operation is absorbed into the instruction when emitted. Skip this - // optimisation when the nestedOp is a reduce operation. 
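The transform performed in the restored block below is easier to read as trees (a sketch of the "// Transform:" comment that follows, not new behaviour):

//   Before:                                 After:
//     CndSel(mask,                            CndSel(mask,
//            CndSel(AllTrue,                         embeddedOp(trueVal),
//                   embeddedOp(trueVal),             op3)
//                   trueValOp3),
//            op3)
//
// With an AllTrue governing mask, every lane of the inner select takes
// embeddedOp(trueVal), so the outer select can consume the embedded op
// directly; the AllTrue mask and trueValOp3 become dead and are removed or
// marked unused.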
+ JITDUMP("lowering nested ConditionalSelect HWIntrinisic (before):\n"); + DISPTREERANGE(BlockRange(), cndSelNode); + JITDUMP("\n"); - if (nestedOp1->IsMaskAllBitsSet() && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) && - (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsVectorZero())) + // Transform: + // + // CndSel(mask, CndSel(AllTrue, embeddedMask(trueValOp2), trueValOp3), op3) to + // CndSel(mask, embedded(trueValOp2), op3) + // + cndSelNode->Op(2) = nestedCndSel->Op(2); + if (nestedOp3->IsMaskZero()) { - GenTree* nestedOp2 = nestedCndSel->Op(2); - GenTree* nestedOp3 = nestedCndSel->Op(3); - - JITDUMP("lowering nested ConditionalSelect HWIntrinisic (before):\n"); - DISPTREERANGE(BlockRange(), cndSelNode); - JITDUMP("\n"); - - // Transform: - // - // CndSel(mask, CndSel(AllTrue, embeddedMask(trueValOp2), trueValOp3), op3) to - // CndSel(mask, embedded(trueValOp2), op3) - // - cndSelNode->Op(2) = nestedCndSel->Op(2); - if (nestedOp3->IsMaskZero()) - { - BlockRange().Remove(nestedOp3); - } - else - { - nestedOp3->SetUnusedValue(); - } + BlockRange().Remove(nestedOp3); + } + else + { + nestedOp3->SetUnusedValue(); + } - BlockRange().Remove(nestedOp1); - BlockRange().Remove(nestedCndSel); + BlockRange().Remove(nestedOp1); + BlockRange().Remove(nestedCndSel); - JITDUMP("lowering nested ConditionalSelect HWIntrinisic (after):\n"); - DISPTREERANGE(BlockRange(), cndSelNode); - JITDUMP("\n"); + JITDUMP("lowering nested ConditionalSelect HWIntrinisic (after):\n"); + DISPTREERANGE(BlockRange(), cndSelNode); + JITDUMP("\n"); - return cndSelNode; - } + return cndSelNode; } } else if (op1->IsMaskAllBitsSet()) diff --git a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveBinaryOpTestTemplate.template b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveBinaryOpTestTemplate.template index 06133b661e58d3..3ffe8ca273b954 100644 --- a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveBinaryOpTestTemplate.template +++ b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveBinaryOpTestTemplate.template @@ -43,15 +43,9 @@ namespace JIT.HardwareIntrinsics.Arm // Validates passing a local works, using Unsafe.Read test.RunLclVarScenario_UnsafeRead(); - // Validates using the same local multiple times works, using Unsafe.Read - test.RunSameLclVarScenario_UnsafeRead(); - // Validates passing an instance member of a class works test.RunClassFldScenario(); - // Validates using the same instance member of a class multiple times works - test.RunSameClassFldScenario(); - // Validates passing the field of a local struct works test.RunStructLclFldScenario(); @@ -256,19 +250,6 @@ namespace JIT.HardwareIntrinsics.Arm ValidateResult(op1, op2, _dataTable.outArrayPtr); } - public void RunSameLclVarScenario_UnsafeRead() - { - TestLibrary.TestFramework.BeginScenario(nameof(RunSameLclVarScenario_UnsafeRead)); - - var op = Unsafe.Read<{Op1VectorType}<{Op1BaseType}>>(_dataTable.inArray1Ptr); - var op1 = op; - var op2 = op; - var result = {Isa}.{Method}(op1, op2); - - Unsafe.Write(_dataTable.outArrayPtr, result); - ValidateResult(op1, op2, _dataTable.outArrayPtr); - } - public void RunClassFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario)); @@ -279,16 +260,6 @@ namespace JIT.HardwareIntrinsics.Arm ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr); } - public void RunSameClassFldScenario() - { - TestLibrary.TestFramework.BeginScenario(nameof(RunSameClassFldScenario)); - - var result = {Isa}.{Method}(_fld1, _fld1); - - Unsafe.Write(_dataTable.outArrayPtr, result); - 
ValidateResult(_fld1, _fld1, _dataTable.outArrayPtr); - } - public void RunStructLclFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario)); diff --git a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveTernOpTestTemplate.template b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveTernOpTestTemplate.template index 13343926763ced..4c3334cb1475c8 100644 --- a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveTernOpTestTemplate.template +++ b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveTernOpTestTemplate.template @@ -43,15 +43,9 @@ namespace JIT.HardwareIntrinsics.Arm // Validates passing a local works, using Unsafe.Read test.RunLclVarScenario_UnsafeRead(); - // Validates using the same local multiple times works, using Unsafe.Read - test.RunSameLclVarScenario_UnsafeRead(); - // Validates passing an instance member of a class works test.RunClassFldScenario(); - // Validates using the same instance member of a class multiple times works - test.RunSameClassFldScenario(); - // Validates passing the field of a local struct works test.RunStructLclFldScenario(); @@ -283,20 +277,6 @@ namespace JIT.HardwareIntrinsics.Arm ValidateResult(op1, op2, op3, _dataTable.outArrayPtr); } - public void RunSameLclVarScenario_UnsafeRead() - { - TestLibrary.TestFramework.BeginScenario(nameof(RunSameLclVarScenario_UnsafeRead)); - - var op = Unsafe.Read<{Op1VectorType}<{Op1BaseType}>>(_dataTable.inArray1Ptr); - var op1 = op; - var op2 = op; - var op3 = op; - var result = {Isa}.{Method}(op1, op2, op3); - - Unsafe.Write(_dataTable.outArrayPtr, result); - ValidateResult(op1, op2, op3, _dataTable.outArrayPtr); - } - public void RunClassFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario)); @@ -307,16 +287,6 @@ namespace JIT.HardwareIntrinsics.Arm ValidateResult(_fld1, _fld2, _fld3, _dataTable.outArrayPtr); } - public void RunSameClassFldScenario() - { - TestLibrary.TestFramework.BeginScenario(nameof(RunSameClassFldScenario)); - - var result = {Isa}.{Method}(_fld1, _fld1, _fld1); - - Unsafe.Write(_dataTable.outArrayPtr, result); - ValidateResult(_fld1, _fld1, _fld1, _dataTable.outArrayPtr); - } - public void RunStructLclFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario)); diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.cs b/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.cs deleted file mode 100644 index 006d14a1dde9e9..00000000000000 --- a/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.cs +++ /dev/null @@ -1,63 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -using Xunit; -using System.Runtime.CompilerServices; - -// Generated by Fuzzlyn v2.3 on 2024-08-23 10:25:51 -// Run on Arm64 Windows -// Seed: 13938901376337307772-vectort,vector64,vector128,armsve -// Reduced from 210.5 KiB to 1.1 KiB in 00:02:19 -// Hits JIT assert in Release: -// Assertion failed 'nestedOp2->OperIsHWIntrinsic()' in 'Program:Main(Fuzzlyn.ExecutionServer.IRuntime)' during 'Lowering nodeinfo' (IL size 119; hash 0xade6b36b; FullOpts) -// -// File: C:\dev\dotnet\runtime2\src\coreclr\jit\lowerarmarch.cpp Line: 4062 -// -using System; -using System.Numerics; -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.Arm; - -public struct S0 -{ - public ulong F5; -} - -public class C0 -{ - public int F1; -} - -public class Runtime_1068867 -{ - public static S0 s_7; - public static byte s_14; - - [Fact] - public static void TestEntryPoint() - { - var vr12 = new C0(); - var vr14 = vr12.F1; - var vr15 = Vector128.CreateScalar(vr14).AsVector(); - var vr16 = Vector128.CreateScalar(0).AsVector(); - var vr17 = Vector128.CreateScalar(0).AsVector(); - var vr18 = Vector128.CreateScalar(0).AsVector(); - var vr19 = Vector128.CreateScalar(1).AsVector(); - var vr20 = Sve.ConditionalSelect(vr17, vr18, vr19); - var vr21 = Vector128.CreateScalar(0).AsVector(); - var vr22 = Sve.ConditionalSelect(vr16, vr20, vr21); - // var vr23 = (uint)Sve.GetActiveElementCount(vr15, vr22); - // M17(s_7, vr20 - Consume(vr22); - } - - [MethodImpl(MethodImplOptions.NoInlining)] - static void Consume(T val) - { - } - // public static byte M17(S0 argThis, uint arg0) - // { - // var vr0 = argThis.F5; - // return (byte)(4294967295U | (sbyte)Sve.SaturatingDecrementByActiveElementCount(vr0, Vector.Create(s_14))); - // } -} diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.csproj b/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.csproj deleted file mode 100644 index 1352ebe3277bc7..00000000000000 --- a/src/tests/JIT/Regression/JitBlue/Runtime_106869/Runtime_106869.csproj +++ /dev/null @@ -1,9 +0,0 @@ - - - True - $(NoWarn),SYSLIB5003 - - - - - From 0d8cf9b268200c009f95e9f9424db1e938893aff Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 9 Sep 2024 13:42:16 +0100 Subject: [PATCH 19/31] Move functions --- src/coreclr/jit/lsra.h | 2 +- src/coreclr/jit/lsraarm64.cpp | 804 +++++++++++++++++----------------- 2 files changed, 403 insertions(+), 403 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index ae80718b33bf8f..f705d5253b9ecd 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1445,7 +1445,7 @@ class LinearScan : public LinearScanInterface } FORCEINLINE RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition); SingleTypeRegSet getOperandCandidates(GenTreeHWIntrinsic* intrinsicTree, HWIntrinsic intrin, size_t opNum); - GenTree* getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree); + GenTree* getDelayFreeOperand(GenTreeHWIntrinsic* intrinsicTree); GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree); GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool* destIsConsecutive); GenTreeHWIntrinsic* getEmbeddedMaskOperand(const HWIntrinsic intrin); diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 92de332c0a41bf..9e8b5a70aee0b6 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1369,89 +1369,6 @@ int LinearScan::BuildNode(GenTree* tree) #include "hwintrinsic.h" 
-//------------------------------------------------------------------------ -// BuildEmbeddedOperandUses: Build the uses for an Embedded Mask operand -// -// These need special handling as the ConditionalSelect and embedded op will be generated as a single instruction, -// and possibly prefixed with a MOVPRFX -// -// -// Arguments: -// embeddedOpNode - The embdedded node of interest -// intrin - The embedded node as a HWIntrinsic -// -// Return Value: -// The number of sources consumed by this node. -// -int LinearScan::BuildEmbeddedOperandUses(GenTreeHWIntrinsic* embeddedOpNode, GenTree* embeddedDelayFreeOp) -{ - assert(embeddedOpNode->IsEmbMaskOp()); - assert(!embeddedOpNode->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask)); - - int srcCount = 0; - const HWIntrinsic intrinEmbedded(embeddedOpNode); - - // Build any immediates - BuildHWIntrinsicImmediate(embeddedOpNode, intrinEmbedded); - - if (embeddedDelayFreeOp != nullptr) - { - // If the embedded operation has RMW semantics then record delay-free for the "merge" value - - if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmbedded.id)) - { - assert(intrinEmbedded.numOperands == 3); - - // For FMA intrinsics, the arguments may get switched around during codegen. - // Figure out where the delay free op will be. - - LIR::Use use; - GenTree* user = nullptr; - - if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embeddedOpNode, &use)) - { - user = use.User(); - } - unsigned resultOpNum = - embeddedOpNode->GetResultOpNumForRmwIntrinsic(user, intrinEmbedded.op1, intrinEmbedded.op2, - intrinEmbedded.op3); - - if (resultOpNum != 0) - { - embeddedDelayFreeOp = embeddedOpNode->Op(resultOpNum); - } - } - - for (size_t argNum = 1; argNum <= intrinEmbedded.numOperands; argNum++) - { - GenTree* node = embeddedOpNode->Op(argNum); - - if (node == embeddedDelayFreeOp) - { - tgtPrefUse = BuildUse(node); - srcCount++; - } - else - { - RefPosition* useRefPosition = nullptr; - - int uses = BuildDelayFreeUses(node, nullptr, RBM_NONE, &useRefPosition); - srcCount += uses; - - // It is a hard requirement that these are not allocated to the same register as the destination, - // so verify no optimizations kicked in to skip setting the delay-free. - assert((useRefPosition != nullptr && useRefPosition->delayRegFree) || (uses == 0)); - } - } - } - else - { - srcCount += BuildOperandUses(embeddedOpNode); - } - - return srcCount; -} - //------------------------------------------------------------------------ // BuildHWIntrinsic: Set the NodeInfo for a GT_HWINTRINSIC tree. // @@ -1472,7 +1389,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // Determine whether this is an operation where an op must be marked delayFree so that it // is not allocated the same register as the target. 
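By this point in the series BuildHWIntrinsic has settled into a flat pipeline of helper queries; in outline (a paraphrase of the surrounding code, with the per-operand details elided):

// BuildHWIntrinsic(tree, pDstCount):
//   delayFreeOp   = getDelayFreeOperand(tree)           // op that must outlive the def
//   addrOp        = getVectorAddrOperand(tree)          // op consumed as an address
//   consecutiveOp = getConsecutiveRegistersOperand(...) // ops needing consecutive regs
//   embeddedOp    = getEmbeddedMaskOperand(intrin)      // CndSel wrapping an embedded mask op
//   BuildHWIntrinsicImmediate(tree, intrin)             // internal reg for immediate jump tables
//   for each operand: build its uses with whichever special handling applies,
//   then build the internal register uses and the destination defs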
- GenTree* delayFreeOp = getDelayFreeOp(intrinsicTree); + GenTree* delayFreeOp = getDelayFreeOperand(intrinsicTree); // Determine whether this is an operation where one of the ops is an address GenTree* addrOp = LinearScan::getVectorAddrOperand(intrinsicTree); @@ -1488,7 +1405,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou if (embeddedOp != nullptr) { assert(delayFreeOp == nullptr); - delayFreeOp = getDelayFreeOp(embeddedOp); + delayFreeOp = getDelayFreeOperand(embeddedOp); } // Build any immediates @@ -1866,147 +1783,436 @@ bool RefPosition::isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegist #endif // DEBUG //------------------------------------------------------------------------ -// getOperandCandidates: Get the register candidates for a given operand number +// BuildHWIntrinsicImmediate: Build immediate values // // Arguments: // intrinsicTree - Tree to check // intrin - Intrin to check -// OpNum - The Operand number in the intrinsic tree -// -// Return Value: -// The candidates for the operand number // -SingleTypeRegSet LinearScan::getOperandCandidates(GenTreeHWIntrinsic* intrinsicTree, HWIntrinsic intrin, size_t opNum) +void LinearScan::BuildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin) { - SingleTypeRegSet opCandidates = RBM_NONE; - - if (HWIntrinsicInfo::IsLowVectorOperation(intrin.id)) + if (HWIntrinsicInfo::HasImmediateOperand(intrin.id) && !HWIntrinsicInfo::NoJmpTableImm(intrin.id)) { - assert(!varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet())); + // We may need to allocate an additional general-purpose register when an intrinsic has a non-const immediate + // operand and the intrinsic does not have an alternative non-const fallback form. + // However, for a case when the operand can take only two possible values - zero and one + // the codegen can use cbnz to do conditional branch, so such register is not needed. - bool isLowVectorOpNum = false; + bool needBranchTargetReg = false; - switch (intrin.id) - { - case NI_Sve_DotProductBySelectedScalar: - case NI_Sve_FusedMultiplyAddBySelectedScalar: - case NI_Sve_FusedMultiplySubtractBySelectedScalar: - case NI_Sve_MultiplyAddRotateComplexBySelectedScalar: - isLowVectorOpNum = (opNum == 3); - break; - case NI_Sve_MultiplyBySelectedScalar: - isLowVectorOpNum = (opNum == 2); - break; - default: - unreached(); - } + int immLowerBound = 0; + int immUpperBound = 0; - if (isLowVectorOpNum) + if (intrin.category == HW_Category_SIMDByIndexedElement) { - unsigned baseElementSize = genTypeSize(intrin.baseType); + var_types indexedElementOpType; - if (baseElementSize == 8) + if (intrin.numOperands == 2) { - opCandidates = RBM_SVE_INDEXED_D_ELEMENT_ALLOWED_REGS.GetFloatRegSet(); + indexedElementOpType = intrin.op1->TypeGet(); + } + else if (intrin.numOperands == 3) + { + indexedElementOpType = intrin.op2->TypeGet(); } else { - assert(baseElementSize == 4); - opCandidates = RBM_SVE_INDEXED_S_ELEMENT_ALLOWED_REGS.GetFloatRegSet(); + assert(intrin.numOperands == 4); + indexedElementOpType = intrin.op3->TypeGet(); } - } - } - else if ((intrin.category == HW_Category_SIMDByIndexedElement) && (genTypeSize(intrin.baseType) == 2)) - { - // Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g. - // "MLA (by element)") have encoding that restricts what registers that can be used for the indexed element when - // the element size is H (i.e. 2 bytes). 
- if ((intrin.numOperands == 4) || (intrin.numOperands == 3 && !HWIntrinsicInfo::HasImmediateOperand(intrin.id))) - { - opCandidates = (opNum == 3) ? RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet() : opCandidates; - } - else if (intrin.id == NI_Sve_DuplicateSelectedScalarToVector) - { - // Do nothing + assert(varTypeIsSIMD(indexedElementOpType)); + + const unsigned int indexedElementSimdSize = genTypeSize(indexedElementOpType); + HWIntrinsicInfo::lookupImmBounds(intrin.id, indexedElementSimdSize, intrin.baseType, 1, &immLowerBound, + &immUpperBound); } else { - opCandidates = (opNum == 2) ? RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet() : opCandidates; + HWIntrinsicInfo::lookupImmBounds(intrin.id, intrinsicTree->GetSimdSize(), intrin.baseType, 1, + &immLowerBound, &immUpperBound); } - } - else if (opNum == 1 && HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id)) - { - assert(varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet())); - opCandidates = RBM_ALLMASK.GetPredicateRegSet(); - - if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id)) + if ((immLowerBound != 0) || (immUpperBound != 1)) { - opCandidates = RBM_LOWMASK.GetPredicateRegSet(); - } - } - else if (varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet())) - { - opCandidates = RBM_ALLMASK.GetPredicateRegSet(); - } - - return opCandidates; -} - -//------------------------------------------------------------------------ -// getDelayFreeOp: Get the delay free characteristics of the HWIntrinsic -// -// For a RMW intrinsic preference the RMW operand to the target. -// For a simple move semantic between two SIMD registers, then preference the source operand. -// -// Arguments: -// intrinsicTree - Tree to check -// -// Return Value: -// The operand that needs to be delay freed -// -GenTree* LinearScan::getDelayFreeOp(GenTreeHWIntrinsic* intrinsicTree) -{ - bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); + if ((intrin.category == HW_Category_SIMDByIndexedElement) || + (intrin.category == HW_Category_ShiftLeftByImmediate) || + (intrin.category == HW_Category_ShiftRightByImmediate)) + { + switch (intrin.numOperands) + { + case 4: + needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); + break; - const NamedIntrinsic intrinsicId = intrinsicTree->GetHWIntrinsicId(); - GenTree* delayFreeOp = nullptr; + case 3: + needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); + break; - switch (intrinsicId) - { - case NI_Vector64_CreateScalarUnsafe: - case NI_Vector128_CreateScalarUnsafe: - if (varTypeIsFloating(intrinsicTree->Op(1))) - { - delayFreeOp = intrinsicTree->Op(1); - assert(delayFreeOp != nullptr); - } - break; + case 2: + needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); + break; - case NI_AdvSimd_Arm64_DuplicateToVector64: - if (intrinsicTree->Op(1)->TypeGet() == TYP_DOUBLE) - { - delayFreeOp = intrinsicTree->Op(1); - assert(delayFreeOp != nullptr); + default: + unreached(); + } } - break; - - case NI_Vector64_ToScalar: - case NI_Vector128_ToScalar: - if (varTypeIsFloating(intrinsicTree)) + else { - delayFreeOp = intrinsicTree->Op(1); - assert(delayFreeOp != nullptr); - } - break; + switch (intrin.id) + { + case NI_AdvSimd_DuplicateSelectedScalarToVector64: + case NI_AdvSimd_DuplicateSelectedScalarToVector128: + case NI_AdvSimd_Extract: + case NI_AdvSimd_Insert: + case NI_AdvSimd_InsertScalar: + case NI_AdvSimd_LoadAndInsertScalar: + case NI_AdvSimd_LoadAndInsertScalarVector64x2: + case NI_AdvSimd_LoadAndInsertScalarVector64x3: + case NI_AdvSimd_LoadAndInsertScalarVector64x4: + case 
NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: + case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128: + needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); + break; - case NI_Vector64_ToVector128Unsafe: - case NI_Vector128_AsVector128Unsafe: - case NI_Vector128_AsVector3: - case NI_Vector128_GetLower: - delayFreeOp = intrinsicTree->Op(1); - assert(delayFreeOp != nullptr); + case NI_AdvSimd_ExtractVector64: + case NI_AdvSimd_ExtractVector128: + case NI_AdvSimd_StoreSelectedScalar: + case NI_AdvSimd_Arm64_StoreSelectedScalar: + case NI_Sve_PrefetchBytes: + case NI_Sve_PrefetchInt16: + case NI_Sve_PrefetchInt32: + case NI_Sve_PrefetchInt64: + case NI_Sve_ExtractVector: + case NI_Sve_TrigonometricMultiplyAddCoefficient: + needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); + break; + + case NI_AdvSimd_Arm64_InsertSelectedScalar: + assert(intrin.op2->isContainedIntOrIImmed()); + assert(intrin.op4->isContainedIntOrIImmed()); + break; + + case NI_Sve_CreateTrueMaskByte: + case NI_Sve_CreateTrueMaskDouble: + case NI_Sve_CreateTrueMaskInt16: + case NI_Sve_CreateTrueMaskInt32: + case NI_Sve_CreateTrueMaskInt64: + case NI_Sve_CreateTrueMaskSByte: + case NI_Sve_CreateTrueMaskSingle: + case NI_Sve_CreateTrueMaskUInt16: + case NI_Sve_CreateTrueMaskUInt32: + case NI_Sve_CreateTrueMaskUInt64: + case NI_Sve_Count16BitElements: + case NI_Sve_Count32BitElements: + case NI_Sve_Count64BitElements: + case NI_Sve_Count8BitElements: + needBranchTargetReg = !intrin.op1->isContainedIntOrIImmed(); + break; + + case NI_Sve_GatherPrefetch8Bit: + case NI_Sve_GatherPrefetch16Bit: + case NI_Sve_GatherPrefetch32Bit: + case NI_Sve_GatherPrefetch64Bit: + if (!varTypeIsSIMD(intrin.op2->gtType)) + { + needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); + } + else + { + needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); + } + break; + + case NI_Sve_SaturatingDecrementBy16BitElementCount: + case NI_Sve_SaturatingDecrementBy32BitElementCount: + case NI_Sve_SaturatingDecrementBy64BitElementCount: + case NI_Sve_SaturatingDecrementBy8BitElementCount: + case NI_Sve_SaturatingIncrementBy16BitElementCount: + case NI_Sve_SaturatingIncrementBy32BitElementCount: + case NI_Sve_SaturatingIncrementBy64BitElementCount: + case NI_Sve_SaturatingIncrementBy8BitElementCount: + case NI_Sve_SaturatingDecrementBy16BitElementCountScalar: + case NI_Sve_SaturatingDecrementBy32BitElementCountScalar: + case NI_Sve_SaturatingDecrementBy64BitElementCountScalar: + case NI_Sve_SaturatingIncrementBy16BitElementCountScalar: + case NI_Sve_SaturatingIncrementBy32BitElementCountScalar: + case NI_Sve_SaturatingIncrementBy64BitElementCountScalar: + // Can only avoid generating a table if both immediates are constant. + assert(intrin.op2->isContainedIntOrIImmed() == intrin.op3->isContainedIntOrIImmed()); + needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); + // Ensure that internal does not collide with destination. + setInternalRegsDelayFree = true; + break; + + case NI_Sve_MultiplyAddRotateComplexBySelectedScalar: + // This API has two immediates, one of which is used to index pairs of floats in a vector. + // For a vector width of 128 bits, this means the index's range is [0, 1], + // which means we will skip the above jump table register check, + // even though we might need a jump table for the second immediate. 
+ // Thus, this API is special-cased, and does not use the HW_Category_SIMDByIndexedElement path. + // Also, only one internal register is needed for the jump table; + // we will combine the two immediates into one jump table. + + // Can only avoid generating a table if both immediates are constant. + assert(intrin.op4->isContainedIntOrIImmed() == intrin.op5->isContainedIntOrIImmed()); + needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); + // Ensure that internal does not collide with destination. + setInternalRegsDelayFree = true; + break; + + case NI_Sve_ShiftRightArithmeticForDivide: + needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); + break; + + case NI_Sve_AddRotateComplex: + needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); + break; + + case NI_Sve_MultiplyAddRotateComplex: + needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); + break; + + default: + unreached(); + } + } + } + + if (needBranchTargetReg) + { + buildInternalIntRegisterDefForNode(intrinsicTree); + } + } +} + +//------------------------------------------------------------------------ +// BuildEmbeddedOperandUses: Build the uses for an Embedded Mask operand +// +// These need special handling as the ConditionalSelect and embedded op will be generated as a single instruction, +// and possibly prefixed with a MOVPRFX +// +// +// Arguments: +// embeddedOpNode - The embdedded node of interest +// intrin - The embedded node as a HWIntrinsic +// +// Return Value: +// The number of sources consumed by this node. +// +int LinearScan::BuildEmbeddedOperandUses(GenTreeHWIntrinsic* embeddedOpNode, GenTree* embeddedDelayFreeOp) +{ + assert(embeddedOpNode->IsEmbMaskOp()); + assert(!embeddedOpNode->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask)); + + int srcCount = 0; + const HWIntrinsic intrinEmbedded(embeddedOpNode); + + // Build any immediates + BuildHWIntrinsicImmediate(embeddedOpNode, intrinEmbedded); + + if (embeddedDelayFreeOp != nullptr) + { + // If the embedded operation has RMW semantics then record delay-free for the "merge" value + + if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmbedded.id)) + { + assert(intrinEmbedded.numOperands == 3); + + // For FMA intrinsics, the arguments may get switched around during codegen. + // Figure out where the delay free op will be. + + LIR::Use use; + GenTree* user = nullptr; + + if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embeddedOpNode, &use)) + { + user = use.User(); + } + unsigned resultOpNum = + embeddedOpNode->GetResultOpNumForRmwIntrinsic(user, intrinEmbedded.op1, intrinEmbedded.op2, + intrinEmbedded.op3); + + if (resultOpNum != 0) + { + embeddedDelayFreeOp = embeddedOpNode->Op(resultOpNum); + } + } + + for (size_t argNum = 1; argNum <= intrinEmbedded.numOperands; argNum++) + { + GenTree* node = embeddedOpNode->Op(argNum); + + if (node == embeddedDelayFreeOp) + { + tgtPrefUse = BuildUse(node); + srcCount++; + } + else + { + RefPosition* useRefPosition = nullptr; + + int uses = BuildDelayFreeUses(node, nullptr, RBM_NONE, &useRefPosition); + srcCount += uses; + + // It is a hard requirement that these are not allocated to the same register as the destination, + // so verify no optimizations kicked in to skip setting the delay-free. 
+                assert((useRefPosition != nullptr && useRefPosition->delayRegFree) || (uses == 0));
+            }
+        }
+    }
+    else
+    {
+        srcCount += BuildOperandUses(embeddedOpNode);
+    }
+
+    return srcCount;
+}
+
+//------------------------------------------------------------------------
+// getOperandCandidates: Get the register candidates for a given operand number
+//
+// Arguments:
+//    intrinsicTree - Tree to check
+//    intrin        - Intrin to check
+//    OpNum         - The Operand number in the intrinsic tree
+//
+// Return Value:
+//    The candidates for the operand number
+//
+SingleTypeRegSet LinearScan::getOperandCandidates(GenTreeHWIntrinsic* intrinsicTree, HWIntrinsic intrin, size_t opNum)
+{
+    SingleTypeRegSet opCandidates = RBM_NONE;
+
+    if (HWIntrinsicInfo::IsLowVectorOperation(intrin.id))
+    {
+        assert(!varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet()));
+
+        bool isLowVectorOpNum = false;
+
+        switch (intrin.id)
+        {
+            case NI_Sve_DotProductBySelectedScalar:
+            case NI_Sve_FusedMultiplyAddBySelectedScalar:
+            case NI_Sve_FusedMultiplySubtractBySelectedScalar:
+            case NI_Sve_MultiplyAddRotateComplexBySelectedScalar:
+                isLowVectorOpNum = (opNum == 3);
+                break;
+            case NI_Sve_MultiplyBySelectedScalar:
+                isLowVectorOpNum = (opNum == 2);
+                break;
+            default:
+                unreached();
+        }
+
+        if (isLowVectorOpNum)
+        {
+            unsigned baseElementSize = genTypeSize(intrin.baseType);
+
+            if (baseElementSize == 8)
+            {
+                opCandidates = RBM_SVE_INDEXED_D_ELEMENT_ALLOWED_REGS.GetFloatRegSet();
+            }
+            else
+            {
+                assert(baseElementSize == 4);
+                opCandidates = RBM_SVE_INDEXED_S_ELEMENT_ALLOWED_REGS.GetFloatRegSet();
+            }
+        }
+    }
+    else if ((intrin.category == HW_Category_SIMDByIndexedElement) && (genTypeSize(intrin.baseType) == 2))
+    {
+        // Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g.
+        // "MLA (by element)") have an encoding that restricts which registers can be used for the indexed element when
+        // the element size is H (i.e. 2 bytes).
+
+        if ((intrin.numOperands == 4) || (intrin.numOperands == 3 && !HWIntrinsicInfo::HasImmediateOperand(intrin.id)))
+        {
+            opCandidates = (opNum == 3) ? RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet() : opCandidates;
+        }
+        else if (intrin.id == NI_Sve_DuplicateSelectedScalarToVector)
+        {
+            // Do nothing
+        }
+        else
+        {
+            opCandidates = (opNum == 2) ? RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet() : opCandidates;
+        }
+    }
+    else if (opNum == 1 && HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id))
+    {
+        assert(varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet()));
+
+        opCandidates = RBM_ALLMASK.GetPredicateRegSet();
+
+        if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id))
+        {
+            opCandidates = RBM_LOWMASK.GetPredicateRegSet();
+        }
+    }
+    else if (varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet()))
+    {
+        opCandidates = RBM_ALLMASK.GetPredicateRegSet();
+    }
+
+    return opCandidates;
+}
+
+//------------------------------------------------------------------------
+// getDelayFreeOperand: Get the delay free characteristics of the HWIntrinsic
+//
+// For an RMW intrinsic, preference the RMW operand to the target.
+// For a simple move between two SIMD registers, preference the source operand.
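+// For example (illustrative): NI_Sve_CreateBreakPropagateMask is RMW on its second operand, so Op(2)
+// is chosen, while NI_Vector128_GetLower is a simple move whose source Op(1) is preferenced so the
+// move can be elided when source and destination land in the same register.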
+// +// Arguments: +// intrinsicTree - Tree to check +// +// Return Value: +// The operand that needs to be delay freed +// +GenTree* LinearScan::getDelayFreeOperand(GenTreeHWIntrinsic* intrinsicTree) +{ + bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); + + const NamedIntrinsic intrinsicId = intrinsicTree->GetHWIntrinsicId(); + GenTree* delayFreeOp = nullptr; + + switch (intrinsicId) + { + case NI_Vector64_CreateScalarUnsafe: + case NI_Vector128_CreateScalarUnsafe: + if (varTypeIsFloating(intrinsicTree->Op(1))) + { + delayFreeOp = intrinsicTree->Op(1); + assert(delayFreeOp != nullptr); + } + break; + + case NI_AdvSimd_Arm64_DuplicateToVector64: + if (intrinsicTree->Op(1)->TypeGet() == TYP_DOUBLE) + { + delayFreeOp = intrinsicTree->Op(1); + assert(delayFreeOp != nullptr); + } + break; + + case NI_Vector64_ToScalar: + case NI_Vector128_ToScalar: + if (varTypeIsFloating(intrinsicTree)) + { + delayFreeOp = intrinsicTree->Op(1); + assert(delayFreeOp != nullptr); + } + break; + + case NI_Vector64_ToVector128Unsafe: + case NI_Vector128_AsVector128Unsafe: + case NI_Vector128_AsVector3: + case NI_Vector128_GetLower: + delayFreeOp = intrinsicTree->Op(1); + assert(delayFreeOp != nullptr); break; case NI_AdvSimd_LoadAndInsertScalarVector64x2: @@ -2209,212 +2415,6 @@ GenTreeHWIntrinsic* LinearScan::getEmbeddedMaskOperand(const HWIntrinsic intrin) return nullptr; } -//------------------------------------------------------------------------ -// BuildHWIntrinsicImmediate: Build immediate values -// -// Arguments: -// intrinsicTree - Tree to check -// intrin - Intrin to check -// -void LinearScan::BuildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin) -{ - if (HWIntrinsicInfo::HasImmediateOperand(intrin.id) && !HWIntrinsicInfo::NoJmpTableImm(intrin.id)) - { - // We may need to allocate an additional general-purpose register when an intrinsic has a non-const immediate - // operand and the intrinsic does not have an alternative non-const fallback form. - // However, for a case when the operand can take only two possible values - zero and one - // the codegen can use cbnz to do conditional branch, so such register is not needed. 
- - bool needBranchTargetReg = false; - - int immLowerBound = 0; - int immUpperBound = 0; - - if (intrin.category == HW_Category_SIMDByIndexedElement) - { - var_types indexedElementOpType; - - if (intrin.numOperands == 2) - { - indexedElementOpType = intrin.op1->TypeGet(); - } - else if (intrin.numOperands == 3) - { - indexedElementOpType = intrin.op2->TypeGet(); - } - else - { - assert(intrin.numOperands == 4); - indexedElementOpType = intrin.op3->TypeGet(); - } - - assert(varTypeIsSIMD(indexedElementOpType)); - - const unsigned int indexedElementSimdSize = genTypeSize(indexedElementOpType); - HWIntrinsicInfo::lookupImmBounds(intrin.id, indexedElementSimdSize, intrin.baseType, 1, &immLowerBound, - &immUpperBound); - } - else - { - HWIntrinsicInfo::lookupImmBounds(intrin.id, intrinsicTree->GetSimdSize(), intrin.baseType, 1, - &immLowerBound, &immUpperBound); - } - - if ((immLowerBound != 0) || (immUpperBound != 1)) - { - if ((intrin.category == HW_Category_SIMDByIndexedElement) || - (intrin.category == HW_Category_ShiftLeftByImmediate) || - (intrin.category == HW_Category_ShiftRightByImmediate)) - { - switch (intrin.numOperands) - { - case 4: - needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); - break; - - case 3: - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - break; - - case 2: - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - break; - - default: - unreached(); - } - } - else - { - switch (intrin.id) - { - case NI_AdvSimd_DuplicateSelectedScalarToVector64: - case NI_AdvSimd_DuplicateSelectedScalarToVector128: - case NI_AdvSimd_Extract: - case NI_AdvSimd_Insert: - case NI_AdvSimd_InsertScalar: - case NI_AdvSimd_LoadAndInsertScalar: - case NI_AdvSimd_LoadAndInsertScalarVector64x2: - case NI_AdvSimd_LoadAndInsertScalarVector64x3: - case NI_AdvSimd_LoadAndInsertScalarVector64x4: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: - case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128: - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - break; - - case NI_AdvSimd_ExtractVector64: - case NI_AdvSimd_ExtractVector128: - case NI_AdvSimd_StoreSelectedScalar: - case NI_AdvSimd_Arm64_StoreSelectedScalar: - case NI_Sve_PrefetchBytes: - case NI_Sve_PrefetchInt16: - case NI_Sve_PrefetchInt32: - case NI_Sve_PrefetchInt64: - case NI_Sve_ExtractVector: - case NI_Sve_TrigonometricMultiplyAddCoefficient: - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - break; - - case NI_AdvSimd_Arm64_InsertSelectedScalar: - assert(intrin.op2->isContainedIntOrIImmed()); - assert(intrin.op4->isContainedIntOrIImmed()); - break; - - case NI_Sve_CreateTrueMaskByte: - case NI_Sve_CreateTrueMaskDouble: - case NI_Sve_CreateTrueMaskInt16: - case NI_Sve_CreateTrueMaskInt32: - case NI_Sve_CreateTrueMaskInt64: - case NI_Sve_CreateTrueMaskSByte: - case NI_Sve_CreateTrueMaskSingle: - case NI_Sve_CreateTrueMaskUInt16: - case NI_Sve_CreateTrueMaskUInt32: - case NI_Sve_CreateTrueMaskUInt64: - case NI_Sve_Count16BitElements: - case NI_Sve_Count32BitElements: - case NI_Sve_Count64BitElements: - case NI_Sve_Count8BitElements: - needBranchTargetReg = !intrin.op1->isContainedIntOrIImmed(); - break; - - case NI_Sve_GatherPrefetch8Bit: - case NI_Sve_GatherPrefetch16Bit: - case NI_Sve_GatherPrefetch32Bit: - case NI_Sve_GatherPrefetch64Bit: - if (!varTypeIsSIMD(intrin.op2->gtType)) - { - needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); - } - else - 
{ - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - } - break; - - case NI_Sve_SaturatingDecrementBy16BitElementCount: - case NI_Sve_SaturatingDecrementBy32BitElementCount: - case NI_Sve_SaturatingDecrementBy64BitElementCount: - case NI_Sve_SaturatingDecrementBy8BitElementCount: - case NI_Sve_SaturatingIncrementBy16BitElementCount: - case NI_Sve_SaturatingIncrementBy32BitElementCount: - case NI_Sve_SaturatingIncrementBy64BitElementCount: - case NI_Sve_SaturatingIncrementBy8BitElementCount: - case NI_Sve_SaturatingDecrementBy16BitElementCountScalar: - case NI_Sve_SaturatingDecrementBy32BitElementCountScalar: - case NI_Sve_SaturatingDecrementBy64BitElementCountScalar: - case NI_Sve_SaturatingIncrementBy16BitElementCountScalar: - case NI_Sve_SaturatingIncrementBy32BitElementCountScalar: - case NI_Sve_SaturatingIncrementBy64BitElementCountScalar: - // Can only avoid generating a table if both immediates are constant. - assert(intrin.op2->isContainedIntOrIImmed() == intrin.op3->isContainedIntOrIImmed()); - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - // Ensure that internal does not collide with destination. - setInternalRegsDelayFree = true; - break; - - case NI_Sve_MultiplyAddRotateComplexBySelectedScalar: - // This API has two immediates, one of which is used to index pairs of floats in a vector. - // For a vector width of 128 bits, this means the index's range is [0, 1], - // which means we will skip the above jump table register check, - // even though we might need a jump table for the second immediate. - // Thus, this API is special-cased, and does not use the HW_Category_SIMDByIndexedElement path. - // Also, only one internal register is needed for the jump table; - // we will combine the two immediates into one jump table. - - // Can only avoid generating a table if both immediates are constant. - assert(intrin.op4->isContainedIntOrIImmed() == intrin.op5->isContainedIntOrIImmed()); - needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); - // Ensure that internal does not collide with destination. - setInternalRegsDelayFree = true; - break; - - case NI_Sve_ShiftRightArithmeticForDivide: - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - break; - - case NI_Sve_AddRotateComplex: - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - break; - - case NI_Sve_MultiplyAddRotateComplex: - needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); - break; - - default: - unreached(); - } - } - } - - if (needBranchTargetReg) - { - buildInternalIntRegisterDefForNode(intrinsicTree); - } - } -} - #endif // FEATURE_HW_INTRINSICS #endif // TARGET_ARM64 From cb99816dcee802cf29e37ced3d9c02253a63eaa6 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 9 Sep 2024 13:47:49 +0100 Subject: [PATCH 20/31] Move functions --- src/coreclr/jit/lsraarm64.cpp | 690 +++++++++++++++++----------------- 1 file changed, 345 insertions(+), 345 deletions(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 9e8b5a70aee0b6..d7c801e552d673 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1515,273 +1515,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou return srcCount; } -//------------------------------------------------------------------------ -// BuildConsecutiveRegistersForUse: Build ref position(s) for `treeNode` that has a -// requirement of allocating consecutive registers. 
It will create the RefTypeUse -// RefPositions for as many consecutive registers are needed for `treeNode` and in -// between, it might contain RefTypeUpperVectorRestore RefPositions. -// -// For the first RefPosition of the series, it sets the `regCount` field equal to -// the number of subsequent RefPositions (including the first one) involved for this -// treeNode. For the subsequent RefPositions, it sets the `regCount` to 0. For all -// the RefPositions created, it sets the `needsConsecutive` flag so it can be used to -// identify these RefPositions during allocation. -// -// It also populates a `RefPositionMap` to access the subsequent RefPositions from -// a given RefPosition. This was preferred rather than adding a field in RefPosition -// for this purpose. -// -// Arguments: -// treeNode - The GT_HWINTRINSIC node of interest -// rmwNode - Read-modify-write node. -// -// Return Value: -// The number of sources consumed by this node. -// -int LinearScan::BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwNode) -{ - int srcCount = 0; - Interval* rmwInterval = nullptr; - bool rmwIsLastUse = false; - if (rmwNode != nullptr) - { - if (isCandidateLocalRef(rmwNode)) - { - rmwInterval = getIntervalForLocalVarNode(rmwNode->AsLclVar()); - rmwIsLastUse = rmwNode->AsLclVar()->IsLastUse(0); - } - } - if (treeNode->OperIsFieldList()) - { - assert(compiler->info.compNeedsConsecutiveRegisters); - - unsigned regCount = 0; - RefPosition* firstRefPos = nullptr; - RefPosition* currRefPos = nullptr; - RefPosition* lastRefPos = nullptr; - - NextConsecutiveRefPositionsMap* refPositionMap = getNextConsecutiveRefPositionsMap(); - for (GenTreeFieldList::Use& use : treeNode->AsFieldList()->Uses()) - { - RefPosition* restoreRefPos = nullptr; - RefPositionIterator prevRefPos = refPositions.backPosition(); - - currRefPos = BuildUse(use.GetNode(), RBM_NONE, 0); - - // Check if restore RefPositions were created - RefPositionIterator tailRefPos = refPositions.backPosition(); - assert(tailRefPos == currRefPos); - prevRefPos++; - if (prevRefPos != tailRefPos) - { - restoreRefPos = prevRefPos; - assert(restoreRefPos->refType == RefTypeUpperVectorRestore); - } - - currRefPos->needsConsecutive = true; - currRefPos->regCount = 0; -#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE - if (restoreRefPos != nullptr) - { - // If there was a restoreRefPosition created, make sure to link it - // as well so during register assignment, we could visit it and - // make sure that it doesn't get assigned one of register that is part - // of consecutive registers we are allocating for this treeNode. - // See assignConsecutiveRegisters(). - restoreRefPos->needsConsecutive = true; - restoreRefPos->regCount = 0; - if (firstRefPos == nullptr) - { - // Always set the non UpperVectorRestore as the firstRefPos. - // UpperVectorRestore can be assigned to a different independent - // register. - // See TODO-CQ in assignConsecutiveRegisters(). - firstRefPos = currRefPos; - } - refPositionMap->Set(lastRefPos, restoreRefPos, LinearScan::NextConsecutiveRefPositionsMap::Overwrite); - refPositionMap->Set(restoreRefPos, currRefPos, LinearScan::NextConsecutiveRefPositionsMap::Overwrite); - - if (rmwNode != nullptr) - { - // If we have rmwNode, determine if the restoreRefPos should be set to delay-free. 
- if ((restoreRefPos->getInterval() != rmwInterval) || (!rmwIsLastUse && !restoreRefPos->lastUse)) - { - setDelayFree(restoreRefPos); - } - } - } - else -#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE - { - if (firstRefPos == nullptr) - { - firstRefPos = currRefPos; - } - refPositionMap->Set(lastRefPos, currRefPos, LinearScan::NextConsecutiveRefPositionsMap::Overwrite); - } - - refPositionMap->Set(currRefPos, nullptr); - - lastRefPos = currRefPos; - regCount++; - if (rmwNode != nullptr) - { - // If we have rmwNode, determine if the currRefPos should be set to delay-free. - if ((currRefPos->getInterval() != rmwInterval) || (!rmwIsLastUse && !currRefPos->lastUse)) - { - setDelayFree(currRefPos); - } - } - } - - // Set `regCount` to actual consecutive registers count for first ref-position. - // For others, set 0 so we can identify that this is non-first RefPosition. - firstRefPos->regCount = regCount; - -#ifdef DEBUG - // Set the minimum register candidates needed for stress to work. - currRefPos = firstRefPos; - while (currRefPos != nullptr) - { - currRefPos->minRegCandidateCount = regCount; - currRefPos = getNextConsecutiveRefPosition(currRefPos); - } -#endif - srcCount += regCount; - } - else - { - RefPositionIterator refPositionMark = refPositions.backPosition(); - int refPositionsAdded = BuildOperandUses(treeNode); - - if (rmwNode != nullptr) - { - // Check all the newly created RefPositions for delay free - RefPositionIterator iter = refPositionMark; - - for (iter++; iter != refPositions.end(); iter++) - { - RefPosition* refPositionAdded = &(*iter); - - // If we have rmwNode, determine if the refPositionAdded should be set to delay-free. - if ((refPositionAdded->getInterval() != rmwInterval) || (!rmwIsLastUse && !refPositionAdded->lastUse)) - { - setDelayFree(refPositionAdded); - } - } - } - - srcCount += refPositionsAdded; - } - - return srcCount; -} - -//------------------------------------------------------------------------ -// BuildConsecutiveRegistersForDef: Build RefTypeDef ref position(s) for -// `treeNode` that produces `registerCount` consecutive registers. -// -// For the first RefPosition of the series, it sets the `regCount` field equal to -// the total number of RefPositions (including the first one) involved for this -// treeNode. For the subsequent RefPositions, it sets the `regCount` to 0. For all -// the RefPositions created, it sets the `needsConsecutive` flag so it can be used to -// identify these RefPositions during allocation. -// -// It also populates a `RefPositionMap` to access the subsequent RefPositions from -// a given RefPosition. This was preferred rather than adding a field in RefPosition -// for this purpose. -// -// Arguments: -// treeNode - The GT_HWINTRINSIC node of interest -// registerCount - Number of registers the treeNode produces -// -void LinearScan::BuildConsecutiveRegistersForDef(GenTree* treeNode, int registerCount) -{ - assert(registerCount > 1); - assert(compiler->info.compNeedsConsecutiveRegisters); - - RefPosition* currRefPos = nullptr; - RefPosition* lastRefPos = nullptr; - - NextConsecutiveRefPositionsMap* refPositionMap = getNextConsecutiveRefPositionsMap(); - for (int fieldIdx = 0; fieldIdx < registerCount; fieldIdx++) - { - currRefPos = BuildDef(treeNode, RBM_NONE, fieldIdx); - currRefPos->needsConsecutive = true; - currRefPos->regCount = 0; -#ifdef DEBUG - // Set the minimum register candidates needed for stress to work. 
- currRefPos->minRegCandidateCount = registerCount; -#endif - if (fieldIdx == 0) - { - // Set `regCount` to actual consecutive registers count for first ref-position. - // For others, set 0 so we can identify that this is non-first RefPosition. - - currRefPos->regCount = registerCount; - } - - refPositionMap->Set(lastRefPos, currRefPos, LinearScan::NextConsecutiveRefPositionsMap::Overwrite); - refPositionMap->Set(currRefPos, nullptr); - - lastRefPos = currRefPos; - } -} - -#ifdef DEBUG -//------------------------------------------------------------------------ -// isLiveAtConsecutiveRegistersLoc: Check if the refPosition is live at the location -// where consecutive registers are needed. This is used during JitStressRegs to -// not constrain the register requirements for such refpositions, because a lot -// of registers will be busy. For RefTypeUse, it will just see if the nodeLocation -// matches with the tracking `consecutiveRegistersLocation`. For Def, it will check -// the underlying `GenTree*` to see if the tree that produced it had consecutive -// registers requirement. -// -// -// Arguments: -// consecutiveRegistersLocation - The most recent location where consecutive -// registers were needed. -// -// Returns: If the refposition is live at same location which has the requirement of -// consecutive registers. -// -bool RefPosition::isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegistersLocation) -{ - if (needsConsecutive) - { - return true; - } - - bool atConsecutiveRegsLoc = consecutiveRegistersLocation == nodeLocation; - bool treeNeedsConsecutiveRegisters = false; - - if ((treeNode != nullptr) && treeNode->OperIsHWIntrinsic()) - { - const HWIntrinsic intrin(treeNode->AsHWIntrinsic()); - treeNeedsConsecutiveRegisters = HWIntrinsicInfo::NeedsConsecutiveRegisters(intrin.id); - } - - if (refType == RefTypeDef) - { - return treeNeedsConsecutiveRegisters; - } - else if (refType == RefTypeUse) - { - if (isIntervalRef() && getInterval()->isInternal) - { - return treeNeedsConsecutiveRegisters; - } - return atConsecutiveRegsLoc; - } - else if (refType == RefTypeUpperVectorRestore) - { - return atConsecutiveRegsLoc; - } - return false; -} -#endif // DEBUG - //------------------------------------------------------------------------ // BuildHWIntrinsicImmediate: Build immediate values // @@ -1956,120 +1689,387 @@ void LinearScan::BuildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, co // Also, only one internal register is needed for the jump table; // we will combine the two immediates into one jump table. - // Can only avoid generating a table if both immediates are constant. - assert(intrin.op4->isContainedIntOrIImmed() == intrin.op5->isContainedIntOrIImmed()); - needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); - // Ensure that internal does not collide with destination. - setInternalRegsDelayFree = true; - break; + // Can only avoid generating a table if both immediates are constant. + assert(intrin.op4->isContainedIntOrIImmed() == intrin.op5->isContainedIntOrIImmed()); + needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); + // Ensure that internal does not collide with destination. 
+                        setInternalRegsDelayFree = true;
+                        break;
+
+                    case NI_Sve_ShiftRightArithmeticForDivide:
+                        needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed();
+                        break;
+
+                    case NI_Sve_AddRotateComplex:
+                        needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed();
+                        break;
+
+                    case NI_Sve_MultiplyAddRotateComplex:
+                        needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed();
+                        break;
+
+                    default:
+                        unreached();
+                }
+            }
+        }
+
+        if (needBranchTargetReg)
+        {
+            buildInternalIntRegisterDefForNode(intrinsicTree);
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// BuildEmbeddedOperandUses: Build the uses for an Embedded Mask operand
+//
+// These need special handling as the ConditionalSelect and embedded op will be generated as a single instruction,
+// and possibly prefixed with a MOVPRFX
+//
+//
+// Arguments:
+//    embeddedOpNode      - The embedded node of interest
+//    embeddedDelayFreeOp - The delay-free operand of the embedded node, if any
+//
+// Return Value:
+//    The number of sources consumed by this node.
+//
+int LinearScan::BuildEmbeddedOperandUses(GenTreeHWIntrinsic* embeddedOpNode, GenTree* embeddedDelayFreeOp)
+{
+    assert(embeddedOpNode->IsEmbMaskOp());
+    assert(!embeddedOpNode->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask));
+
+    int srcCount = 0;
+    const HWIntrinsic intrinEmbedded(embeddedOpNode);
+
+    // Build any immediates
+    BuildHWIntrinsicImmediate(embeddedOpNode, intrinEmbedded);
+
+    if (embeddedDelayFreeOp != nullptr)
+    {
+        // If the embedded operation has RMW semantics then record delay-free for the "merge" value
+
+        if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmbedded.id))
+        {
+            assert(intrinEmbedded.numOperands == 3);
+
+            // For FMA intrinsics, the arguments may get switched around during codegen.
+            // Figure out where the delay free op will be.
+
+            LIR::Use use;
+            GenTree* user = nullptr;
+
+            if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embeddedOpNode, &use))
+            {
+                user = use.User();
+            }
+            unsigned resultOpNum =
+                embeddedOpNode->GetResultOpNumForRmwIntrinsic(user, intrinEmbedded.op1, intrinEmbedded.op2,
+                                                              intrinEmbedded.op3);
+
+            if (resultOpNum != 0)
+            {
+                embeddedDelayFreeOp = embeddedOpNode->Op(resultOpNum);
+            }
+        }
+
+        for (size_t argNum = 1; argNum <= intrinEmbedded.numOperands; argNum++)
+        {
+            GenTree* node = embeddedOpNode->Op(argNum);
+
+            if (node == embeddedDelayFreeOp)
+            {
+                tgtPrefUse = BuildUse(node);
+                srcCount++;
+            }
+            else
+            {
+                RefPosition* useRefPosition = nullptr;
+
+                int uses = BuildDelayFreeUses(node, nullptr, RBM_NONE, &useRefPosition);
+                srcCount += uses;
+
+                // It is a hard requirement that these are not allocated to the same register as the destination,
+                // so verify no optimizations kicked in to skip setting the delay-free.
+                assert((useRefPosition != nullptr && useRefPosition->delayRegFree) || (uses == 0));
+            }
+        }
+    }
+    else
+    {
+        srcCount += BuildOperandUses(embeddedOpNode);
+    }
+
+    return srcCount;
+}
+
+//------------------------------------------------------------------------
+// BuildConsecutiveRegistersForUse: Build ref position(s) for `treeNode` that has a
+// requirement of allocating consecutive registers. It will create the RefTypeUse
+// RefPositions for as many consecutive registers as are needed for `treeNode` and in
+// between, it might contain RefTypeUpperVectorRestore RefPositions.
+//
+// For the first RefPosition of the series, it sets the `regCount` field equal to
+// the total number of RefPositions (including the first one) involved for this
+// treeNode.
For the subsequent RefPositions, it sets the `regCount` to 0. For all +// the RefPositions created, it sets the `needsConsecutive` flag so it can be used to +// identify these RefPositions during allocation. +// +// It also populates a `RefPositionMap` to access the subsequent RefPositions from +// a given RefPosition. This was preferred rather than adding a field in RefPosition +// for this purpose. +// +// Arguments: +// treeNode - The GT_HWINTRINSIC node of interest +// rmwNode - Read-modify-write node. +// +// Return Value: +// The number of sources consumed by this node. +// +int LinearScan::BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwNode) +{ + int srcCount = 0; + Interval* rmwInterval = nullptr; + bool rmwIsLastUse = false; + if (rmwNode != nullptr) + { + if (isCandidateLocalRef(rmwNode)) + { + rmwInterval = getIntervalForLocalVarNode(rmwNode->AsLclVar()); + rmwIsLastUse = rmwNode->AsLclVar()->IsLastUse(0); + } + } + if (treeNode->OperIsFieldList()) + { + assert(compiler->info.compNeedsConsecutiveRegisters); + + unsigned regCount = 0; + RefPosition* firstRefPos = nullptr; + RefPosition* currRefPos = nullptr; + RefPosition* lastRefPos = nullptr; + + NextConsecutiveRefPositionsMap* refPositionMap = getNextConsecutiveRefPositionsMap(); + for (GenTreeFieldList::Use& use : treeNode->AsFieldList()->Uses()) + { + RefPosition* restoreRefPos = nullptr; + RefPositionIterator prevRefPos = refPositions.backPosition(); + + currRefPos = BuildUse(use.GetNode(), RBM_NONE, 0); + + // Check if restore RefPositions were created + RefPositionIterator tailRefPos = refPositions.backPosition(); + assert(tailRefPos == currRefPos); + prevRefPos++; + if (prevRefPos != tailRefPos) + { + restoreRefPos = prevRefPos; + assert(restoreRefPos->refType == RefTypeUpperVectorRestore); + } - case NI_Sve_ShiftRightArithmeticForDivide: - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - break; + currRefPos->needsConsecutive = true; + currRefPos->regCount = 0; +#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE + if (restoreRefPos != nullptr) + { + // If there was a restoreRefPosition created, make sure to link it + // as well so during register assignment, we could visit it and + // make sure that it doesn't get assigned one of register that is part + // of consecutive registers we are allocating for this treeNode. + // See assignConsecutiveRegisters(). + restoreRefPos->needsConsecutive = true; + restoreRefPos->regCount = 0; + if (firstRefPos == nullptr) + { + // Always set the non UpperVectorRestore as the firstRefPos. + // UpperVectorRestore can be assigned to a different independent + // register. + // See TODO-CQ in assignConsecutiveRegisters(). + firstRefPos = currRefPos; + } + refPositionMap->Set(lastRefPos, restoreRefPos, LinearScan::NextConsecutiveRefPositionsMap::Overwrite); + refPositionMap->Set(restoreRefPos, currRefPos, LinearScan::NextConsecutiveRefPositionsMap::Overwrite); - case NI_Sve_AddRotateComplex: - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - break; + if (rmwNode != nullptr) + { + // If we have rmwNode, determine if the restoreRefPos should be set to delay-free. 
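+                    // (Illustrative restatement of the check below: the restore may share a register
+                    // with rmwNode only when it refers to the same interval and either use is a last
+                    // use; in every other case it must be marked delay-free.)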
+ if ((restoreRefPos->getInterval() != rmwInterval) || (!rmwIsLastUse && !restoreRefPos->lastUse)) + { + setDelayFree(restoreRefPos); + } + } + } + else +#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE + { + if (firstRefPos == nullptr) + { + firstRefPos = currRefPos; + } + refPositionMap->Set(lastRefPos, currRefPos, LinearScan::NextConsecutiveRefPositionsMap::Overwrite); + } - case NI_Sve_MultiplyAddRotateComplex: - needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); - break; + refPositionMap->Set(currRefPos, nullptr); - default: - unreached(); + lastRefPos = currRefPos; + regCount++; + if (rmwNode != nullptr) + { + // If we have rmwNode, determine if the currRefPos should be set to delay-free. + if ((currRefPos->getInterval() != rmwInterval) || (!rmwIsLastUse && !currRefPos->lastUse)) + { + setDelayFree(currRefPos); } } } - if (needBranchTargetReg) + // Set `regCount` to actual consecutive registers count for first ref-position. + // For others, set 0 so we can identify that this is non-first RefPosition. + firstRefPos->regCount = regCount; + +#ifdef DEBUG + // Set the minimum register candidates needed for stress to work. + currRefPos = firstRefPos; + while (currRefPos != nullptr) { - buildInternalIntRegisterDefForNode(intrinsicTree); + currRefPos->minRegCandidateCount = regCount; + currRefPos = getNextConsecutiveRefPosition(currRefPos); + } +#endif + srcCount += regCount; + } + else + { + RefPositionIterator refPositionMark = refPositions.backPosition(); + int refPositionsAdded = BuildOperandUses(treeNode); + + if (rmwNode != nullptr) + { + // Check all the newly created RefPositions for delay free + RefPositionIterator iter = refPositionMark; + + for (iter++; iter != refPositions.end(); iter++) + { + RefPosition* refPositionAdded = &(*iter); + + // If we have rmwNode, determine if the refPositionAdded should be set to delay-free. + if ((refPositionAdded->getInterval() != rmwInterval) || (!rmwIsLastUse && !refPositionAdded->lastUse)) + { + setDelayFree(refPositionAdded); + } + } } + + srcCount += refPositionsAdded; } + + return srcCount; } //------------------------------------------------------------------------ -// BuildEmbeddedOperandUses: Build the uses for an Embedded Mask operand +// BuildConsecutiveRegistersForDef: Build RefTypeDef ref position(s) for +// `treeNode` that produces `registerCount` consecutive registers. // -// These need special handling as the ConditionalSelect and embedded op will be generated as a single instruction, -// and possibly prefixed with a MOVPRFX +// For the first RefPosition of the series, it sets the `regCount` field equal to +// the total number of RefPositions (including the first one) involved for this +// treeNode. For the subsequent RefPositions, it sets the `regCount` to 0. For all +// the RefPositions created, it sets the `needsConsecutive` flag so it can be used to +// identify these RefPositions during allocation. // +// It also populates a `RefPositionMap` to access the subsequent RefPositions from +// a given RefPosition. This was preferred rather than adding a field in RefPosition +// for this purpose. // // Arguments: -// embeddedOpNode - The embdedded node of interest -// intrin - The embedded node as a HWIntrinsic -// -// Return Value: -// The number of sources consumed by this node. 
+// treeNode - The GT_HWINTRINSIC node of interest +// registerCount - Number of registers the treeNode produces // -int LinearScan::BuildEmbeddedOperandUses(GenTreeHWIntrinsic* embeddedOpNode, GenTree* embeddedDelayFreeOp) +void LinearScan::BuildConsecutiveRegistersForDef(GenTree* treeNode, int registerCount) { - assert(embeddedOpNode->IsEmbMaskOp()); - assert(!embeddedOpNode->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask)); - - int srcCount = 0; - const HWIntrinsic intrinEmbedded(embeddedOpNode); + assert(registerCount > 1); + assert(compiler->info.compNeedsConsecutiveRegisters); - // Build any immediates - BuildHWIntrinsicImmediate(embeddedOpNode, intrinEmbedded); + RefPosition* currRefPos = nullptr; + RefPosition* lastRefPos = nullptr; - if (embeddedDelayFreeOp != nullptr) + NextConsecutiveRefPositionsMap* refPositionMap = getNextConsecutiveRefPositionsMap(); + for (int fieldIdx = 0; fieldIdx < registerCount; fieldIdx++) { - // If the embedded operation has RMW semantics then record delay-free for the "merge" value - - if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmbedded.id)) + currRefPos = BuildDef(treeNode, RBM_NONE, fieldIdx); + currRefPos->needsConsecutive = true; + currRefPos->regCount = 0; +#ifdef DEBUG + // Set the minimum register candidates needed for stress to work. + currRefPos->minRegCandidateCount = registerCount; +#endif + if (fieldIdx == 0) { - assert(intrinEmbedded.numOperands == 3); - - // For FMA intrinsics, the arguments may get switched around during codegen. - // Figure out where the delay free op will be. + // Set `regCount` to actual consecutive registers count for first ref-position. + // For others, set 0 so we can identify that this is non-first RefPosition. - LIR::Use use; - GenTree* user = nullptr; + currRefPos->regCount = registerCount; + } - if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embeddedOpNode, &use)) - { - user = use.User(); - } - unsigned resultOpNum = - embeddedOpNode->GetResultOpNumForRmwIntrinsic(user, intrinEmbedded.op1, intrinEmbedded.op2, - intrinEmbedded.op3); + refPositionMap->Set(lastRefPos, currRefPos, LinearScan::NextConsecutiveRefPositionsMap::Overwrite); + refPositionMap->Set(currRefPos, nullptr); - if (resultOpNum != 0) - { - embeddedDelayFreeOp = embeddedOpNode->Op(resultOpNum); - } - } + lastRefPos = currRefPos; + } +} - for (size_t argNum = 1; argNum <= intrinEmbedded.numOperands; argNum++) - { - GenTree* node = embeddedOpNode->Op(argNum); +#ifdef DEBUG +//------------------------------------------------------------------------ +// isLiveAtConsecutiveRegistersLoc: Check if the refPosition is live at the location +// where consecutive registers are needed. This is used during JitStressRegs to +// not constrain the register requirements for such refpositions, because a lot +// of registers will be busy. For RefTypeUse, it will just see if the nodeLocation +// matches with the tracking `consecutiveRegistersLocation`. For Def, it will check +// the underlying `GenTree*` to see if the tree that produced it had consecutive +// registers requirement. +// +// +// Arguments: +// consecutiveRegistersLocation - The most recent location where consecutive +// registers were needed. +// +// Returns: If the refposition is live at same location which has the requirement of +// consecutive registers. 
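+// For example (illustrative): a RefTypeUse belonging to an internal interval defers to the producing
+// tree's NeedsConsecutiveRegisters flag, while an ordinary RefTypeUse simply compares its nodeLocation
+// against consecutiveRegistersLocation.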
+// +bool RefPosition::isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegistersLocation) +{ + if (needsConsecutive) + { + return true; + } - if (node == embeddedDelayFreeOp) - { - tgtPrefUse = BuildUse(node); - srcCount++; - } - else - { - RefPosition* useRefPosition = nullptr; + bool atConsecutiveRegsLoc = consecutiveRegistersLocation == nodeLocation; + bool treeNeedsConsecutiveRegisters = false; - int uses = BuildDelayFreeUses(node, nullptr, RBM_NONE, &useRefPosition); - srcCount += uses; + if ((treeNode != nullptr) && treeNode->OperIsHWIntrinsic()) + { + const HWIntrinsic intrin(treeNode->AsHWIntrinsic()); + treeNeedsConsecutiveRegisters = HWIntrinsicInfo::NeedsConsecutiveRegisters(intrin.id); + } - // It is a hard requirement that these are not allocated to the same register as the destination, - // so verify no optimizations kicked in to skip setting the delay-free. - assert((useRefPosition != nullptr && useRefPosition->delayRegFree) || (uses == 0)); - } + if (refType == RefTypeDef) + { + return treeNeedsConsecutiveRegisters; + } + else if (refType == RefTypeUse) + { + if (isIntervalRef() && getInterval()->isInternal) + { + return treeNeedsConsecutiveRegisters; } + return atConsecutiveRegsLoc; } - else + else if (refType == RefTypeUpperVectorRestore) { - srcCount += BuildOperandUses(embeddedOpNode); + return atConsecutiveRegsLoc; } - - return srcCount; + return false; } +#endif // DEBUG //------------------------------------------------------------------------ // getOperandCandidates: Get the register candidates for a given operand number From f5d34acbc2b66bae935ffecf1ea368691c6b7aee Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 9 Sep 2024 11:46:56 +0100 Subject: [PATCH 21/31] Remove failing unary tests --- .../Arm/Shared/_SveMinimalUnaryOpTestTemplate.template | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveMinimalUnaryOpTestTemplate.template b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveMinimalUnaryOpTestTemplate.template index 5de4a13faa6ad0..5667254430b037 100644 --- a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveMinimalUnaryOpTestTemplate.template +++ b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveMinimalUnaryOpTestTemplate.template @@ -52,10 +52,10 @@ namespace JIT.HardwareIntrinsics.Arm test.RunStructFldScenario(); // Validates executing the test inside conditional, with op3 as falseValue - test.ConditionalSelect_FalseOp(); +// test.ConditionalSelect_FalseOp(); // Validates executing the test inside conditional, with op3 as zero - test.ConditionalSelect_ZeroOp(); +// test.ConditionalSelect_ZeroOp(); } else { From 357281e63ee7577f3762c9ca02aafc2ce840a7a6 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 9 Sep 2024 14:59:05 +0100 Subject: [PATCH 22/31] Fix opNum type --- src/coreclr/jit/lsraarm64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index d7c801e552d673..26fad32f6d8cf0 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1412,7 +1412,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou BuildHWIntrinsicImmediate(intrinsicTree, intrin); // Build all Operands - for (int opNum = 1; opNum <= intrin.numOperands; opNum++) + for (size_t opNum = 1; opNum <= intrin.numOperands; opNum++) { GenTree* operand = intrinsicTree->Op(opNum); From 00c33e80a965dc5070e513d803cf9d5d229748c2 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 9 
Sep 2024 16:44:19 +0100 Subject: [PATCH 23/31] Revert "Remove failing unary tests" --- .../Arm/Shared/_SveMinimalUnaryOpTestTemplate.template | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveMinimalUnaryOpTestTemplate.template b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveMinimalUnaryOpTestTemplate.template index 5667254430b037..5de4a13faa6ad0 100644 --- a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveMinimalUnaryOpTestTemplate.template +++ b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveMinimalUnaryOpTestTemplate.template @@ -52,10 +52,10 @@ namespace JIT.HardwareIntrinsics.Arm test.RunStructFldScenario(); // Validates executing the test inside conditional, with op3 as falseValue -// test.ConditionalSelect_FalseOp(); + test.ConditionalSelect_FalseOp(); // Validates executing the test inside conditional, with op3 as zero -// test.ConditionalSelect_ZeroOp(); + test.ConditionalSelect_ZeroOp(); } else { From e58f641b44d799d0d5ba623f563947cc71b7462c Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 11 Sep 2024 12:14:32 +0100 Subject: [PATCH 24/31] Remove cases from getDelayFreeOperand that are handled by default --- src/coreclr/jit/lsraarm64.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 26fad32f6d8cf0..963495174107eb 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -2215,17 +2215,6 @@ GenTree* LinearScan::getDelayFreeOperand(GenTreeHWIntrinsic* intrinsicTree) assert(delayFreeOp != nullptr); break; - case NI_AdvSimd_LoadAndInsertScalarVector64x2: - case NI_AdvSimd_LoadAndInsertScalarVector64x3: - case NI_AdvSimd_LoadAndInsertScalarVector64x4: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: - assert(isRMW); - delayFreeOp = intrinsicTree->Op(1); - assert(delayFreeOp != nullptr); - break; - case NI_Sve_CreateBreakPropagateMask: // RMW operates on the second op. assert(isRMW); From faa608ba6d71d814fef4a63d27c51dce9c6120c9 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 12 Sep 2024 07:55:52 +0100 Subject: [PATCH 25/31] review cleanups --- src/coreclr/jit/lsraarm64.cpp | 437 +++++++++++++++++----------------- 1 file changed, 220 insertions(+), 217 deletions(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 963495174107eb..77203021b567b8 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1392,7 +1392,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou GenTree* delayFreeOp = getDelayFreeOperand(intrinsicTree); // Determine whether this is an operation where one of the ops is an address - GenTree* addrOp = LinearScan::getVectorAddrOperand(intrinsicTree); + GenTree* addrOp = getVectorAddrOperand(intrinsicTree); // Determine whether this is an operation where one of the ops has consecutive registers bool destIsConsecutive = false; @@ -1516,208 +1516,210 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } //------------------------------------------------------------------------ -// BuildHWIntrinsicImmediate: Build immediate values +// BuildHWIntrinsicImmediate: Build immediate operands of intrinsics, if any. 
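+//
+// A non-contained immediate cannot be encoded into the instruction, so codegen expands it into a
+// jump table over the legal immediate values and may need a spare integer register to hold the
+// branch target. Conceptually (an illustrative sketch, not the emitted code):
+//
+//     switch (imm) { case 0: <instruction variant 0>; break; case 1: <instruction variant 1>; ... }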
// // Arguments: -// intrinsicTree - Tree to check -// intrin - Intrin to check +// intrinsicTree - Intrinsic tree node for which need to build RefPositions for +// intrin - Underlying intrinsic // void LinearScan::BuildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, const HWIntrinsic intrin) { - if (HWIntrinsicInfo::HasImmediateOperand(intrin.id) && !HWIntrinsicInfo::NoJmpTableImm(intrin.id)) + if (!HWIntrinsicInfo::HasImmediateOperand(intrin.id) || HWIntrinsicInfo::NoJmpTableImm(intrin.id)) { - // We may need to allocate an additional general-purpose register when an intrinsic has a non-const immediate - // operand and the intrinsic does not have an alternative non-const fallback form. - // However, for a case when the operand can take only two possible values - zero and one - // the codegen can use cbnz to do conditional branch, so such register is not needed. - - bool needBranchTargetReg = false; + return; + } - int immLowerBound = 0; - int immUpperBound = 0; + // We may need to allocate an additional general-purpose register when an intrinsic has a non-const immediate + // operand and the intrinsic does not have an alternative non-const fallback form. + // However, for a case when the operand can take only two possible values - zero and one + // the codegen can use cbnz to do conditional branch, so such register is not needed. - if (intrin.category == HW_Category_SIMDByIndexedElement) - { - var_types indexedElementOpType; + bool needBranchTargetReg = false; - if (intrin.numOperands == 2) - { - indexedElementOpType = intrin.op1->TypeGet(); - } - else if (intrin.numOperands == 3) - { - indexedElementOpType = intrin.op2->TypeGet(); - } - else - { - assert(intrin.numOperands == 4); - indexedElementOpType = intrin.op3->TypeGet(); - } + int immLowerBound = 0; + int immUpperBound = 0; - assert(varTypeIsSIMD(indexedElementOpType)); + if (intrin.category == HW_Category_SIMDByIndexedElement) + { + var_types indexedElementOpType; - const unsigned int indexedElementSimdSize = genTypeSize(indexedElementOpType); - HWIntrinsicInfo::lookupImmBounds(intrin.id, indexedElementSimdSize, intrin.baseType, 1, &immLowerBound, - &immUpperBound); + if (intrin.numOperands == 2) + { + indexedElementOpType = intrin.op1->TypeGet(); + } + else if (intrin.numOperands == 3) + { + indexedElementOpType = intrin.op2->TypeGet(); } else { - HWIntrinsicInfo::lookupImmBounds(intrin.id, intrinsicTree->GetSimdSize(), intrin.baseType, 1, - &immLowerBound, &immUpperBound); + assert(intrin.numOperands == 4); + indexedElementOpType = intrin.op3->TypeGet(); } - if ((immLowerBound != 0) || (immUpperBound != 1)) + assert(varTypeIsSIMD(indexedElementOpType)); + + const unsigned int indexedElementSimdSize = genTypeSize(indexedElementOpType); + HWIntrinsicInfo::lookupImmBounds(intrin.id, indexedElementSimdSize, intrin.baseType, 1, &immLowerBound, + &immUpperBound); + } + else + { + HWIntrinsicInfo::lookupImmBounds(intrin.id, intrinsicTree->GetSimdSize(), intrin.baseType, 1, &immLowerBound, + &immUpperBound); + } + + if ((immLowerBound != 0) || (immUpperBound != 1)) + { + if ((intrin.category == HW_Category_SIMDByIndexedElement) || + (intrin.category == HW_Category_ShiftLeftByImmediate) || + (intrin.category == HW_Category_ShiftRightByImmediate)) { - if ((intrin.category == HW_Category_SIMDByIndexedElement) || - (intrin.category == HW_Category_ShiftLeftByImmediate) || - (intrin.category == HW_Category_ShiftRightByImmediate)) + switch (intrin.numOperands) { - switch (intrin.numOperands) - { - case 4: - needBranchTargetReg = 
!intrin.op4->isContainedIntOrIImmed(); - break; + case 4: + needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); + break; - case 3: - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - break; + case 3: + needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); + break; - case 2: - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - break; + case 2: + needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); + break; - default: - unreached(); - } + default: + unreached(); } - else - { - switch (intrin.id) - { - case NI_AdvSimd_DuplicateSelectedScalarToVector64: - case NI_AdvSimd_DuplicateSelectedScalarToVector128: - case NI_AdvSimd_Extract: - case NI_AdvSimd_Insert: - case NI_AdvSimd_InsertScalar: - case NI_AdvSimd_LoadAndInsertScalar: - case NI_AdvSimd_LoadAndInsertScalarVector64x2: - case NI_AdvSimd_LoadAndInsertScalarVector64x3: - case NI_AdvSimd_LoadAndInsertScalarVector64x4: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: - case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: - case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128: - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - break; - - case NI_AdvSimd_ExtractVector64: - case NI_AdvSimd_ExtractVector128: - case NI_AdvSimd_StoreSelectedScalar: - case NI_AdvSimd_Arm64_StoreSelectedScalar: - case NI_Sve_PrefetchBytes: - case NI_Sve_PrefetchInt16: - case NI_Sve_PrefetchInt32: - case NI_Sve_PrefetchInt64: - case NI_Sve_ExtractVector: - case NI_Sve_TrigonometricMultiplyAddCoefficient: - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - break; - - case NI_AdvSimd_Arm64_InsertSelectedScalar: - assert(intrin.op2->isContainedIntOrIImmed()); - assert(intrin.op4->isContainedIntOrIImmed()); - break; - - case NI_Sve_CreateTrueMaskByte: - case NI_Sve_CreateTrueMaskDouble: - case NI_Sve_CreateTrueMaskInt16: - case NI_Sve_CreateTrueMaskInt32: - case NI_Sve_CreateTrueMaskInt64: - case NI_Sve_CreateTrueMaskSByte: - case NI_Sve_CreateTrueMaskSingle: - case NI_Sve_CreateTrueMaskUInt16: - case NI_Sve_CreateTrueMaskUInt32: - case NI_Sve_CreateTrueMaskUInt64: - case NI_Sve_Count16BitElements: - case NI_Sve_Count32BitElements: - case NI_Sve_Count64BitElements: - case NI_Sve_Count8BitElements: - needBranchTargetReg = !intrin.op1->isContainedIntOrIImmed(); - break; - - case NI_Sve_GatherPrefetch8Bit: - case NI_Sve_GatherPrefetch16Bit: - case NI_Sve_GatherPrefetch32Bit: - case NI_Sve_GatherPrefetch64Bit: - if (!varTypeIsSIMD(intrin.op2->gtType)) - { - needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); - } - else - { - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - } - break; - - case NI_Sve_SaturatingDecrementBy16BitElementCount: - case NI_Sve_SaturatingDecrementBy32BitElementCount: - case NI_Sve_SaturatingDecrementBy64BitElementCount: - case NI_Sve_SaturatingDecrementBy8BitElementCount: - case NI_Sve_SaturatingIncrementBy16BitElementCount: - case NI_Sve_SaturatingIncrementBy32BitElementCount: - case NI_Sve_SaturatingIncrementBy64BitElementCount: - case NI_Sve_SaturatingIncrementBy8BitElementCount: - case NI_Sve_SaturatingDecrementBy16BitElementCountScalar: - case NI_Sve_SaturatingDecrementBy32BitElementCountScalar: - case NI_Sve_SaturatingDecrementBy64BitElementCountScalar: - case NI_Sve_SaturatingIncrementBy16BitElementCountScalar: - case NI_Sve_SaturatingIncrementBy32BitElementCountScalar: - case NI_Sve_SaturatingIncrementBy64BitElementCountScalar: - // Can only avoid generating a table if both 
immediates are constant. - assert(intrin.op2->isContainedIntOrIImmed() == intrin.op3->isContainedIntOrIImmed()); - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - // Ensure that internal does not collide with destination. - setInternalRegsDelayFree = true; - break; - - case NI_Sve_MultiplyAddRotateComplexBySelectedScalar: - // This API has two immediates, one of which is used to index pairs of floats in a vector. - // For a vector width of 128 bits, this means the index's range is [0, 1], - // which means we will skip the above jump table register check, - // even though we might need a jump table for the second immediate. - // Thus, this API is special-cased, and does not use the HW_Category_SIMDByIndexedElement path. - // Also, only one internal register is needed for the jump table; - // we will combine the two immediates into one jump table. - - // Can only avoid generating a table if both immediates are constant. - assert(intrin.op4->isContainedIntOrIImmed() == intrin.op5->isContainedIntOrIImmed()); - needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); - // Ensure that internal does not collide with destination. - setInternalRegsDelayFree = true; - break; + } + else + { + switch (intrin.id) + { + case NI_AdvSimd_DuplicateSelectedScalarToVector64: + case NI_AdvSimd_DuplicateSelectedScalarToVector128: + case NI_AdvSimd_Extract: + case NI_AdvSimd_Insert: + case NI_AdvSimd_InsertScalar: + case NI_AdvSimd_LoadAndInsertScalar: + case NI_AdvSimd_LoadAndInsertScalarVector64x2: + case NI_AdvSimd_LoadAndInsertScalarVector64x3: + case NI_AdvSimd_LoadAndInsertScalarVector64x4: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3: + case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4: + case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128: + needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); + break; - case NI_Sve_ShiftRightArithmeticForDivide: - needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); - break; + case NI_AdvSimd_ExtractVector64: + case NI_AdvSimd_ExtractVector128: + case NI_AdvSimd_StoreSelectedScalar: + case NI_AdvSimd_Arm64_StoreSelectedScalar: + case NI_Sve_PrefetchBytes: + case NI_Sve_PrefetchInt16: + case NI_Sve_PrefetchInt32: + case NI_Sve_PrefetchInt64: + case NI_Sve_ExtractVector: + case NI_Sve_TrigonometricMultiplyAddCoefficient: + needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); + break; - case NI_Sve_AddRotateComplex: - needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); - break; + case NI_AdvSimd_Arm64_InsertSelectedScalar: + assert(intrin.op2->isContainedIntOrIImmed()); + assert(intrin.op4->isContainedIntOrIImmed()); + break; + + case NI_Sve_CreateTrueMaskByte: + case NI_Sve_CreateTrueMaskDouble: + case NI_Sve_CreateTrueMaskInt16: + case NI_Sve_CreateTrueMaskInt32: + case NI_Sve_CreateTrueMaskInt64: + case NI_Sve_CreateTrueMaskSByte: + case NI_Sve_CreateTrueMaskSingle: + case NI_Sve_CreateTrueMaskUInt16: + case NI_Sve_CreateTrueMaskUInt32: + case NI_Sve_CreateTrueMaskUInt64: + case NI_Sve_Count16BitElements: + case NI_Sve_Count32BitElements: + case NI_Sve_Count64BitElements: + case NI_Sve_Count8BitElements: + needBranchTargetReg = !intrin.op1->isContainedIntOrIImmed(); + break; - case NI_Sve_MultiplyAddRotateComplex: + case NI_Sve_GatherPrefetch8Bit: + case NI_Sve_GatherPrefetch16Bit: + case NI_Sve_GatherPrefetch32Bit: + case NI_Sve_GatherPrefetch64Bit: + if (!varTypeIsSIMD(intrin.op2->gtType)) + { needBranchTargetReg = 
!intrin.op4->isContainedIntOrIImmed(); - break; + } + else + { + needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); + } + break; - default: - unreached(); - } + case NI_Sve_SaturatingDecrementBy16BitElementCount: + case NI_Sve_SaturatingDecrementBy32BitElementCount: + case NI_Sve_SaturatingDecrementBy64BitElementCount: + case NI_Sve_SaturatingDecrementBy8BitElementCount: + case NI_Sve_SaturatingIncrementBy16BitElementCount: + case NI_Sve_SaturatingIncrementBy32BitElementCount: + case NI_Sve_SaturatingIncrementBy64BitElementCount: + case NI_Sve_SaturatingIncrementBy8BitElementCount: + case NI_Sve_SaturatingDecrementBy16BitElementCountScalar: + case NI_Sve_SaturatingDecrementBy32BitElementCountScalar: + case NI_Sve_SaturatingDecrementBy64BitElementCountScalar: + case NI_Sve_SaturatingIncrementBy16BitElementCountScalar: + case NI_Sve_SaturatingIncrementBy32BitElementCountScalar: + case NI_Sve_SaturatingIncrementBy64BitElementCountScalar: + // Can only avoid generating a table if both immediates are constant. + assert(intrin.op2->isContainedIntOrIImmed() == intrin.op3->isContainedIntOrIImmed()); + needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); + // Ensure that internal does not collide with destination. + setInternalRegsDelayFree = true; + break; + + case NI_Sve_MultiplyAddRotateComplexBySelectedScalar: + // This API has two immediates, one of which is used to index pairs of floats in a vector. + // For a vector width of 128 bits, this means the index's range is [0, 1], + // which means we will skip the above jump table register check, + // even though we might need a jump table for the second immediate. + // Thus, this API is special-cased, and does not use the HW_Category_SIMDByIndexedElement path. + // Also, only one internal register is needed for the jump table; + // we will combine the two immediates into one jump table. + + // Can only avoid generating a table if both immediates are constant. + assert(intrin.op4->isContainedIntOrIImmed() == intrin.op5->isContainedIntOrIImmed()); + needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); + // Ensure that internal does not collide with destination. 
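+                        // (Illustrative note: delay-free extends the internal register's lifetime past
+                        // the node's definition, so LSRA cannot assign it the destination register.)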
+ setInternalRegsDelayFree = true; + break; + + case NI_Sve_ShiftRightArithmeticForDivide: + needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed(); + break; + + case NI_Sve_AddRotateComplex: + needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed(); + break; + + case NI_Sve_MultiplyAddRotateComplex: + needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); + break; + + default: + unreached(); } } + } - if (needBranchTargetReg) - { - buildInternalIntRegisterDefForNode(intrinsicTree); - } + if (needBranchTargetReg) + { + buildInternalIntRegisterDefForNode(intrinsicTree); } } @@ -1746,60 +1748,61 @@ int LinearScan::BuildEmbeddedOperandUses(GenTreeHWIntrinsic* embeddedOpNode, Gen // Build any immediates BuildHWIntrinsicImmediate(embeddedOpNode, intrinEmbedded); - if (embeddedDelayFreeOp != nullptr) + if (embeddedDelayFreeOp == nullptr) { - // If the embedded operation has RMW semantics then record delay-free for the "merge" value + srcCount += BuildOperandUses(embeddedOpNode); + return srcCount; + } - if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmbedded.id)) - { - assert(intrinEmbedded.numOperands == 3); + // If the embedded operation has RMW semantics then record delay-free for the "merge" value + + if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmbedded.id)) + { + assert(intrinEmbedded.numOperands == 3); - // For FMA intrinsics, the arguments may get switched around during codegen. - // Figure out where the delay free op will be. + // For FMA intrinsics, the arguments may get switched around during codegen. + // Figure out where the delay free op will be. - LIR::Use use; - GenTree* user = nullptr; + LIR::Use use; + GenTree* user = nullptr; - if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embeddedOpNode, &use)) - { - user = use.User(); - } - unsigned resultOpNum = - embeddedOpNode->GetResultOpNumForRmwIntrinsic(user, intrinEmbedded.op1, intrinEmbedded.op2, - intrinEmbedded.op3); + if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embeddedOpNode, &use)) + { + user = use.User(); + } + unsigned resultOpNum = embeddedOpNode->GetResultOpNumForRmwIntrinsic(user, intrinEmbedded.op1, + intrinEmbedded.op2, intrinEmbedded.op3); - if (resultOpNum != 0) - { - embeddedDelayFreeOp = embeddedOpNode->Op(resultOpNum); - } + if (resultOpNum != 0) + { + embeddedDelayFreeOp = embeddedOpNode->Op(resultOpNum); } + } + + for (size_t argNum = 1; argNum <= intrinEmbedded.numOperands; argNum++) + { + GenTree* node = embeddedOpNode->Op(argNum); - for (size_t argNum = 1; argNum <= intrinEmbedded.numOperands; argNum++) + if (node == embeddedDelayFreeOp) + { + tgtPrefUse = BuildUse(node); + srcCount++; + } + else { - GenTree* node = embeddedOpNode->Op(argNum); + // TODO-CQ : The requirements here may not hold depending on the usage of movprfx in codegen, + // and if so then the uses of delay free can be reduced. - if (node == embeddedDelayFreeOp) - { - tgtPrefUse = BuildUse(node); - srcCount++; - } - else - { - RefPosition* useRefPosition = nullptr; + RefPosition* useRefPosition = nullptr; - int uses = BuildDelayFreeUses(node, nullptr, RBM_NONE, &useRefPosition); - srcCount += uses; + int uses = BuildDelayFreeUses(node, nullptr, RBM_NONE, &useRefPosition); + srcCount += uses; - // It is a hard requirement that these are not allocated to the same register as the destination, - // so verify no optimizations kicked in to skip setting the delay-free. 
-                // It is a hard requirement that these are not allocated to the same register as the destination,
-                // so verify no optimizations kicked in to skip setting the delay-free.
-                assert((useRefPosition != nullptr && useRefPosition->delayRegFree) || (uses == 0));
-            }
+            // It is a hard requirement that these are not allocated to the same register as the destination,
+            // so verify no optimizations kicked in to skip setting the delay-free.
+            assert((useRefPosition != nullptr && useRefPosition->delayRegFree) || (uses == 0));
         }
     }
-    else
-    {
-        srcCount += BuildOperandUses(embeddedOpNode);
-    }
 
     return srcCount;
 }
@@ -2076,8 +2079,8 @@ bool RefPosition::isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegist
 //
 // Arguments:
 //    intrinsicTree - Tree to check
-//    intrin        - Intrin to check
-//    OpNum         - The Operand number in the intrinsic tree
+//    intrin        - The tree as a HWIntrinsic
+//    OpNum         - The Operand number in the intrinsic tree
 //
 // Return Value:
 //    The candidates for the operand number
@@ -2246,7 +2249,7 @@ GenTree* LinearScan::getDelayFreeOperand(GenTreeHWIntrinsic* intrinsicTree)
 // getVectorAddrOperand: Get the address operand of the HWIntrinsic, if any
 //
 // Arguments:
-//    intrinsicTree - Node to check
+//    intrinsicTree - Tree to check
 //
 // Return Value:
 //    The operand that is an address
@@ -2294,7 +2297,7 @@ GenTree* LinearScan::getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree)
 // getConsecutiveRegistersOperand: Get the consecutive operand of the HWIntrinsic, if any
 //
 // Arguments:
-//    intrinsicTree - Tree to check
+//    intrinsicTree           - Tree to check
 //    destIsConsecutive (out) - if the destination requires consecutive registers
 //
 // Return Value:

From a4da9457a488635e76929142a7dc7aadd0a06ddd Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 13 Sep 2024 14:50:50 +0100
Subject: [PATCH 26/31] Simplify masks in getOperandCandidates()

---
 src/coreclr/jit/lsraarm64.cpp | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index 77203021b567b8..fbc0b62f95037a 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -2144,21 +2144,16 @@ SingleTypeRegSet LinearScan::getOperandCandidates(GenTreeHWIntrinsic* intrinsicT
                 opCandidates = (opNum == 2) ? RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet() : opCandidates;
             }
         }
-    else if (opNum == 1 && HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id))
+    else if (varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet()))
     {
-        assert(varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet()));
-
         opCandidates = RBM_ALLMASK.GetPredicateRegSet();
-        if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id))
+
+        // Any low mask operands are always in op1
+        if (opNum == 1 && HWIntrinsicInfo::IsLowMaskedOperation(intrin.id))
         {
             opCandidates = RBM_LOWMASK.GetPredicateRegSet();
         }
     }
-    else if (varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet()))
-    {
-        opCandidates = RBM_ALLMASK.GetPredicateRegSet();
-    }
 
     return opCandidates;
 }

From 3d9e99750b82a30b7eab2611cfadded49bcd2b41 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 13 Sep 2024 14:53:03 +0100
Subject: [PATCH 27/31] Remove IsMaskedOperation()

---
 src/coreclr/jit/hwintrinsic.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h
index 139f2fbddeabd3..bb1a10a0c64c8d 100644
--- a/src/coreclr/jit/hwintrinsic.h
+++ b/src/coreclr/jit/hwintrinsic.h
@@ -947,12 +947,6 @@ struct HWIntrinsicInfo
         return (flags & HW_Flag_Scalable) != 0;
     }
 
-    static bool IsMaskedOperation(NamedIntrinsic id)
-    {
-        const HWIntrinsicFlag flags = lookupFlags(id);
-        return IsLowMaskedOperation(id) || IsOptionalEmbeddedMaskedOperation(id) || IsExplicitMaskedOperation(id);
-    }
-
     static bool IsLowMaskedOperation(NamedIntrinsic id)
     {
         const HWIntrinsicFlag flags = lookupFlags(id);

From 0b45899fcaa476ac3d334abba35ba594db64fefd Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 13 Sep 2024 14:53:37 +0100
Subject: [PATCH 28/31] Check for optional embedded masks in getDelayFreeOperand

---
 src/coreclr/jit/lsra.h        |  2 +-
 src/coreclr/jit/lsraarm64.cpp | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h
index f705d5253b9ecd..6222646d155ea7 100644
--- a/src/coreclr/jit/lsra.h
+++ b/src/coreclr/jit/lsra.h
@@ -1445,7 +1445,7 @@ class LinearScan : public LinearScanInterface
     }
     FORCEINLINE RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition);
     SingleTypeRegSet getOperandCandidates(GenTreeHWIntrinsic* intrinsicTree, HWIntrinsic intrin, size_t opNum);
-    GenTree* getDelayFreeOperand(GenTreeHWIntrinsic* intrinsicTree);
+    GenTree* getDelayFreeOperand(GenTreeHWIntrinsic* intrinsicTree, bool embedded = false);
     GenTree* getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree);
     GenTree* getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool* destIsConsecutive);
     GenTreeHWIntrinsic* getEmbeddedMaskOperand(const HWIntrinsic intrin);
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index fbc0b62f95037a..e4ddd0e1afbde8 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -1405,7 +1405,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
     if (embeddedOp != nullptr)
     {
         assert(delayFreeOp == nullptr);
-        delayFreeOp = getDelayFreeOperand(embeddedOp);
+        delayFreeOp = getDelayFreeOperand(embeddedOp, /* embedded */ true);
     }
 
     // Build any immediates
@@ -2166,11 +2166,12 @@ SingleTypeRegSet LinearScan::getOperandCandidates(GenTreeHWIntrinsic* intrinsicT
 //
 // Arguments:
 //    intrinsicTree - Tree to check
+//    embeddedOp    - The embedded operand, if any
 //
 // Return Value:
 //    The operand that needs to be delay freed
 //
-GenTree* LinearScan::getDelayFreeOperand(GenTreeHWIntrinsic* intrinsicTree)
+GenTree* LinearScan::getDelayFreeOperand(GenTreeHWIntrinsic* intrinsicTree, bool embedded)
 {
     bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);
 
@@ -2229,6 +2230,16 @@ GenTree* LinearScan::getDelayFreeOperand(GenTreeHWIntrinsic* intrinsicTree)
         delayFreeOp = intrinsicTree->Op(2);
         assert(delayFreeOp != nullptr);
     }
+    else if (HWIntrinsicInfo::IsOptionalEmbeddedMaskedOperation(intrinsicId))
+    {
+        // For optional embedded masks, RMW behavior only holds when the operand is embedded.
+        // The non-embedded variant is never RMW.
+        if (embedded)
+        {
+            delayFreeOp = intrinsicTree->Op(1);
+            assert(delayFreeOp != nullptr);
+        }
+    }
     else
     {
         delayFreeOp = intrinsicTree->Op(1);

From 7e3dd12dcd5661041d32df7699f2f9c7f330115e Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Thu, 26 Sep 2024 17:11:52 +0100
Subject: [PATCH 29/31] Only call BuildDelayFreeUses when register types match

---
 src/coreclr/jit/lsraarm64.cpp |  5 ++++-
 src/coreclr/jit/lsrabuild.cpp | 27 ++++++---------------------
 2 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index e4ddd0e1afbde8..0328249c9f7857 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -1468,7 +1468,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
         }
     }
-    else if (delayFreeOp != nullptr)
+    // Only build as delay free use if register types match
+    else if ((delayFreeOp != nullptr) &&
+             (varTypeUsesSameRegType(delayFreeOp->TypeGet(), operand->TypeGet()) ||
+              (delayFreeOp->IsMultiRegNode() && varTypeUsesFloatReg(operand->TypeGet()))))
     {
         srcCount += BuildDelayFreeUses(operand, delayFreeOp, candidates);
     }
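
Note (illustrative, not part of the patch): the new guard matters because delayFreeOp can be a mask node while the operand being built is a vector, or vice versa. Predicate and vector registers are separate register files, so such a pair can never be assigned the same physical register, and marking it delay-free would only pessimize allocation. A sketch of the check, with the condition taken from the hunk above:

    // Same register file (e.g. both vector, or both mask)?
    bool typesMatch = varTypeUsesSameRegType(delayFreeOp->TypeGet(), operand->TypeGet()) ||
                      // Multi-reg delay-free nodes always live in float/vector registers.
                      (delayFreeOp->IsMultiRegNode() && varTypeUsesFloatReg(operand->TypeGet()));
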
diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp
index 368a5e82d53c75..4b1cc1b0f8c231 100644
--- a/src/coreclr/jit/lsrabuild.cpp
+++ b/src/coreclr/jit/lsrabuild.cpp
@@ -3847,24 +3847,15 @@ int LinearScan::BuildDelayFreeUses(GenTree* node,
         }
     }
 
-    // Don't mark as delay free if there is a mismatch in register types
-    bool addDelayFreeUses = false;
     // Multi register nodes should not go via this route.
     assert(!node->IsMultiRegNode());
-    // Multi register nodes should always use fp registers (this includes vectors).
-    assert(varTypeUsesFloatReg(node->TypeGet()) || !node->IsMultiRegNode());
-    if (rmwNode == nullptr || varTypeUsesSameRegType(rmwNode->TypeGet(), node->TypeGet()) ||
-        (rmwNode->IsMultiRegNode() && varTypeUsesFloatReg(node->TypeGet())))
-    {
-        addDelayFreeUses = true;
-    }
+    // The rmwNode should have the same register type as the node
+    assert(rmwNode == nullptr || varTypeUsesSameRegType(rmwNode->TypeGet(), node->TypeGet()) ||
+           (rmwNode->IsMultiRegNode() && varTypeUsesFloatReg(node->TypeGet())));
 
     if (use != nullptr)
     {
-        if (addDelayFreeUses)
-        {
-            AddDelayFreeUses(use, rmwNode);
-        }
+        AddDelayFreeUses(use, rmwNode);
         if (useRefPositionRef != nullptr)
         {
             *useRefPositionRef = use;
@@ -3880,20 +3871,14 @@
         if (addrMode->HasBase() && !addrMode->Base()->isContained())
         {
             use = BuildUse(addrMode->Base(), candidates);
-            if (addDelayFreeUses)
-            {
-                AddDelayFreeUses(use, rmwNode);
-            }
+            AddDelayFreeUses(use, rmwNode);
             srcCount++;
         }
 
         if (addrMode->HasIndex() && !addrMode->Index()->isContained())
         {
             use = BuildUse(addrMode->Index(), candidates);
-            if (addDelayFreeUses)
-            {
-                AddDelayFreeUses(use, rmwNode);
-            }
+            AddDelayFreeUses(use, rmwNode);
             srcCount++;
         }

From df8263b3737e5b5696022ac257b654e967866a6e Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 27 Sep 2024 14:09:08 +0100
Subject: [PATCH 30/31] Assert only on Arm64

---
 src/coreclr/jit/lsrabuild.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp
index 4b1cc1b0f8c231..2bca7fbdb256ba 100644
--- a/src/coreclr/jit/lsrabuild.cpp
+++ b/src/coreclr/jit/lsrabuild.cpp
@@ -3847,11 +3847,13 @@ int LinearScan::BuildDelayFreeUses(GenTree* node,
         }
     }
 
+#ifdef TARGET_ARM64
     // Multi register nodes should not go via this route.
     assert(!node->IsMultiRegNode());
     // The rmwNode should have the same register type as the node
     assert(rmwNode == nullptr || varTypeUsesSameRegType(rmwNode->TypeGet(), node->TypeGet()) ||
           (rmwNode->IsMultiRegNode() && varTypeUsesFloatReg(node->TypeGet())));
+#endif
 
     if (use != nullptr)
     {

From b5a712ddfa161d6b9afe00b77401f2d5ecf05b6c Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Wed, 2 Oct 2024 12:15:11 +0100
Subject: [PATCH 31/31] Comment fixups

---
 src/coreclr/jit/lsraarm64.cpp | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index 0328249c9f7857..d124404beb0b86 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -1734,8 +1734,8 @@ void LinearScan::BuildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, co
 //
 //
 // Arguments:
-//    embeddedOpNode - The embdedded node of interest
-//    intrin         - The embedded node as a HWIntrinsic
+//    embeddedOpNode      - The embedded node of interest
+//    embeddedDelayFreeOp - The delay free operand corresponding to the embedded node
 //
 // Return Value:
 //    The number of sources consumed by this node.
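
Note (illustrative, not part of the patch): the `embedded` flag from patch 28 is easiest to read from its two call shapes in BuildHWIntrinsic. A condensed sketch; the two calls are taken from the diffs above, the surrounding control flow is simplified:

    // Standalone use: the unpredicated instruction form is selected, there is
    // no merge value, and an optional-embedded-masked intrinsic is not RMW.
    GenTree* delayFreeOp = getDelayFreeOperand(intrinsicTree);

    // Embedded under a ConditionalSelect: the predicated form (possibly
    // prefixed with a movprfx) is used, so Op(1) becomes the RMW "merge"
    // operand and must be delay-free.
    GenTreeHWIntrinsic* embeddedOp = getEmbeddedMaskOperand(intrin);
    if (embeddedOp != nullptr)
    {
        assert(delayFreeOp == nullptr);
        delayFreeOp = getDelayFreeOperand(embeddedOp, /* embedded */ true);
    }
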
@@ -2083,7 +2083,7 @@ bool RefPosition::isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegist
 // Arguments:
 //    intrinsicTree - Tree to check
 //    intrin        - The tree as a HWIntrinsic
-//    OpNum         - The Operand number in the intrinsic tree
+//    opNum         - The Operand number in the intrinsic tree
 //
 // Return Value:
 //    The candidates for the operand number
@@ -2092,6 +2092,8 @@ SingleTypeRegSet LinearScan::getOperandCandidates(GenTreeHWIntrinsic* intrinsicT
 {
     SingleTypeRegSet opCandidates = RBM_NONE;
 
+    assert(opNum <= intrinsicTree->GetOperandCount());
+
     if (HWIntrinsicInfo::IsLowVectorOperation(intrin.id))
     {
         assert(!varTypeIsMask(intrinsicTree->Op(opNum)->TypeGet()));
@@ -2164,12 +2166,12 @@ SingleTypeRegSet LinearScan::getOperandCandidates(GenTreeHWIntrinsic* intrinsicT
 //------------------------------------------------------------------------
 // getDelayFreeOperand: Get the delay free characteristics of the HWIntrinsic
 //
-// For a RMW intrinsic preference the RMW operand to the target.
-// For a simple move semantic between two SIMD registers, then preference the source operand.
+// For an RMW intrinsic, prefer the RMW operand to the target.
+// For a simple move semantic between two SIMD registers, prefer the source operand.
 //
 // Arguments:
 //    intrinsicTree - Tree to check
-//    embeddedOp    - The embedded operand, if any
+//    embedded      - If this is an embedded operand
 //
 // Return Value:
 //    The operand that needs to be delay freed
@@ -2310,7 +2312,10 @@ GenTree* LinearScan::getVectorAddrOperand(GenTreeHWIntrinsic* intrinsicTree)
 //    destIsConsecutive (out) - if the destination requires consecutive registers
 //
 // Return Value:
-//    The operand that requires consecutive registers
+//    The operand that requires consecutive registers.
+//
+//    If only the destination requires consecutive registers then destIsConsecutive will be
+//    true and the function will return nullptr.
 //
 GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bool* destIsConsecutive)
 {
@@ -2401,7 +2406,7 @@ GenTree* LinearScan::getConsecutiveRegistersOperand(const HWIntrinsic intrin, bo
 //------------------------------------------------------------------------
 //
 // Arguments:
-//    intrinsicTree - Tree to check
+//    intrin - Tree to check
 //
 // Return Value:
 //    The operand that is an embedded mask
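
Note (illustrative summary, not part of the patch series): taken together, the helper queries give BuildHWIntrinsic a fairly uniform per-operand build loop. A condensed sketch; the helper names and calls are from the patches above, but the loop body is simplified and omits immediates, address operands, and consecutive-register handling:

    GenTree* delayFreeOp = getDelayFreeOperand(intrinsicTree);

    GenTreeHWIntrinsic* embeddedOp = getEmbeddedMaskOperand(intrin);
    if (embeddedOp != nullptr)
    {
        delayFreeOp = getDelayFreeOperand(embeddedOp, /* embedded */ true);
    }

    for (size_t opNum = 1; opNum <= intrinsicTree->GetOperandCount(); opNum++)
    {
        GenTree*         operand    = intrinsicTree->Op(opNum);
        SingleTypeRegSet candidates = getOperandCandidates(intrinsicTree, intrin, opNum);

        // Only mark delay-free when the registers could actually collide.
        if ((delayFreeOp != nullptr) &&
            varTypeUsesSameRegType(delayFreeOp->TypeGet(), operand->TypeGet()))
        {
            srcCount += BuildDelayFreeUses(operand, delayFreeOp, candidates);
        }
        else
        {
            srcCount += BuildOperandUses(operand, candidates);
        }
    }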