From 7ebea2149e66f7dfb72dd66dc10982e16891eb34 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Fri, 13 Sep 2024 11:43:00 +0100 Subject: [PATCH 1/8] ARM64: Fix lsra for AdvSimd_LoadAndInsertScalar --- src/coreclr/jit/lsraarm64.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 79bf3c16778df6..4946a2e5eebb87 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1724,7 +1724,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } } } - else if (intrinsicTree->OperIsMemoryLoadOrStore()) + else if ((intrinsicTree->OperIsMemoryLoadOrStore()) && (intrin.id != NI_AdvSimd_LoadAndInsertScalar)) { srcCount += BuildAddrUses(intrin.op1); } @@ -2151,7 +2151,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { SingleTypeRegSet candidates = lowVectorOperandNum == 3 ? lowVectorCandidates : RBM_NONE; - if (isRMW) + if (intrin.id == NI_AdvSimd_LoadAndInsertScalar) + { + srcCount += BuildAddrUses(intrin.op3); + } + else if (isRMW) { srcCount += BuildDelayFreeUses(intrin.op3, (tgtPrefOp2 ? intrin.op2 : intrin.op1), candidates); } From 067a31109ff5501b7d7aac67fcd69c6cc2f79bea Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 16 Sep 2024 09:34:02 +0100 Subject: [PATCH 2/8] Fix comment --- src/coreclr/jit/hwintrinsic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index c907fd95d036f5..8fd35e92f64758 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -204,7 +204,7 @@ enum HWIntrinsicFlag : unsigned int // The intrinsic uses a mask in arg1 to select elements present in the result HW_Flag_ExplicitMaskedOperation = 0x20000, - // The intrinsic uses a mask in arg1 (either explicitly, embdedd or optionally embedded) to select elements present + // The intrinsic uses a mask in arg1 (either explicitly, embedded or optionally embedded) to select elements present // in the result, and must use a low register. HW_Flag_LowMaskedOperation = 0x40000, From f467f77527b4a71abc07e753a4953eccb5f60d0e Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 16 Sep 2024 13:59:00 +0100 Subject: [PATCH 3/8] Expand LoadAndInsertScalar testing --- .../Shared/LoadAndInsertScalarTest.template | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/LoadAndInsertScalarTest.template b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/LoadAndInsertScalarTest.template index 12c87a5134c74a..4d51a82f999b94 100644 --- a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/LoadAndInsertScalarTest.template +++ b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/LoadAndInsertScalarTest.template @@ -45,6 +45,9 @@ namespace JIT.HardwareIntrinsics.Arm // Validates passing an instance member of a class works test.RunClassFldScenario(); + // Validates passing an non const value works + test.RunClassFldScenario_NotConstant(); + // Validates passing the field of a local struct works test.RunStructLclFldScenario(); @@ -150,6 +153,7 @@ namespace JIT.HardwareIntrinsics.Arm private static {Op1BaseType}[] _data1 = new {Op1BaseType}[Op1ElementCount]; private {Op1VectorType}<{Op1BaseType}> _fld1; + private byte _fld2; private {Op1BaseType} _fld3; private DataTable _dataTable; @@ -161,6 +165,7 @@ namespace JIT.HardwareIntrinsics.Arm for (var i = 0; i < Op1ElementCount; i++) { _data1[i] = {NextValueOp1}; } Unsafe.CopyBlockUnaligned(ref Unsafe.As<{Op1VectorType}<{Op1BaseType}>, byte>(ref _fld1), ref Unsafe.As<{Op1BaseType}, byte>(ref _data1[0]), (uint)Unsafe.SizeOf<{Op1VectorType}<{Op1BaseType}>>()); + _fld2 = {ElementIndex}; _fld3 = {NextValueOp3}; for (var i = 0; i < Op1ElementCount; i++) { _data1[i] = {NextValueOp1}; } @@ -247,6 +252,20 @@ namespace JIT.HardwareIntrinsics.Arm ValidateResult(_fld1, _fld3, _dataTable.outArrayPtr); } + public void RunClassFldScenario_NotConstant() + { + TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_NotConstant)); + + fixed ({Op1BaseType}* pFld3 = &_fld3) + { + var result = {Isa}.{Method}(_fld1, _fld2, pFld3); + + Unsafe.Write(_dataTable.outArrayPtr, result); + } + + ValidateResult(_fld1, _fld3, _dataTable.outArrayPtr); + } + public void RunStructLclFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario)); From 2d9392dc0192261a882860862a33d031be56112e Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 16 Sep 2024 13:59:36 +0100 Subject: [PATCH 4/8] Only delayfree when register types match --- src/coreclr/jit/lsrabuild.cpp | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 3c390ee213941c..22bb091cfe5051 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -3846,9 +3846,23 @@ int LinearScan::BuildDelayFreeUses(GenTree* node, return 0; } } + + // Don't mark as delay free if there is a mismatch in register types + bool addDelayFreeUses = false; + // Multi register nodes should no go via this route. + assert(!node->IsMultiRegNode()); + if (rmwNode == nullptr || varTypeUsesSameRegType(rmwNode->TypeGet(), node->TypeGet()) + || (rmwNode->IsMultiRegNode() && varTypeUsesFloatReg(node->TypeGet()))) + { + addDelayFreeUses = true; + } + if (use != nullptr) { - AddDelayFreeUses(use, rmwNode); + if (addDelayFreeUses) + { + AddDelayFreeUses(use, rmwNode); + } if (useRefPositionRef != nullptr) { *useRefPositionRef = use; @@ -3864,15 +3878,20 @@ int LinearScan::BuildDelayFreeUses(GenTree* node, if (addrMode->HasBase() && !addrMode->Base()->isContained()) { use = BuildUse(addrMode->Base(), candidates); - AddDelayFreeUses(use, rmwNode); - + if (addDelayFreeUses) + { + AddDelayFreeUses(use, rmwNode); + } srcCount++; } + if (addrMode->HasIndex() && !addrMode->Index()->isContained()) { use = BuildUse(addrMode->Index(), candidates); - AddDelayFreeUses(use, rmwNode); - + if (addDelayFreeUses) + { + AddDelayFreeUses(use, rmwNode); + } srcCount++; } From 6f5b8f586965c72d6739e2a146531a7df7620916 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 16 Sep 2024 15:28:34 +0100 Subject: [PATCH 5/8] fix formatting --- src/coreclr/jit/lsrabuild.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 22bb091cfe5051..aae4c12d61068a 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -3851,8 +3851,8 @@ int LinearScan::BuildDelayFreeUses(GenTree* node, bool addDelayFreeUses = false; // Multi register nodes should no go via this route. assert(!node->IsMultiRegNode()); - if (rmwNode == nullptr || varTypeUsesSameRegType(rmwNode->TypeGet(), node->TypeGet()) - || (rmwNode->IsMultiRegNode() && varTypeUsesFloatReg(node->TypeGet()))) + if (rmwNode == nullptr || varTypeUsesSameRegType(rmwNode->TypeGet(), node->TypeGet()) || + (rmwNode->IsMultiRegNode() && varTypeUsesFloatReg(node->TypeGet()))) { addDelayFreeUses = true; } From 5476323351033f2328a66f564a25337043abda12 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Fri, 20 Sep 2024 12:46:10 +0100 Subject: [PATCH 6/8] Fix comment --- src/coreclr/jit/lsrabuild.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index aae4c12d61068a..1e8937b1bf046e 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -3849,7 +3849,7 @@ int LinearScan::BuildDelayFreeUses(GenTree* node, // Don't mark as delay free if there is a mismatch in register types bool addDelayFreeUses = false; - // Multi register nodes should no go via this route. + // Multi register nodes should not go via this route. assert(!node->IsMultiRegNode()); if (rmwNode == nullptr || varTypeUsesSameRegType(rmwNode->TypeGet(), node->TypeGet()) || (rmwNode->IsMultiRegNode() && varTypeUsesFloatReg(node->TypeGet()))) From b1b063c553aadff8773f09cb0f2f031bd736c30f Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 23 Sep 2024 09:57:36 +0100 Subject: [PATCH 7/8] Check that multiregister nodes use fp registers --- src/coreclr/jit/lsraarm64.cpp | 3 ++- src/coreclr/jit/lsrabuild.cpp | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 4946a2e5eebb87..b08e0b47d2dbde 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1663,12 +1663,13 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert(isRMW); assert(intrin.op1->OperIs(GT_FIELD_LIST)); + GenTreeFieldList* op1 = intrin.op1->AsFieldList(); assert(compiler->info.compNeedsConsecutiveRegisters); for (GenTreeFieldList::Use& use : op1->Uses()) { - BuildDelayFreeUses(use.GetNode(), intrinsicTree); + BuildDelayFreeUses(use.GetNode(), intrin.op1); srcCount++; } } diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 1e8937b1bf046e..5180edd4077bec 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -3851,6 +3851,9 @@ int LinearScan::BuildDelayFreeUses(GenTree* node, bool addDelayFreeUses = false; // Multi register nodes should not go via this route. assert(!node->IsMultiRegNode()); + // Multi register nodes should always use fp registers (this includes vectors). + assert(rmwNode == nullptr || varTypeUsesFloatReg(rmwNode->TypeGet()) || !rmwNode->IsMultiRegNode()); + assert(varTypeUsesFloatReg(node->TypeGet()) || !node->IsMultiRegNode()); if (rmwNode == nullptr || varTypeUsesSameRegType(rmwNode->TypeGet(), node->TypeGet()) || (rmwNode->IsMultiRegNode() && varTypeUsesFloatReg(node->TypeGet()))) { From a8ee4626a5fb6703a53a381036db40f772386188 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 24 Sep 2024 08:33:52 +0100 Subject: [PATCH 8/8] Revert rmw assert --- src/coreclr/jit/lsraarm64.cpp | 2 +- src/coreclr/jit/lsrabuild.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index b08e0b47d2dbde..3118f81f3b29b8 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1669,7 +1669,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou for (GenTreeFieldList::Use& use : op1->Uses()) { - BuildDelayFreeUses(use.GetNode(), intrin.op1); + BuildDelayFreeUses(use.GetNode(), intrinsicTree); srcCount++; } } diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 5180edd4077bec..368a5e82d53c75 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -3852,7 +3852,6 @@ int LinearScan::BuildDelayFreeUses(GenTree* node, // Multi register nodes should not go via this route. assert(!node->IsMultiRegNode()); // Multi register nodes should always use fp registers (this includes vectors). - assert(rmwNode == nullptr || varTypeUsesFloatReg(rmwNode->TypeGet()) || !rmwNode->IsMultiRegNode()); assert(varTypeUsesFloatReg(node->TypeGet()) || !node->IsMultiRegNode()); if (rmwNode == nullptr || varTypeUsesSameRegType(rmwNode->TypeGet(), node->TypeGet()) || (rmwNode->IsMultiRegNode() && varTypeUsesFloatReg(node->TypeGet())))