diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp
index 9c5347eb7af9f5..7b58a9f4d6ad39 100644
--- a/src/coreclr/jit/assertionprop.cpp
+++ b/src/coreclr/jit/assertionprop.cpp
@@ -2570,6 +2570,213 @@ GenTree* Compiler::optVNBasedFoldExpr_Call_Memmove(GenTreeCall* call)
     return result;
 }
 
+#ifdef FEATURE_HW_INTRINSICS
+//------------------------------------------------------------------------------
+// AllComponentsEitherZeroOrAllBitsSet: Checks whether a value number represents
+//    a SIMD value where each component is either zero or all-bits-set.
+//
+// Arguments:
+//    comp     - Compiler object
+//    vn       - The value number to check
+//    baseType - The expected SIMD element base type
+//
+// Return Value:
+//    true if the VN is known to produce 0/AllBitsSet per element.
+//
+static bool AllComponentsEitherZeroOrAllBitsSet(Compiler* comp, ValueNum vn, var_types baseType)
+{
+    // Check for SIMD constant vectors (all-zero or all-bits-set).
+    // TODO: we can be less conservative and allow components to be
+    // either all-zero or all-bits-set, but not necessarily the same across the entire vector.
+    if (comp->vnStore->IsVNConstant(vn) && (comp->vnStore->TypeOfVN(vn) == TYP_SIMD16))
+    {
+        simd16_t val = comp->vnStore->GetConstantSimd16(vn);
+        return val.IsAllBitsSet() || val.IsZero();
+    }
+
+    VNFuncApp funcApp;
+    if (!comp->vnStore->GetVNFunc(vn, &funcApp) || (funcApp.m_func < VNF_HWI_FIRST) || (funcApp.m_func > VNF_HWI_LAST))
+    {
+        return false;
+    }
+
+    bool           isScalar;
+    NamedIntrinsic ni = static_cast<NamedIntrinsic>(funcApp.m_func - VNF_HWI_FIRST + NI_HW_INTRINSIC_START + 1);
+    genTreeOps     op = GenTreeHWIntrinsic::GetOperForHWIntrinsicId(ni, baseType, &isScalar);
+
+    if (isScalar)
+    {
+        return false;
+    }
+
+    switch (op)
+    {
+        case GT_EQ:
+        case GT_NE:
+        case GT_GT:
+        case GT_GE:
+        case GT_LE:
+        case GT_LT:
+            if ((funcApp.m_arity == 3) && varTypeIsIntegral(baseType))
+            {
+                // Check if the 3rd argument (base type) matches the expected base type for the intrinsic.
+                // It can actually be even wider than the base type of the vector.
+                VNFuncApp baseTypeFuncApp;
+                if (comp->vnStore->GetVNFunc(funcApp.m_args[2], &baseTypeFuncApp) &&
+                    baseTypeFuncApp.FuncIs(VNF_SimdType))
+                {
+                    return genTypeSize(baseType) <=
+                           genTypeSize((var_types)comp->vnStore->GetConstantInt32(baseTypeFuncApp.m_args[1]));
+                }
+            }
+            return false;
+
+        // For these operations we don't need to check the base type, as they are guaranteed
+        // to produce 0/AllBitsSet if the inputs do.
+
+        case GT_NOT:
+            // ~0 = AllBitsSet, ~AllBitsSet = 0
+            return AllComponentsEitherZeroOrAllBitsSet(comp, funcApp.m_args[0], baseType);
+
+        case GT_OR:
+        case GT_AND:
+        case GT_XOR:
+        case GT_AND_NOT:
+            return AllComponentsEitherZeroOrAllBitsSet(comp, funcApp.m_args[0], baseType) &&
+                   AllComponentsEitherZeroOrAllBitsSet(comp, funcApp.m_args[1], baseType);
+
+        default:
+            return false;
+    }
+}
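For context, the lane invariant this helper proves over value numbers is the one user code typically establishes with comparison intrinsics, and bitwise operations preserve it. A minimal C# sketch of that property (illustrative only, not part of the change):

```csharp
using System;
using System.Runtime.Intrinsics;

class LaneInvariantDemo
{
    static void Main()
    {
        Vector128<int> a = Vector128.Create(1, 2, 3, 4);

        // Equals produces 0 or AllBitsSet per lane...
        Vector128<int> eq = Vector128.Equals(a, Vector128.Create(3));

        // ...and ~, &, |, ^ preserve that property, mirroring the recursion above.
        Vector128<int> combined = ~eq | Vector128.Equals(a, Vector128.Create(2));

        for (int i = 0; i < Vector128<int>.Count; i++)
        {
            int lane = combined.GetElement(i);
            Console.WriteLine(lane is 0 or -1); // True for every lane
        }
    }
}
```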
+//------------------------------------------------------------------------------
+// optVNBasedFoldExpr_HWIntrinsic: Folds the given HW intrinsic using VN to a simpler tree.
+//
+// Arguments:
+//    block  - The block containing the tree.
+//    parent - The parent node of the tree.
+//    hw     - The HW intrinsic to fold
+//
+// Return Value:
+//    Returns a new tree or nullptr if nothing is changed.
+//
+GenTree* Compiler::optVNBasedFoldExpr_HWIntrinsic(BasicBlock* block, GenTree* parent, GenTreeHWIntrinsic* hw)
+{
+    // IndexOfWhereAllBitsSet and LastIndexOfWhereAllBitsSet can be simplified if
+    // we know that the input vector has only 0/AllBitsSet components.
+    // This is only needed for ARM64, where we don't have a movemask-like instruction.
+#ifdef TARGET_ARM64
+    if (hw->OperIsHWIntrinsic(NI_Vector128_IndexOfWhereAllBitsSet) ||
+        hw->OperIsHWIntrinsic(NI_Vector128_LastIndexOfWhereAllBitsSet))
+    {
+        var_types baseType = hw->GetSimdBaseType();
+
+        auto vnVisitor = [this, baseType](ValueNum vn) -> ValueNumStore::VNVisit {
+            if (AllComponentsEitherZeroOrAllBitsSet(this, vn, baseType))
+            {
+                return ValueNumStore::VNVisit::Continue;
+            }
+            return ValueNumStore::VNVisit::Abort;
+        };
+
+        // Check via VNVisitReachingVNs so we also cover cases where the input is a PHI node.
+        if (vnStore->VNVisitReachingVNs(optConservativeNormalVN(hw->Op(1)), vnVisitor) ==
+            ValueNumStore::VNVisit::Continue)
+        {
+            bool isLastIndex = hw->OperIsHWIntrinsic(NI_Vector128_LastIndexOfWhereAllBitsSet);
+
+            // Expand using the SHRN trick: SHRN narrows each element by shifting right by 4,
+            // producing a packed 64-bit mask. CTZ/CLZ then finds the element index.
+            //
+            // Each element contributes a fixed number of bits to the mask:
+            //   byte  ->  4 bits (16 elements -> 64 bits), divisor =  4 (>> 2)
+            //   short ->  8 bits ( 8 elements -> 64 bits), divisor =  8 (>> 3)
+            //   int   -> 16 bits ( 4 elements -> 64 bits), divisor = 16 (>> 4)
+            //   long  -> 32 bits ( 2 elements -> 64 bits), divisor = 32 (>> 5)
+
+            var_types shrnBaseType;
+            int       ctzDivisorLog2;
+
+            switch (baseType)
+            {
+                case TYP_BYTE:
+                case TYP_UBYTE:
+                    shrnBaseType   = TYP_UBYTE;
+                    ctzDivisorLog2 = 2;
+                    break;
+                case TYP_SHORT:
+                case TYP_USHORT:
+                    shrnBaseType   = TYP_UBYTE;
+                    ctzDivisorLog2 = 3;
+                    break;
+                case TYP_INT:
+                case TYP_UINT:
+                    shrnBaseType   = TYP_USHORT;
+                    ctzDivisorLog2 = 4;
+                    break;
+                case TYP_LONG:
+                case TYP_ULONG:
+                    shrnBaseType   = TYP_UINT;
+                    ctzDivisorLog2 = 5;
+                    break;
+                default:
+                    return nullptr;
+            }
+
+            GenTree* op1 = hw->Op(1);
+
+            // SHRN(input, #4) -> Vector64
+            GenTree* shrn = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, gtNewIconNode(4),
+                                                     NI_AdvSimd_ShiftRightLogicalNarrowingLower, shrnBaseType, 8);
+
+            // ToScalar as uint64
+            GenTree* mask = gtNewSimdHWIntrinsicNode(TYP_LONG, shrn, NI_Vector64_ToScalar, TYP_ULONG, 8);
+
+            // Store the mask in a temp (needed for both the bit scan and the condition)
+            unsigned maskTmp         = lvaGrabTemp(true DEBUGARG("SHRN mask temp"));
+            lvaTable[maskTmp].lvType = TYP_LONG;
+            GenTree* maskStore       = gtNewTempStore(maskTmp, mask);
+            GenTree* maskLcl1        = gtNewLclvNode(maskTmp, TYP_LONG);
+            GenTree* maskLcl2        = gtNewLclvNode(maskTmp, TYP_LONG);
+
+            GenTree* idx;
+
+            if (isLastIndex)
+            {
+                // (63 - CLZ64(mask)) >> ctzDivisorLog2
+                GenTree* clz = gtNewScalarHWIntrinsicNode(TYP_INT, maskLcl1, NI_ArmBase_Arm64_LeadingZeroCount);
+                GenTree* sub = gtNewOperNode(GT_SUB, TYP_INT, gtNewIconNode(63), clz);
+                idx          = gtNewOperNode(GT_RSZ, TYP_INT, sub, gtNewIconNode(ctzDivisorLog2));
+            }
+            else
+            {
+                // CTZ64(mask) >> ctzDivisorLog2
+                GenTree* rbit = gtNewScalarHWIntrinsicNode(TYP_LONG, maskLcl1, NI_ArmBase_Arm64_ReverseElementBits);
+                GenTree* clz  = gtNewScalarHWIntrinsicNode(TYP_INT, rbit, NI_ArmBase_Arm64_LeadingZeroCount);
+                idx           = gtNewOperNode(GT_RSZ, TYP_INT, clz, gtNewIconNode(ctzDivisorLog2));
+            }
+
+            // mask != 0 ? idx : -1
+            GenTree* cond   = gtNewOperNode(GT_NE, TYP_INT, maskLcl2, gtNewIconNode(0, TYP_LONG));
+            GenTree* select = gtNewConditionalNode(GT_SELECT, cond, idx, gtNewIconNode(-1, TYP_INT), TYP_INT);
+
+            // COMMA chain: (maskTmp = mask, mask != 0 ? idx : -1)
+            GenTree* result = gtNewOperNode(GT_COMMA, TYP_INT, maskStore, select);
+
+            JITDUMP("Expanding NI_Vector128_IndexOfWhereAllBitsSet or NI_Vector128_LastIndexOfWhereAllBitsSet to:\n");
+            DISPTREE(result);
+            JITDUMP("\n");
+
+            return result;
+        }
+    }
+#endif // TARGET_ARM64
+
+    return nullptr;
+}
+#endif // FEATURE_HW_INTRINSICS
+
 //------------------------------------------------------------------------------
 // optVNBasedFoldExpr_Call: Folds given call using VN to a simpler tree.
 //
@@ -2680,6 +2887,11 @@ GenTree* Compiler::optVNBasedFoldExpr(BasicBlock* block, GenTree* parent, GenTre
         case GT_CALL:
             return optVNBasedFoldExpr_Call(block, parent, tree->AsCall());
 
+#ifdef FEATURE_HW_INTRINSICS
+        case GT_HWINTRINSIC:
+            return optVNBasedFoldExpr_HWIntrinsic(block, parent, tree->AsHWIntrinsic());
+#endif
+
         // We can add more VN-based foldings here.
 
         default:
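The SHRN expansion above can be modeled scalar-side: each matching lane contributes a group of bits to a 64-bit mask, and the `ctzDivisorLog2` shift converts a bit position back into a lane index. A small model, assuming byte lanes (the names here are illustrative):

```csharp
using System;
using System.Numerics;

class ShrnMaskModel
{
    static void Main()
    {
        byte[] lanes = new byte[16];
        lanes[3] = 0xFF;
        lanes[9] = 0xFF;

        // SHRN #4 packs lane i into nibble i of a 64-bit mask:
        // an all-bits-set byte lane becomes four set bits at position 4*i.
        ulong mask = 0;
        for (int i = 0; i < 16; i++)
        {
            if (lanes[i] == 0xFF)
                mask |= 0xFUL << (4 * i);
        }

        // CTZ(mask) >> 2 gives the first index; (63 - CLZ(mask)) >> 2 the last.
        int first = mask == 0 ? -1 : BitOperations.TrailingZeroCount(mask) >> 2;
        int last  = mask == 0 ? -1 : (63 - BitOperations.LeadingZeroCount(mask)) >> 2;
        Console.WriteLine((first, last)); // (3, 9)
    }
}
```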
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 327d7c9358da7f..9cf652046eb143 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8666,9 +8666,12 @@ class Compiler
     GenTree* optVNBasedFoldConstExpr(BasicBlock* block, GenTree* parent, GenTree* tree);
     GenTree* optVNBasedFoldExpr(BasicBlock* block, GenTree* parent, GenTree* tree);
     GenTree* optVNBasedFoldExpr_Call(BasicBlock* block, GenTree* parent, GenTreeCall* call);
-    GenTree* optVNBasedFoldExpr_Call_Memmove(GenTreeCall* call);
-    GenTree* optVNBasedFoldExpr_Call_Memset(GenTreeCall* call);
-    GenTree* optVNBasedFoldExpr_Call_Memcmp(GenTreeCall* call);
+#ifdef FEATURE_HW_INTRINSICS
+    GenTree* optVNBasedFoldExpr_HWIntrinsic(BasicBlock* block, GenTree* parent, GenTreeHWIntrinsic* hw);
+#endif
+    GenTree* optVNBasedFoldExpr_Call_Memmove(GenTreeCall* call);
+    GenTree* optVNBasedFoldExpr_Call_Memset(GenTreeCall* call);
+    GenTree* optVNBasedFoldExpr_Call_Memcmp(GenTreeCall* call);
 
     AssertionIndex GetAssertionCount()
     {
diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index 5a18c0fecf850d..bcd8b342967b3b 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -1403,6 +1403,22 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
             break;
         }
 
+        case NI_Vector128_IndexOfWhereAllBitsSet:
+        case NI_Vector128_LastIndexOfWhereAllBitsSet:
+        {
+            assert(sig->numArgs == 1);
+
+            if (varTypeIsFloating(simdBaseType))
+            {
+                // Float/double use the managed fallback (which needs AsInt32/AsInt64 reinterpretation).
+                break;
+            }
+
+            op1     = impSIMDPopStack();
+            retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseType, simdSize);
+            break;
+        }
+
         case NI_Vector64_Floor:
         case NI_Vector128_Floor:
         {
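The import path above deliberately skips float/double lanes. A hedged sketch of why the managed fallback's AsInt32/AsInt64 reinterpretation is enough (the helper name here is my own, purely illustrative):

```csharp
using System;
using System.Runtime.Intrinsics;

class FloatReinterpretDemo
{
    // An "all bits set" float lane is a NaN bit pattern, so reinterpreting
    // as int lanes turns the per-lane check into an integer one.
    static int IndexOfAllBitsSetFloat(Vector128<float> v) =>
        Vector128.IndexOfWhereAllBitsSet(v.AsInt32());

    static void Main()
    {
        Vector128<float> mask = Vector128.Equals(Vector128.Create(1f, 2f, 3f, 4f),
                                                 Vector128.Create(3f));
        Console.WriteLine(IndexOfAllBitsSetFloat(mask)); // 2
    }
}
```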
diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h
index 1c6d080b66eaba..9dafbe98527d16 100644
--- a/src/coreclr/jit/hwintrinsiclistarm64.h
+++ b/src/coreclr/jit/hwintrinsiclistarm64.h
@@ -203,6 +203,7 @@ HARDWARE_INTRINSIC(Vector128, GreaterThanAny,
 HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqualAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqualAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(Vector128, IndexOfWhereAllBitsSet, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector128, IsEvenInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector128, IsFinite, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector128, IsInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
@@ -216,6 +217,7 @@ HARDWARE_INTRINSIC(Vector128, IsPositive,
 HARDWARE_INTRINSIC(Vector128, IsPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector128, IsSubnormal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector128, IsZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
+HARDWARE_INTRINSIC(Vector128, LastIndexOfWhereAllBitsSet, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector128, LessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector128, LessThanAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector128, LessThanAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
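For reference, the two helper intrinsics registered above are exercised from C# as below; per the HW_Flag_SpecialImport|HW_Flag_NoCodeGen flags they never reach codegen directly and must be expanded by the importer or the rationalizer:

```csharp
using System;
using System.Runtime.Intrinsics;

class TableEntriesDemo
{
    static void Main()
    {
        // Lanes 1 and 3 are AllBitsSet (-1); everything else is zero.
        Vector128<short> v = Vector128.Create((short)0, -1, 0, -1, 0, 0, 0, 0);
        Console.WriteLine(Vector128.IndexOfWhereAllBitsSet(v));     // 1
        Console.WriteLine(Vector128.LastIndexOfWhereAllBitsSet(v)); // 3
    }
}
```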
diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp
index e32a1b13ef6b52..1690891bef36f7 100644
--- a/src/coreclr/jit/rationalize.cpp
+++ b/src/coreclr/jit/rationalize.cpp
@@ -605,6 +605,15 @@ void Rationalizer::RewriteHWIntrinsic(GenTree** use, Compiler::GenTreeStack& par
             break;
         }
 
+#if defined(TARGET_ARM64)
+        case NI_Vector128_IndexOfWhereAllBitsSet:
+        case NI_Vector128_LastIndexOfWhereAllBitsSet:
+        {
+            RewriteHWIntrinsicIndexOfWhereAllBitsSet(use, parents);
+            break;
+        }
+#endif // TARGET_ARM64
+
         default:
         {
             break;
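The fallback dispatched above builds on ExtractMostSignificantBits: compare against AllBitsSet, then collapse each lane's sign bit into bit i of a scalar mask. A scalar model of that step:

```csharp
using System;
using System.Runtime.Intrinsics;

class EmsbModel
{
    static void Main()
    {
        Vector128<int> v  = Vector128.Create(0, -1, 7, -1);
        Vector128<int> eq = Vector128.Equals(v, Vector128<int>.AllBitsSet);

        // Lane i's most significant bit becomes bit i of the scalar mask.
        uint msb = Vector128.ExtractMostSignificantBits(eq);
        Console.WriteLine(Convert.ToString((int)msb, 2)); // 1010: lanes 1 and 3 match
    }
}
```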
@@ -1580,6 +1589,104 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree
     unreached();
 #endif
 }
+
+#if defined(TARGET_ARM64)
+//----------------------------------------------------------------------------------------------
+// RewriteHWIntrinsicIndexOfWhereAllBitsSet: Fallback expansion of IndexOfWhereAllBitsSet and
+//    LastIndexOfWhereAllBitsSet when the VN-based SHRN optimization did not apply.
+//
+// Expands to: CmpEq(input, AllBitsSet) -> ExtractMostSignificantBits -> CTZ/CLZ -> SELECT
+//
+// This reuses the existing RewriteHWIntrinsicExtractMsb by transforming the node into
+// an EMSB intrinsic and calling it directly (since post-order won't revisit the node).
+//
+void Rationalizer::RewriteHWIntrinsicIndexOfWhereAllBitsSet(GenTree** use, Compiler::GenTreeStack& parents)
+{
+    GenTreeHWIntrinsic* node = (*use)->AsHWIntrinsic();
+
+    NamedIntrinsic intrinsicId  = node->GetHWIntrinsicId();
+    var_types      simdBaseType = node->GetSimdBaseType();
+    unsigned       simdSize     = node->GetSimdSize();
+    var_types      simdType     = Compiler::getSIMDTypeForSize(simdSize);
+    bool           isLastIndex  = (intrinsicId == NI_Vector128_LastIndexOfWhereAllBitsSet);
+
+    GenTree* op1 = node->Op(1);
+
+    assert(!varTypeIsFloating(simdBaseType));
+
+    // Step 1: Insert CmpEq(input, AllBitsSet) before the node
+    GenTree* allBitsSet = m_compiler->gtNewAllBitsSetConNode(simdType);
+    BlockRange().InsertAfter(op1, allBitsSet);
+
+    GenTree* cmp = m_compiler->gtNewSimdCmpOpNode(GT_EQ, simdType, op1, allBitsSet, simdBaseType, simdSize);
+    BlockRange().InsertAfter(allBitsSet, cmp);
+
+    // Step 2: Transform the node into ExtractMostSignificantBits and expand it
+    node->ChangeHWIntrinsicId(NI_Vector128_ExtractMostSignificantBits);
+    node->SetSimdBaseType(simdBaseType);
+    node->Op(1) = cmp;
+
+    RewriteHWIntrinsicExtractMsb(use, parents);
+
+    // After RewriteHWIntrinsicExtractMsb, *use points to the EMSB result (TYP_INT or a cast node).
+    GenTree* emsbResult = *use;
+
+    // Step 3: Store the EMSB result in a temp (needed for both the bit scan and the SELECT condition)
+    LIR::Use emsbUse;
+    LIR::Use::MakeDummyUse(BlockRange(), emsbResult, &emsbUse);
+    emsbUse.ReplaceWithLclVar(m_compiler);
+    GenTree* emsbLcl = emsbUse.Def();
+
+    // Step 4: Bit-scan to find the element index
+    GenTree* scanResult;
+
+    if (isLastIndex)
+    {
+        // LastIndexOf: 31 - CLZ(emsb)
+        GenTree* clz = m_compiler->gtNewScalarHWIntrinsicNode(TYP_INT, emsbLcl, NI_ArmBase_LeadingZeroCount);
+        BlockRange().InsertAfter(emsbLcl, clz);
+
+        GenTree* icon31 = m_compiler->gtNewIconNode(31);
+        BlockRange().InsertAfter(clz, icon31);
+
+        scanResult = m_compiler->gtNewOperNode(GT_SUB, TYP_INT, icon31, clz);
+        BlockRange().InsertAfter(icon31, scanResult);
+    }
+    else
+    {
+        // IndexOf: CTZ = RBIT + CLZ (TrailingZeroCount)
+        GenTree* rbit = m_compiler->gtNewScalarHWIntrinsicNode(TYP_INT, emsbLcl, NI_ArmBase_ReverseElementBits);
+        BlockRange().InsertAfter(emsbLcl, rbit);
+
+        scanResult = m_compiler->gtNewScalarHWIntrinsicNode(TYP_INT, rbit, NI_ArmBase_LeadingZeroCount);
+        BlockRange().InsertAfter(rbit, scanResult);
+    }
+
+    // Step 5: GT_SELECT(emsbResult != 0, scanResult, -1)
+    GenTree* emsbCond = m_compiler->gtClone(emsbLcl);
+    BlockRange().InsertAfter(scanResult, emsbCond);
+
+    GenTree* minus1 = m_compiler->gtNewIconNode(-1, TYP_INT);
+    BlockRange().InsertAfter(emsbCond, minus1);
+
+    GenTreeConditional* select = m_compiler->gtNewConditionalNode(GT_SELECT, emsbCond, scanResult, minus1, TYP_INT);
+    BlockRange().InsertAfter(minus1, select);
+
+    // Replace the current use with the select node
+    if (parents.Height() > 1)
+    {
+        parents.Top(1)->ReplaceOperand(use, select);
+    }
+    else
+    {
+        *use = select;
+    }
+
+    (void)parents.Pop();
+    parents.Push(select);
+}
+#endif // TARGET_ARM64
+
 #endif // FEATURE_HW_INTRINSICS
 
 #ifdef TARGET_ARM64
diff --git a/src/coreclr/jit/rationalize.h b/src/coreclr/jit/rationalize.h
index d0449be7c70a56..1c61dc079011e8 100644
--- a/src/coreclr/jit/rationalize.h
+++ b/src/coreclr/jit/rationalize.h
@@ -65,6 +65,9 @@ class Rationalizer final : public Phase
 #endif // TARGET_XARCH
 
     void RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTreeStack& parents);
+#if defined(TARGET_ARM64)
+    void RewriteHWIntrinsicIndexOfWhereAllBitsSet(GenTree** use, Compiler::GenTreeStack& parents);
+#endif // TARGET_ARM64
 #endif // FEATURE_HW_INTRINSICS
 
 #ifdef TARGET_ARM64
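The two bit scans the new rationalizer function emits can be sanity-checked with BitOperations: ARM64 has no direct CTZ instruction, hence RBIT+CLZ for the first index and 31 - CLZ for the last; the zero-mask case is why the GT_SELECT guard exists:

```csharp
using System;
using System.Numerics;

class BitScanModel
{
    static void Main()
    {
        uint msbMask = 0b1010; // lanes 1 and 3 matched

        int first = BitOperations.TrailingZeroCount(msbMask);     // 1 (RBIT+CLZ on ARM64)
        int last  = 31 - BitOperations.LeadingZeroCount(msbMask); // 3
        Console.WriteLine((first, last));

        // msbMask == 0 would yield (32, -1) here, so the expansion
        // selects -1 explicitly whenever the mask is zero.
    }
}
```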
+/// +public static class IndexOfWhereAllBitsSet +{ + // ===================== IndexOfWhereAllBitsSet: generic path ===================== + + [Fact] + public static void IndexOf_Byte_Generic() + { + var v = Vector128.Create((byte)0, 0, 0, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + Assert.Equal(3, GenericByte(v)); + } + + [Fact] + public static void IndexOf_Int_Generic() + { + Assert.Equal(1, GenericInt(Vector128.Create(0, -1, 0, 0))); + } + + [Fact] + public static void IndexOf_NoMatch_Generic() + { + Assert.Equal(-1, GenericInt(Vector128.Create(0, 0, 0, 0))); + } + + [Fact] + public static void IndexOf_FirstElem_Generic() + { + Assert.Equal(0, GenericInt(Vector128.Create(-1, 0, 0, 0))); + } + + [Fact] + public static void IndexOf_AllSet_Generic() + { + Assert.Equal(0, GenericInt(Vector128.Create(-1, -1, -1, -1))); + } + + // Vectors with partial bits set (not 0, not AllBitsSet) — must not false-match + [Fact] + public static void IndexOf_Byte_PartialBits_Generic() + { + // 0x10 is non-zero but not 0xFF — should not match + var v = Vector128.Create((byte)0x10, 0x80, 0x7F, 0xFE, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + Assert.Equal(4, GenericByte(v)); + } + + [Fact] + public static void IndexOf_Short_PartialBits_Generic() + { + // 0x00FF and 0xFF00 are non-zero but not 0xFFFF — should not match + var v = Vector128.Create((short)0x00FF, unchecked((short)0xFF00), 0x7FFF, -1, 0, 0, 0, 0); + Assert.Equal(3, GenericShort(v)); + } + + [Fact] + public static void IndexOf_Int_PartialBits_Generic() + { + // 0x80000000 and 0x7FFFFFFF are non-zero but not -1 + var v = Vector128.Create(int.MinValue, int.MaxValue, -1, 0); + Assert.Equal(2, GenericInt(v)); + } + + [Fact] + public static void IndexOf_NoAllBitsSet_Generic() + { + // No element is AllBitsSet, all have partial bits + var v = Vector128.Create(1, 2, 3, 4); + Assert.Equal(-1, GenericInt(v)); + } + + // ===================== IndexOfWhereAllBitsSet: optimized path ===================== + + [Fact] + public static void IndexOf_Byte_Optimized() + { + var v = Vector128.Create((byte)0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + Assert.Equal(3, OptimizedByteEq(v, 42)); + } + + [Fact] + public static void IndexOf_Int_Optimized() + { + Assert.Equal(2, OptimizedIntEq(Vector128.Create(0, 0, 99, 0), 99)); + } + + [Fact] + public static void IndexOf_Short_Optimized() + { + Assert.Equal(5, OptimizedShortEq(Vector128.Create((short)0, 0, 0, 0, 0, 7, 0, 0), 7)); + } + + [Fact] + public static void IndexOf_Long_Optimized() + { + Assert.Equal(1, OptimizedLongEq(Vector128.Create(0L, -1L), -1L)); + } + + [Fact] + public static void IndexOf_NoMatch_Optimized() + { + Assert.Equal(-1, OptimizedIntEq(Vector128.Create(1, 2, 3, 4), 99)); + } + + [Fact] + public static void IndexOf_FirstElem_Optimized() + { + Assert.Equal(0, OptimizedIntEq(Vector128.Create(99, 0, 0, 0), 99)); + } + + // ===================== LastIndexOfWhereAllBitsSet: generic path ===================== + + [Fact] + public static void LastIndexOf_Int_Generic() + { + Assert.Equal(3, GenericLastInt(Vector128.Create(-1, 0, 0, -1))); + } + + [Fact] + public static void LastIndexOf_NoMatch_Generic() + { + Assert.Equal(-1, GenericLastInt(Vector128.Create(0, 0, 0, 0))); + } + + // ===================== LastIndexOfWhereAllBitsSet: optimized path ===================== + + [Fact] + public static void LastIndexOf_Int_Optimized() + { + Assert.Equal(2, OptimizedLastIntEq(Vector128.Create(99, 0, 99, 0), 99)); + } + + [Fact] + public static void LastIndexOf_Byte_Optimized() + { + var v = 
+
+    [Fact]
+    public static void LastIndexOf_Byte_Optimized()
+    {
+        var v = Vector128.Create((byte)42, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42);
+        Assert.Equal(15, OptimizedLastByteEq(v, 42));
+    }
+
+    [Fact]
+    public static void LastIndexOf_NoMatch_Optimized()
+    {
+        Assert.Equal(-1, OptimizedLastIntEq(Vector128.Create(1, 2, 3, 4), 99));
+    }
+
+    // ===================== Helpers =====================
+
+    [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
+    static int GenericByte(Vector128<byte> v) => Vector128.IndexOfWhereAllBitsSet(v);
+
+    [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
+    static int GenericInt(Vector128<int> v) => Vector128.IndexOfWhereAllBitsSet(v);
+
+    [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
+    static int GenericShort(Vector128<short> v) => Vector128.IndexOfWhereAllBitsSet(v);
+
+    [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
+    static int GenericLastInt(Vector128<int> v) => Vector128.LastIndexOfWhereAllBitsSet(v);
+
+    [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
+    static int OptimizedByteEq(Vector128<byte> v, byte needle)
+        => Vector128.IndexOfWhereAllBitsSet(Vector128.Equals(v, Vector128.Create(needle)));
+
+    [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
+    static int OptimizedIntEq(Vector128<int> v, int needle)
+        => Vector128.IndexOfWhereAllBitsSet(Vector128.Equals(v, Vector128.Create(needle)));
+
+    [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
+    static int OptimizedShortEq(Vector128<short> v, short needle)
+        => Vector128.IndexOfWhereAllBitsSet(Vector128.Equals(v, Vector128.Create(needle)));
+
+    [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
+    static int OptimizedLongEq(Vector128<long> v, long needle)
+        => Vector128.IndexOfWhereAllBitsSet(Vector128.Equals(v, Vector128.Create(needle)));
+
+    [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
+    static int OptimizedLastIntEq(Vector128<int> v, int needle)
+        => Vector128.LastIndexOfWhereAllBitsSet(Vector128.Equals(v, Vector128.Create(needle)));
+
+    [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
+    static int OptimizedLastByteEq(Vector128<byte> v, byte needle)
+        => Vector128.LastIndexOfWhereAllBitsSet(Vector128.Equals(v, Vector128.Create(needle)));
+}
diff --git a/src/tests/JIT/HardwareIntrinsics/General/HwiOp/IndexOfWhereAllBitsSet.csproj b/src/tests/JIT/HardwareIntrinsics/General/HwiOp/IndexOfWhereAllBitsSet.csproj
new file mode 100644
index 00000000000000..501217e4d86892
--- /dev/null
+++ b/src/tests/JIT/HardwareIntrinsics/General/HwiOp/IndexOfWhereAllBitsSet.csproj
@@ -0,0 +1,9 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>
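When extending these tests, a plain scalar oracle is handy for cross-checking expected indices. This sketch (name and shape are my own, not part of the test file) mirrors the intrinsic's contract for int lanes:

```csharp
using System;
using System.Runtime.Intrinsics;

static class ScalarOracle
{
    // Scalar reference: index of the first lane whose bits are all set.
    public static int IndexOfWhereAllBitsSet(Vector128<int> v)
    {
        for (int i = 0; i < Vector128<int>.Count; i++)
        {
            if (v.GetElement(i) == -1)
                return i;
        }
        return -1; // no lane matched
    }

    static void Main()
    {
        Console.WriteLine(IndexOfWhereAllBitsSet(Vector128.Create(0, -1, 0, 0))); // 1
    }
}
```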