From b2f2866e60d1bd56b8d61676f03bd601bc16a1c2 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 9 Apr 2026 10:45:45 -0700 Subject: [PATCH 1/3] Improve the codegen for Count, IndexOf, and LastIndexOf on Arm64 --- .../Runtime/Intrinsics/Vector128.Numerics.cs | 40 +++----- .../System/Runtime/Intrinsics/Vector128.cs | 98 ++++++++++++++++-- .../System/Runtime/Intrinsics/Vector256.cs | 65 ++++++++++-- .../System/Runtime/Intrinsics/Vector512.cs | 65 ++++++++++-- .../src/System/Runtime/Intrinsics/Vector64.cs | 99 +++++++++++++++++-- 5 files changed, 311 insertions(+), 56 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.Numerics.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.Numerics.cs index 0ccc17eb0e85c1..a5147923638941 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.Numerics.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.Numerics.cs @@ -183,78 +183,62 @@ public static Vector AsVector(this Vector128 value) /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int Count(Vector2 vector, float value) => BitOperations.PopCount(Equals(vector.AsVector128(), Create(value, value, -1, -1)).ExtractMostSignificantBits()); + internal static int Count(Vector2 vector, float value) => CountMatches(Equals(vector.AsVector128(), Create(value, value, -1, -1))); /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int Count(Vector3 vector, float value) => BitOperations.PopCount(Equals(vector.AsVector128(), Create(value, value, value, -1)).ExtractMostSignificantBits()); + internal static int Count(Vector3 vector, float value) => CountMatches(Equals(vector.AsVector128(), Create(value, value, value, -1))); /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int CountWhereAllBitsSet(Vector2 vector) => BitOperations.PopCount(Equals(vector.AsVector128().AsInt32(), Vector128.AllBitsSet).ExtractMostSignificantBits()); + internal static int CountWhereAllBitsSet(Vector2 vector) => CountWhereAllBitsSet(vector.AsVector128()); /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int CountWhereAllBitsSet(Vector3 vector) => BitOperations.PopCount(Equals(vector.AsVector128().AsInt32(), Vector128.AllBitsSet).ExtractMostSignificantBits()); + internal static int CountWhereAllBitsSet(Vector3 vector) => CountWhereAllBitsSet(vector.AsVector128()); /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int IndexOf(Vector2 vector, float value) - { - int result = BitOperations.TrailingZeroCount(Equals(vector.AsVector128(), Create(value, value, -1, -1)).ExtractMostSignificantBits()); - return (result != 32) ? result : -1; - } + internal static int IndexOf(Vector2 vector, float value) => IndexOfFirstMatch(Equals(vector.AsVector128(), Create(value, value, -1, -1))); /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int IndexOf(Vector3 vector, float value) - { - int result = BitOperations.TrailingZeroCount(Equals(vector.AsVector128(), Create(value, value, value, -1)).ExtractMostSignificantBits()); - return (result != 32) ? result : -1; - } + internal static int IndexOf(Vector3 vector, float value) => IndexOfFirstMatch(Equals(vector.AsVector128(), Create(value, value, value, -1))); /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int IndexOfWhereAllBitsSet(Vector2 vector) - { - int result = BitOperations.TrailingZeroCount(Equals(vector.AsVector128().AsInt32(), Vector128.AllBitsSet).ExtractMostSignificantBits()); - return (result != 32) ? result : -1; - } + internal static int IndexOfWhereAllBitsSet(Vector2 vector) => IndexOfWhereAllBitsSet(vector.AsVector128()); /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int IndexOfWhereAllBitsSet(Vector3 vector) - { - int result = BitOperations.TrailingZeroCount(Equals(vector.AsVector128().AsInt32(), Vector128.AllBitsSet).ExtractMostSignificantBits()); - return (result != 32) ? result : -1; - } + internal static int IndexOfWhereAllBitsSet(Vector3 vector) => IndexOfWhereAllBitsSet(vector.AsVector128()); /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int LastIndexOf(Vector2 vector, float value) => 31 - BitOperations.LeadingZeroCount(Equals(vector.AsVector128(), Create(value, value, -1, -1)).ExtractMostSignificantBits()); + internal static int LastIndexOf(Vector2 vector, float value) => IndexOfLastMatch(Equals(vector.AsVector128(), Create(value, value, -1, -1))); /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int LastIndexOf(Vector3 vector, float value) => 31 - BitOperations.LeadingZeroCount(Equals(vector.AsVector128(), Create(value, value, value, -1)).ExtractMostSignificantBits()); + internal static int LastIndexOf(Vector3 vector, float value) => IndexOfLastMatch(Equals(vector.AsVector128(), Create(value, value, value, -1))); /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int LastIndexOfWhereAllBitsSet(Vector2 vector) => 31 - BitOperations.LeadingZeroCount(Equals(vector.AsVector128().AsInt32(), Vector128.AllBitsSet).ExtractMostSignificantBits()); + internal static int LastIndexOfWhereAllBitsSet(Vector2 vector) => LastIndexOfWhereAllBitsSet(vector.AsVector128()); /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int LastIndexOfWhereAllBitsSet(Vector3 vector) => 31 - BitOperations.LeadingZeroCount(Equals(vector.AsVector128().AsInt32(), Vector128.AllBitsSet).ExtractMostSignificantBits()); + internal static int LastIndexOfWhereAllBitsSet(Vector3 vector) => LastIndexOfWhereAllBitsSet(vector.AsVector128()); /// [Intrinsic] diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index dc20ffbd556bf5..1ba14646ab81f6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -898,7 +898,7 @@ public static Vector128 Cos(Vector128 vector) /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int Count(Vector128 vector, T value) => BitOperations.PopCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); + public static int Count(Vector128 vector, T value) => CountMatches(Equals(vector, Create(value))); /// [Intrinsic] @@ -1978,11 +1978,7 @@ public static Vector128 Hypot(Vector128 x, Vector128 y) /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int IndexOf(Vector128 vector, T value) - { - int result = BitOperations.TrailingZeroCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); - return (result != 32) ? result : -1; - } + public static int IndexOf(Vector128 vector, T value) => IndexOfFirstMatch(Equals(vector, Create(value))); /// [Intrinsic] @@ -2213,7 +2209,7 @@ public static Vector128 IsSubnormal(Vector128 vector) /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int LastIndexOf(Vector128 vector, T value) => 31 - BitOperations.LeadingZeroCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); + public static int LastIndexOf(Vector128 vector, T value) => IndexOfLastMatch(Equals(vector, Create(value))); /// [Intrinsic] @@ -4505,6 +4501,66 @@ public static Vector128 WithUpper(this Vector128 vector, Vector64 va [Intrinsic] public static Vector128 Xor(Vector128 left, Vector128 right) => left ^ right; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(AdvSimd))] + internal static ulong AdvSimdExtractBitMask(Vector128 vector) + { + if (!AdvSimd.IsSupported) + { + ThrowHelper.ThrowNotSupportedException(); + } + + // This expects vector to have each element be one of Zero or AllBitsSet + // and will not produce correct results otherwise. + // + // Given this, we can treat it as ushort and do a logical-right-shift by 4 to + // compact the mask into half the space, giving us the following possibilities for + // each pair of bytes: + // * 0x00_00 - 0x00 + // * 0x00_FF - 0x0F + // * 0xFF_00 - 0xF0 + // * 0xFF_FF - 0xFF + // + // This allows us to extract the full metadata as a 64-bit scalar which can be then + // be consumed by bit-counting APIs, such as PopCount, LeadingZeroCount, or TrailingZeroCount, + // and then adjusted by AdvSimdFixupBitCount to get the actual count of elements + // that were masked. + + return AdvSimd.ShiftRightLogicalNarrowingLower(vector.AsUInt16(), 4).AsUInt64().ToScalar(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(AdvSimd))] + internal static int AdvSimdFixupBitCount(int bitCount) + { + if (!AdvSimd.IsSupported) + { + ThrowHelper.ThrowNotSupportedException(); + } + + // This API is meant to be consumed alongside AdvSimdExtractBitMask and will + // not produce correct results for arbitary inputs. It adjusts the bit count + // assuming that sequences of 1 or 0 were in groups of 4 bits per byte. + + unsafe + { + return bitCount >>> (2 + int.Log2(sizeof(T))); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int CountMatches(Vector128 vector) + { + if (AdvSimd.IsSupported) + { + return AdvSimdFixupBitCount(BitOperations.PopCount(AdvSimdExtractBitMask(vector))); + } + else + { + return BitOperations.PopCount(vector.ExtractMostSignificantBits()); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static T GetElementUnsafe(in this Vector128 vector, int index) { @@ -4513,6 +4569,34 @@ internal static T GetElementUnsafe(in this Vector128 vector, int index) return Unsafe.Add(ref address, index); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IndexOfFirstMatch(Vector128 vector) + { + if (AdvSimd.IsSupported) + { + int result = AdvSimdFixupBitCount(BitOperations.TrailingZeroCount(AdvSimdExtractBitMask(vector))); + return (result != Vector128.Count) ? result : -1; + } + else + { + int result = BitOperations.TrailingZeroCount(vector.ExtractMostSignificantBits()); + return (result != 32) ? result : -1; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IndexOfLastMatch(Vector128 vector) + { + if (AdvSimd.IsSupported) + { + return (Vector128.Count - 1) - AdvSimdFixupBitCount(BitOperations.LeadingZeroCount(AdvSimdExtractBitMask(vector))); + } + else + { + return 31 - BitOperations.LeadingZeroCount(vector.ExtractMostSignificantBits()); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static void SetElementUnsafe(in this Vector128 vector, int index, T value) { diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index c51c1f5329ef74..cd446c7646ac03 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -899,7 +899,7 @@ public static Vector256 Cos(Vector256 vector) /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int Count(Vector256 vector, T value) => BitOperations.PopCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); + public static int Count(Vector256 vector, T value) => CountMatches(Equals(vector, Create(value))); /// [Intrinsic] @@ -2056,11 +2056,7 @@ public static Vector256 Hypot(Vector256 x, Vector256 y) /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int IndexOf(Vector256 vector, T value) - { - int result = BitOperations.TrailingZeroCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); - return (result != 32) ? result : -1; - } + public static int IndexOf(Vector256 vector, T value) => IndexOfFirstMatch(Equals(vector, Create(value))); /// [Intrinsic] @@ -2291,7 +2287,7 @@ public static Vector256 IsSubnormal(Vector256 vector) /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int LastIndexOf(Vector256 vector, T value) => 31 - BitOperations.LeadingZeroCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); + public static int LastIndexOf(Vector256 vector, T value) => IndexOfLastMatch(Equals(vector, Create(value))); /// [Intrinsic] @@ -4452,6 +4448,20 @@ public static Vector256 WithUpper(this Vector256 vector, Vector128 v [Intrinsic] public static Vector256 Xor(Vector256 left, Vector256 right) => left ^ right; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int CountMatches(Vector256 vector) + { + if (Vector256.IsHardwareAccelerated) + { + return BitOperations.PopCount(vector.ExtractMostSignificantBits()); + } + else + { + return Vector128.CountMatches(vector._lower) + + Vector128.CountMatches(vector._upper); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static T GetElementUnsafe(in this Vector256 vector, int index) { @@ -4460,6 +4470,47 @@ internal static T GetElementUnsafe(in this Vector256 vector, int index) return Unsafe.Add(ref address, index); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IndexOfFirstMatch(Vector256 vector) + { + if (Vector256.IsHardwareAccelerated) + { + int result = BitOperations.TrailingZeroCount(vector.ExtractMostSignificantBits()); + return (result != 32) ? result : -1; + } + else + { + int result = Vector128.IndexOfFirstMatch(vector._lower); + + if (result >= 0) + { + return result; + } + + result = Vector128.IndexOfFirstMatch(vector._upper); + return result + ((result >= 0) ? Vector128.Count : 0); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IndexOfLastMatch(Vector256 vector) + { + if (Vector256.IsHardwareAccelerated) + { + return 31 - BitOperations.LeadingZeroCount(vector.ExtractMostSignificantBits()); + } + else + { + int result = Vector128.IndexOfLastMatch(vector._upper); + + if (result >= 0) + { + return result + Vector128.Count; + } + return Vector128.IndexOfLastMatch(vector._lower); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static void SetElementUnsafe(in this Vector256 vector, int index, T value) { diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs index 5bc4b5e0964a52..6e71109c3ab9c5 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs @@ -795,7 +795,7 @@ public static Vector512 Cos(Vector512 vector) /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int Count(Vector512 vector, T value) => BitOperations.PopCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); + public static int Count(Vector512 vector, T value) => CountMatches(Equals(vector, Create(value))); /// [Intrinsic] @@ -2075,11 +2075,7 @@ public static Vector512 Hypot(Vector512 x, Vector512 y) /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int IndexOf(Vector512 vector, T value) - { - int result = BitOperations.TrailingZeroCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); - return (result != 64) ? result : -1; - } + public static int IndexOf(Vector512 vector, T value) => IndexOfFirstMatch(Equals(vector, Create(value))); /// [Intrinsic] @@ -2310,7 +2306,7 @@ public static Vector512 IsSubnormal(Vector512 vector) /// [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int LastIndexOf(Vector512 vector, T value) => 63 - BitOperations.LeadingZeroCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); + public static int LastIndexOf(Vector512 vector, T value) => IndexOfLastMatch(Equals(vector, Create(value))); /// [Intrinsic] @@ -4422,6 +4418,20 @@ public static Vector512 WithUpper(this Vector512 vector, Vector256 v [Intrinsic] public static Vector512 Xor(Vector512 left, Vector512 right) => left ^ right; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int CountMatches(Vector512 vector) + { + if (Vector512.IsHardwareAccelerated) + { + return BitOperations.PopCount(vector.ExtractMostSignificantBits()); + } + else + { + return Vector256.CountMatches(vector._lower) + + Vector256.CountMatches(vector._upper); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static T GetElementUnsafe(in this Vector512 vector, int index) { @@ -4430,6 +4440,47 @@ internal static T GetElementUnsafe(in this Vector512 vector, int index) return Unsafe.Add(ref address, index); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IndexOfFirstMatch(Vector512 vector) + { + if (Vector512.IsHardwareAccelerated) + { + int result = BitOperations.TrailingZeroCount(vector.ExtractMostSignificantBits()); + return (result != 64) ? result : -1; + } + else + { + int result = Vector256.IndexOfFirstMatch(vector._lower); + + if (result >= 0) + { + return result; + } + + result = Vector256.IndexOfFirstMatch(vector._upper); + return result + ((result >= 0) ? Vector256.Count : 0); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IndexOfLastMatch(Vector512 vector) + { + if (Vector512.IsHardwareAccelerated) + { + return 63 - BitOperations.LeadingZeroCount(vector.ExtractMostSignificantBits()); + } + else + { + int result = Vector256.IndexOfLastMatch(vector._upper); + + if (result >= 0) + { + return result + Vector256.Count; + } + return Vector256.IndexOfLastMatch(vector._lower); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static void SetElementUnsafe(in this Vector512 vector, int index, T value) { diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs index 7077fe391347e6..2b94436cd4446b 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs @@ -6,6 +6,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics.Arm; namespace System.Runtime.Intrinsics { @@ -866,7 +867,7 @@ public static Vector64 Cos(Vector64 vector) /// The type of and () is not supported. [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int Count(Vector64 vector, T value) => BitOperations.PopCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); + public static int Count(Vector64 vector, T value) => CountMatches(Equals(vector, Create(value))); /// Determines the number of elements in a vector that have all their bits set. /// The type of the elements in the vector. @@ -1888,11 +1889,7 @@ public static Vector64 Hypot(Vector64 x, Vector64 y) /// The type of and () is not supported. [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int IndexOf(Vector64 vector, T value) - { - int result = BitOperations.TrailingZeroCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); - return (result != 32) ? result : -1; - } + public static int IndexOf(Vector64 vector, T value) => IndexOfFirstMatch(Equals(vector, Create(value))); /// Determines the index of the first element in a vector that has all bits set. /// The type of the elements in the vector. @@ -2132,7 +2129,7 @@ public static Vector64 IsSubnormal(Vector64 vector) /// The type of and () is not supported. [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int LastIndexOf(Vector64 vector, T value) => 31 - BitOperations.LeadingZeroCount(Equals(vector, Create(value)).ExtractMostSignificantBits()); + public static int LastIndexOf(Vector64 vector, T value) => IndexOfLastMatch(Equals(vector, Create(value))); /// Determines the index of the last element in a vector that has all bits set. /// The type of the elements in the vector. @@ -4400,6 +4397,66 @@ public static Vector64 WithElement(this Vector64 vector, int index, T v [Intrinsic] public static Vector64 Xor(Vector64 left, Vector64 right) => left ^ right; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(AdvSimd))] + internal static uint AdvSimdExtractBitMask(Vector64 vector) + { + if (!AdvSimd.IsSupported) + { + ThrowHelper.ThrowNotSupportedException(); + } + + // This expects vector to have each element be one of Zero or AllBitsSet + // and will not produce correct results otherwise. + // + // Given this, we can treat it as ushort and do a logical-right-shift by 4 to + // compact the mask into half the space, giving us the following possibilities for + // each pair of bytes: + // * 0x00_00 - 0x00 + // * 0x00_FF - 0x0F + // * 0xFF_00 - 0xF0 + // * 0xFF_FF - 0xFF + // + // This allows us to extract the full metadata as a 32-bit scalar which can be then + // be consumed by bit-counting APIs, such as PopCount, LeadingZeroCount, or TrailingZeroCount, + // and then adjusted by AdvSimdFixupBitCount to get the actual count of elements + // that were masked. + + return AdvSimd.ShiftRightLogicalNarrowingLower(vector.ToVector128().AsUInt16(), 4).AsUInt32().ToScalar(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(AdvSimd))] + internal static int AdvSimdFixupBitCount(int bitCount) + { + if (!AdvSimd.IsSupported) + { + ThrowHelper.ThrowNotSupportedException(); + } + + // This API is meant to be consumed alongside AdvSimdExtractBitMask and will + // not produce correct results for arbitary inputs. It adjusts the bit count + // assuming that sequences of 1 or 0 were in groups of 4 bits per byte. + + unsafe + { + return bitCount >>> (2 + int.Log2(sizeof(T))); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int CountMatches(Vector64 vector) + { + if (AdvSimd.IsSupported) + { + return AdvSimdFixupBitCount(BitOperations.PopCount(AdvSimdExtractBitMask(vector))); + } + else + { + return BitOperations.PopCount(vector.ExtractMostSignificantBits()); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static T GetElementUnsafe(in this Vector64 vector, int index) { @@ -4408,6 +4465,34 @@ internal static T GetElementUnsafe(in this Vector64 vector, int index) return Unsafe.Add(ref address, index); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IndexOfFirstMatch(Vector64 vector) + { + if (AdvSimd.IsSupported) + { + int result = AdvSimdFixupBitCount(BitOperations.TrailingZeroCount(AdvSimdExtractBitMask(vector))); + return (result != Vector64.Count) ? result : -1; + } + else + { + int result = BitOperations.TrailingZeroCount(vector.ExtractMostSignificantBits()); + return (result != 32) ? result : -1; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IndexOfLastMatch(Vector64 vector) + { + if (AdvSimd.IsSupported) + { + return (Vector64.Count - 1) - AdvSimdFixupBitCount(BitOperations.LeadingZeroCount(AdvSimdExtractBitMask(vector))); + } + else + { + return 31 - BitOperations.LeadingZeroCount(vector.ExtractMostSignificantBits()); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static void SetElementUnsafe(in this Vector64 vector, int index, T value) { From 21503a6972c7345d7d99856dcaee91e4ad699781 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Fri, 10 Apr 2026 15:22:13 -0700 Subject: [PATCH 2/3] Update IndexOfAnyAsciiSearcher to use the dedicated IndexOf and LastIndexOf vector APIs --- .../SearchValues/IndexOfAnyAsciiSearcher.cs | 68 ++++++++++++------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs index 5d7a7caa440ac6..bc3cdfb9928ae7 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs @@ -1255,8 +1255,7 @@ private static Vector256 IndexOfAnyLookup(Vector256 source private static unsafe int ComputeLastIndex(ref T searchSpace, ref T current, Vector128 result) where TNegator : struct, INegator { - uint mask = TNegator.ExtractMask(result) & 0xFFFF; - int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); + int offsetInVector = TNegator.IndexOfLastMatch(result); return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); } @@ -1264,8 +1263,8 @@ private static unsafe int ComputeLastIndex(ref T searchSpace, ref T private static unsafe int ComputeLastIndexOverlapped(ref T searchSpace, ref T secondVector, Vector128 result) where TNegator : struct, INegator { - uint mask = TNegator.ExtractMask(result) & 0xFFFF; - int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); + int offsetInVector = TNegator.IndexOfLastMatch(result); + if (offsetInVector < Vector128.Count) { return offsetInVector; @@ -1285,9 +1284,7 @@ private static unsafe int ComputeLastIndex(ref T searchSpace, ref T result = PackedSpanHelpers.FixUpPackedVector256Result(result); } - uint mask = TNegator.ExtractMask(result); - - int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); + int offsetInVector = TNegator.IndexOfLastMatch(result); return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); } @@ -1301,9 +1298,8 @@ private static unsafe int ComputeLastIndexOverlapped(ref T searchSp result = PackedSpanHelpers.FixUpPackedVector256Result(result); } - uint mask = TNegator.ExtractMask(result); + int offsetInVector = TNegator.IndexOfLastMatch(result); - int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); if (offsetInVector < Vector256.Count) { return offsetInVector; @@ -1318,8 +1314,10 @@ internal interface INegator static abstract bool NegateIfNeeded(bool result); static abstract Vector128 NegateIfNeeded(Vector128 result); static abstract Vector256 NegateIfNeeded(Vector256 result); - static abstract uint ExtractMask(Vector128 result); - static abstract uint ExtractMask(Vector256 result); + static abstract int IndexOfFirstMatch(Vector128 result); + static abstract int IndexOfFirstMatch(Vector256 result); + static abstract int IndexOfLastMatch(Vector128 result); + static abstract int IndexOfLastMatch(Vector256 result); } internal readonly struct DontNegate : INegator @@ -1328,13 +1326,21 @@ internal interface INegator public static Vector128 NegateIfNeeded(Vector128 result) => result; public static Vector256 NegateIfNeeded(Vector256 result) => result; [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static uint ExtractMask(Vector128 result) => ~Vector128.Equals(result, Vector128.Zero).ExtractMostSignificantBits(); + public static int IndexOfFirstMatch(Vector128 result) => Vector128.IndexOfFirstMatch(~Vector128.Equals(result, Vector128.Zero)); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int IndexOfFirstMatch(Vector256 result) => Vector256.IndexOfFirstMatch(~Vector256.Equals(result, Vector256.Zero)); [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static uint ExtractMask(Vector256 result) => ~Vector256.Equals(result, Vector256.Zero).ExtractMostSignificantBits(); + public static int IndexOfLastMatch(Vector128 result) => Vector128.IndexOfLastMatch(~Vector128.Equals(result, Vector128.Zero)); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int IndexOfLastMatch(Vector256 result) => Vector256.IndexOfLastMatch(~Vector256.Equals(result, Vector256.Zero)); } internal readonly struct Negate : INegator { + // AdvSimd expects that a given element is strictly Zero or AllBitsSet + // so we need to ensure that we normalize the input prior to calling + // IndexOfFirstMatch or IndexOfLastMatch + public static bool NegateIfNeeded(bool result) => !result; // This is intentionally testing for equality with 0 instead of "~result". // We want to know if any character didn't match, as that means it should be treated as a match for the -Except method. @@ -1343,9 +1349,27 @@ internal interface INegator [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 NegateIfNeeded(Vector256 result) => Vector256.Equals(result, Vector256.Zero); [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static uint ExtractMask(Vector128 result) => result.ExtractMostSignificantBits(); + public static int IndexOfFirstMatch(Vector128 result) + { + if (AdvSimd.IsSupported) + { + result = (result.AsSByte() >> 7).AsByte(); + } + return Vector128.IndexOfFirstMatch(result); + } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static uint ExtractMask(Vector256 result) => result.ExtractMostSignificantBits(); + public static int IndexOfFirstMatch(Vector256 result) => Vector256.IndexOfFirstMatch(result); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int IndexOfLastMatch(Vector128 result) + { + if (AdvSimd.IsSupported) + { + result = (result.AsSByte() >> 7).AsByte(); + } + return Vector128.IndexOfLastMatch(result); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int IndexOfLastMatch(Vector256 result) => Vector256.IndexOfLastMatch(result); } internal interface IOptimizations @@ -1470,8 +1494,7 @@ public static int ScalarResult(ref T searchSpace, ref T current) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int FirstIndex(ref T searchSpace, ref T current, Vector128 result) where TNegator : struct, INegator { - uint mask = TNegator.ExtractMask(result); - int offsetInVector = BitOperations.TrailingZeroCount(mask); + int offsetInVector = TNegator.IndexOfFirstMatch(result); return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); } @@ -1484,17 +1507,15 @@ public static int FirstIndex(ref T searchSpace, ref T current, Vector2 result = PackedSpanHelpers.FixUpPackedVector256Result(result); } - uint mask = TNegator.ExtractMask(result); - - int offsetInVector = BitOperations.TrailingZeroCount(mask); + int offsetInVector = TNegator.IndexOfFirstMatch(result); return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector128 result) where TNegator : struct, INegator { - uint mask = TNegator.ExtractMask(result); - int offsetInVector = BitOperations.TrailingZeroCount(mask); + int offsetInVector = TNegator.IndexOfFirstMatch(result); + if (offsetInVector >= Vector128.Count) { // We matched within the second vector @@ -1513,9 +1534,8 @@ public static int FirstIndexOverlapped(ref T searchSpace, ref T curren result = PackedSpanHelpers.FixUpPackedVector256Result(result); } - uint mask = TNegator.ExtractMask(result); + int offsetInVector = TNegator.IndexOfFirstMatch(result); - int offsetInVector = BitOperations.TrailingZeroCount(mask); if (offsetInVector >= Vector256.Count) { // We matched within the second vector From 5474a17fae5cccb85354b24c60486d5d63669b18 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Fri, 10 Apr 2026 15:37:38 -0700 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../src/System/Runtime/Intrinsics/Vector128.cs | 4 ++-- .../src/System/Runtime/Intrinsics/Vector64.cs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index 1ba14646ab81f6..1cebc6f55a6b81 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -4521,7 +4521,7 @@ internal static ulong AdvSimdExtractBitMask(Vector128 vector) // * 0xFF_00 - 0xF0 // * 0xFF_FF - 0xFF // - // This allows us to extract the full metadata as a 64-bit scalar which can be then + // This allows us to extract the full metadata as a 64-bit scalar which can then // be consumed by bit-counting APIs, such as PopCount, LeadingZeroCount, or TrailingZeroCount, // and then adjusted by AdvSimdFixupBitCount to get the actual count of elements // that were masked. @@ -4539,7 +4539,7 @@ internal static int AdvSimdFixupBitCount(int bitCount) } // This API is meant to be consumed alongside AdvSimdExtractBitMask and will - // not produce correct results for arbitary inputs. It adjusts the bit count + // not produce correct results for arbitrary inputs. It adjusts the bit count // assuming that sequences of 1 or 0 were in groups of 4 bits per byte. unsafe diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs index 2b94436cd4446b..8655d9778f0529 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs @@ -4417,7 +4417,7 @@ internal static uint AdvSimdExtractBitMask(Vector64 vector) // * 0xFF_00 - 0xF0 // * 0xFF_FF - 0xFF // - // This allows us to extract the full metadata as a 32-bit scalar which can be then + // This allows us to extract the full metadata as a 32-bit scalar which can then // be consumed by bit-counting APIs, such as PopCount, LeadingZeroCount, or TrailingZeroCount, // and then adjusted by AdvSimdFixupBitCount to get the actual count of elements // that were masked. @@ -4435,7 +4435,7 @@ internal static int AdvSimdFixupBitCount(int bitCount) } // This API is meant to be consumed alongside AdvSimdExtractBitMask and will - // not produce correct results for arbitary inputs. It adjusts the bit count + // not produce correct results for arbitrary inputs. It adjusts the bit count // assuming that sequences of 1 or 0 were in groups of 4 bits per byte. unsafe