From 826e568d5f0c263b23f4b0e173c5c30bd2ac7015 Mon Sep 17 00:00:00 2001 From: yesmey Date: Mon, 7 Feb 2022 12:36:35 +0100 Subject: [PATCH 1/7] Impove the vectorized path for String.Split - Implement Vector265 for longer strings - Simplify the Vector code and use new cross-platform intrinsic API - Use ref _firstChar instead of ref MemoryMarshal.GetReference(this.AsSpan()); - Use unsigned check for separators.Length so that two redundant range checks are optimized away --- .../src/System/String.Manipulation.cs | 99 +++++++------------ 1 file changed, 34 insertions(+), 65 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs index 0097b92631ff0e..24e498ba9cb08c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs @@ -1637,27 +1637,13 @@ private void MakeSeparatorList(ReadOnlySpan separators, ref ValueListBuild } // Special-case the common cases of 1, 2, and 3 separators, with manual comparisons against each separator. - else if (separators.Length <= 3) + else if (separators.Length <= 3u) { char sep0, sep1, sep2; sep0 = separators[0]; sep1 = separators.Length > 1 ? separators[1] : sep0; sep2 = separators.Length > 2 ? separators[2] : sep1; - - if (Length >= 16 && Sse41.IsSupported) - { - MakeSeparatorListVectorized(ref sepListBuilder, sep0, sep1, sep2); - return; - } - - for (int i = 0; i < Length; i++) - { - char c = this[i]; - if (c == sep0 || c == sep1 || c == sep2) - { - sepListBuilder.Append(i); - } - } + MakeSeparatorListVectorized(ref sepListBuilder, sep0, sep1, sep2); } // Handle > 3 separators with a probabilistic map, ala IndexOfAny. @@ -1686,77 +1672,60 @@ private void MakeSeparatorList(ReadOnlySpan separators, ref ValueListBuild private void MakeSeparatorListVectorized(ref ValueListBuilder sepListBuilder, char c, char c2, char c3) { - // Redundant test so we won't prejit remainder of this method - // on platforms without SSE. - if (!Sse41.IsSupported) - { - throw new PlatformNotSupportedException(); - } - - // Constant that allows for the truncation of 16-bit (FFFF/0000) values within a register to 4-bit (F/0) - Vector128 shuffleConstant = Vector128.Create(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - - Vector128 v1 = Vector128.Create((ushort)c); - Vector128 v2 = Vector128.Create((ushort)c2); - Vector128 v3 = Vector128.Create((ushort)c3); - - ref char c0 = ref MemoryMarshal.GetReference(this.AsSpan()); - int cond = Length & -Vector128.Count; int i = 0; - - for (; i < cond; i += Vector128.Count) + if (Vector256.IsHardwareAccelerated && Length >= Vector256.Count * 2) { - Vector128 charVector = ReadVector(ref c0, i); - Vector128 cmp = Sse2.CompareEqual(charVector, v1); - - cmp = Sse2.Or(Sse2.CompareEqual(charVector, v2), cmp); - cmp = Sse2.Or(Sse2.CompareEqual(charVector, v3), cmp); - - if (Sse41.TestZ(cmp, cmp)) { continue; } - - Vector128 mask = Sse2.ShiftRightLogical(cmp.AsUInt64(), 4).AsByte(); - mask = Ssse3.Shuffle(mask, shuffleConstant); + ref ushort source = ref Unsafe.As(ref _firstChar); - uint lowBits = Sse2.ConvertToUInt32(mask.AsUInt32()); - mask = Sse2.ShiftRightLogical(mask.AsUInt64(), 32).AsByte(); - uint highBits = Sse2.ConvertToUInt32(mask.AsUInt32()); + Vector256 v1 = Vector256.Create((ushort)c); + Vector256 v2 = Vector256.Create((ushort)c2); + Vector256 v3 = Vector256.Create((ushort)c3); - for (int idx = i; lowBits != 0; idx++) + int vector256ShortCount = Vector256.Count; + for (; (i + vector256ShortCount) <= Length; i += vector256ShortCount) { - if ((lowBits & 0xF) != 0) + Vector256 vector = Vector256.LoadUnsafe(ref source, (uint)i); + Vector256 cmp = Vector256.Equals(vector, v1) | Vector256.Equals(vector, v2) | Vector256.Equals(vector, v3); + + uint mask = cmp.AsByte().ExtractMostSignificantBits() & 0b0101010101010101; + while (mask != 0) { - sepListBuilder.Append(idx); + sepListBuilder.Append(i + BitOperations.TrailingZeroCount(mask) / 2); + mask = BitOperations.ResetLowestSetBit(mask); } - - lowBits >>= 8; } + } + else if (Vector128.IsHardwareAccelerated && Length >= Vector128.Count * 2) + { + ref ushort source = ref Unsafe.As(ref _firstChar); + + Vector128 v1 = Vector128.Create((ushort)c); + Vector128 v2 = Vector128.Create((ushort)c2); + Vector128 v3 = Vector128.Create((ushort)c3); - for (int idx = i + 4; highBits != 0; idx++) + int vector128ShortCount = Vector128.Count; + for (; (i + vector128ShortCount) <= Length; i += vector128ShortCount) { - if ((highBits & 0xF) != 0) + Vector128 vector = Vector128.LoadUnsafe(ref source, (uint)i); + Vector128 cmp = Vector128.Equals(vector, v1) | Vector128.Equals(vector, v2) | Vector128.Equals(vector, v3); + + uint mask = cmp.AsByte().ExtractMostSignificantBits() & 0b0101010101010101; + while (mask != 0) { - sepListBuilder.Append(idx); + sepListBuilder.Append(i + BitOperations.TrailingZeroCount(mask) / 2); + mask = BitOperations.ResetLowestSetBit(mask); } - - highBits >>= 8; } } for (; i < Length; i++) { - char curr = Unsafe.Add(ref c0, (IntPtr)(uint)i); + char curr = this[i]; if (curr == c || curr == c2 || curr == c3) { sepListBuilder.Append(i); } } - - static Vector128 ReadVector(ref char c0, int offset) - { - ref char ci = ref Unsafe.Add(ref c0, (IntPtr)(uint)offset); - ref byte b = ref Unsafe.As(ref ci); - return Unsafe.ReadUnaligned>(ref b); - } } /// From e64d510a6f628644be8b1a983c2321b7d4262529 Mon Sep 17 00:00:00 2001 From: yesmey Date: Mon, 7 Feb 2022 12:38:46 +0100 Subject: [PATCH 2/7] Improve performance of common path for Append in ValueListBuilder --- .../Collections/Generic/ValueListBuilder.cs | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Collections/Generic/ValueListBuilder.cs b/src/libraries/System.Private.CoreLib/src/System/Collections/Generic/ValueListBuilder.cs index a82372b92b34db..7aaaa6d4e61e7c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Collections/Generic/ValueListBuilder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Collections/Generic/ValueListBuilder.cs @@ -43,10 +43,26 @@ public ref T this[int index] [MethodImpl(MethodImplOptions.AggressiveInlining)] public void Append(T item) { + Span span = _span; int pos = _pos; - if (pos >= _span.Length) - Grow(); + if ((uint)pos < (uint)span.Length) + { + span[pos] = item; + _pos = pos + 1; + } + else + { + AddWithResize(item); + } + } + // Hide uncommon path + [MethodImpl(MethodImplOptions.NoInlining)] + private void AddWithResize(T item) + { + Debug.Assert(_pos == _span.Length); + int pos = _pos; + Grow(); _span[pos] = item; _pos = pos + 1; } From 60d94e1de4942b534c22a8f81ef0c48e4cd2c014 Mon Sep 17 00:00:00 2001 From: yesmey Date: Tue, 8 Feb 2022 00:21:30 +0100 Subject: [PATCH 3/7] Address pr feedback --- .../src/System/String.Manipulation.cs | 71 ++++++++++++------- 1 file changed, 44 insertions(+), 27 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs index 24e498ba9cb08c..d77cb5001e01ea 100644 --- a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs @@ -1637,7 +1637,7 @@ private void MakeSeparatorList(ReadOnlySpan separators, ref ValueListBuild } // Special-case the common cases of 1, 2, and 3 separators, with manual comparisons against each separator. - else if (separators.Length <= 3u) + else if ((uint)separators.Length <= (uint)3) { char sep0, sep1, sep2; sep0 = separators[0]; @@ -1672,58 +1672,75 @@ private void MakeSeparatorList(ReadOnlySpan separators, ref ValueListBuild private void MakeSeparatorListVectorized(ref ValueListBuilder sepListBuilder, char c, char c2, char c3) { - int i = 0; - if (Vector256.IsHardwareAccelerated && Length >= Vector256.Count * 2) - { - ref ushort source = ref Unsafe.As(ref _firstChar); + ref ushort source = ref Unsafe.As(ref _firstChar); + + nuint offset = 0; + nuint lengthToExamine = (nuint)(uint)Length; + if (Vector256.IsHardwareAccelerated && lengthToExamine >= (nuint)Vector256.Count) + { Vector256 v1 = Vector256.Create((ushort)c); Vector256 v2 = Vector256.Create((ushort)c2); Vector256 v3 = Vector256.Create((ushort)c3); - int vector256ShortCount = Vector256.Count; - for (; (i + vector256ShortCount) <= Length; i += vector256ShortCount) + while (offset <= lengthToExamine - (nuint)Vector256.Count) { - Vector256 vector = Vector256.LoadUnsafe(ref source, (uint)i); - Vector256 cmp = Vector256.Equals(vector, v1) | Vector256.Equals(vector, v2) | Vector256.Equals(vector, v3); + Vector256 vector = Vector256.LoadUnsafe(ref source, offset); + Vector256 vector1 = Vector256.Equals(vector, v1); + Vector256 vector2 = Vector256.Equals(vector, v2); + Vector256 vector3 = Vector256.Equals(vector, v3); + Vector256 cmp = (vector1 | vector2 | vector3).AsByte(); - uint mask = cmp.AsByte().ExtractMostSignificantBits() & 0b0101010101010101; - while (mask != 0) + if (cmp != Vector256.Zero) { - sepListBuilder.Append(i + BitOperations.TrailingZeroCount(mask) / 2); - mask = BitOperations.ResetLowestSetBit(mask); + uint mask = cmp.ExtractMostSignificantBits() & 0x55555555u; + do + { + uint bitPos = (uint)BitOperations.TrailingZeroCount(mask) / sizeof(char); + sepListBuilder.Append((int)(offset + bitPos)); + mask = BitOperations.ResetLowestSetBit(mask); + } while (mask != 0); } + + offset += (nuint)Vector256.Count; } } - else if (Vector128.IsHardwareAccelerated && Length >= Vector128.Count * 2) - { - ref ushort source = ref Unsafe.As(ref _firstChar); + if (Vector128.IsHardwareAccelerated && lengthToExamine >= (nuint)Vector128.Count) + { Vector128 v1 = Vector128.Create((ushort)c); Vector128 v2 = Vector128.Create((ushort)c2); Vector128 v3 = Vector128.Create((ushort)c3); - int vector128ShortCount = Vector128.Count; - for (; (i + vector128ShortCount) <= Length; i += vector128ShortCount) + while (offset <= lengthToExamine - (nuint)Vector128.Count) { - Vector128 vector = Vector128.LoadUnsafe(ref source, (uint)i); - Vector128 cmp = Vector128.Equals(vector, v1) | Vector128.Equals(vector, v2) | Vector128.Equals(vector, v3); + Vector128 vector = Vector128.LoadUnsafe(ref source, offset); + Vector128 vector1 = Vector128.Equals(vector, v1); + Vector128 vector2 = Vector128.Equals(vector, v2); + Vector128 vector3 = Vector128.Equals(vector, v3); + Vector128 cmp = (vector1 | vector2 | vector3).AsByte(); - uint mask = cmp.AsByte().ExtractMostSignificantBits() & 0b0101010101010101; - while (mask != 0) + if (cmp != Vector128.Zero) { - sepListBuilder.Append(i + BitOperations.TrailingZeroCount(mask) / 2); - mask = BitOperations.ResetLowestSetBit(mask); + uint mask = cmp.ExtractMostSignificantBits() & 0x5555u; + do + { + uint bitPos = (uint)BitOperations.TrailingZeroCount(mask) / sizeof(char); + sepListBuilder.Append((int)(offset + bitPos)); + mask = BitOperations.ResetLowestSetBit(mask); + } while (mask != 0); } + + offset += (nuint)Vector128.Count; } } - for (; i < Length; i++) + for (int j = (int)offset; j < Length; j++) { - char curr = this[i]; + char curr = this[j]; if (curr == c || curr == c2 || curr == c3) { - sepListBuilder.Append(i); + sepListBuilder.Append(j); } } } From 4c8c2b51e6dfd5f49ec18f6e3ee17ef4371978aa Mon Sep 17 00:00:00 2001 From: yesmey Date: Tue, 8 Feb 2022 00:33:57 +0100 Subject: [PATCH 4/7] Add a longer string split test example --- src/libraries/System.Runtime/tests/System/String.SplitTests.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libraries/System.Runtime/tests/System/String.SplitTests.cs b/src/libraries/System.Runtime/tests/System/String.SplitTests.cs index 519efb7dec4eab..a5228d1e5c1be0 100644 --- a/src/libraries/System.Runtime/tests/System/String.SplitTests.cs +++ b/src/libraries/System.Runtime/tests/System/String.SplitTests.cs @@ -561,6 +561,7 @@ public static void SplitCharArraySeparator(string value, char[] separators, int [InlineData("this, is, a, string, with some spaces, ", new[] { ",", " s" }, M, StringSplitOptions.RemoveEmptyEntries, new[] { "this", " is", " a", "tring", " with", "ome", "paces", " " })] [InlineData("this, is, a, string, with some spaces, ", new[] { ",", " s" }, M, StringSplitOptions.TrimEntries, new[] { "this", "is", "a", "", "tring", "with", "ome", "paces", "" })] [InlineData("this, is, a, string, with some spaces, ", new[] { ",", " s" }, M, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries, new[] { "this", "is", "a", "tring", "with", "ome", "paces" })] + [InlineData("this, is, a, very long string, with some spaces, commas and more spaces", new[] { ",", " s" }, M, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries, new[] { "this", "is", "a", "very long", "tring", "with", "ome", "paces", "commas and more", "paces" })] public static void SplitStringArraySeparator(string value, string[] separators, int count, StringSplitOptions options, string[] expected) { Assert.Equal(expected, value.Split(separators, count, options)); From 915ff8252abf77ca00eb68761ed1f05dc004c1be Mon Sep 17 00:00:00 2001 From: yesmey Date: Wed, 9 Feb 2022 12:34:37 +0100 Subject: [PATCH 5/7] Improve spilling for the Vector256 version. Remove temp span variable in ValueListBuilder. Improve sequential iteration --- .../Collections/Generic/ValueListBuilder.cs | 5 +- .../src/System/String.Manipulation.cs | 121 +++++++++++------- .../tests/System/String.SplitTests.cs | 1 + 3 files changed, 78 insertions(+), 49 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Collections/Generic/ValueListBuilder.cs b/src/libraries/System.Private.CoreLib/src/System/Collections/Generic/ValueListBuilder.cs index 7aaaa6d4e61e7c..72b25bc2d9dd33 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Collections/Generic/ValueListBuilder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Collections/Generic/ValueListBuilder.cs @@ -43,11 +43,10 @@ public ref T this[int index] [MethodImpl(MethodImplOptions.AggressiveInlining)] public void Append(T item) { - Span span = _span; int pos = _pos; - if ((uint)pos < (uint)span.Length) + if ((uint)pos < (uint)_span.Length) { - span[pos] = item; + _span[pos] = item; _pos = pos + 1; } else diff --git a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs index 02fb24d8ee27c4..9e1d4ded387cbd 100644 --- a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs @@ -1644,76 +1644,105 @@ private void MakeSeparatorList(ReadOnlySpan separators, ref ValueListBuild private void MakeSeparatorListVectorized(ref ValueListBuilder sepListBuilder, char c, char c2, char c3) { - ref ushort source = ref Unsafe.As(ref _firstChar); - nuint offset = 0; nuint lengthToExamine = (nuint)(uint)Length; - if (Vector256.IsHardwareAccelerated && lengthToExamine >= (nuint)Vector256.Count) - { - Vector256 v1 = Vector256.Create((ushort)c); - Vector256 v2 = Vector256.Create((ushort)c2); - Vector256 v3 = Vector256.Create((ushort)c3); + ref ushort source = ref Unsafe.As(ref _firstChar); - while (offset <= lengthToExamine - (nuint)Vector256.Count) + if (Vector256.IsHardwareAccelerated) + { + if (lengthToExamine >= (nuint)Vector256.Count) { - Vector256 vector = Vector256.LoadUnsafe(ref source, offset); - Vector256 vector1 = Vector256.Equals(vector, v1); - Vector256 vector2 = Vector256.Equals(vector, v2); - Vector256 vector3 = Vector256.Equals(vector, v3); - Vector256 cmp = (vector1 | vector2 | vector3).AsByte(); + Vector256 v1 = Vector256.Create((ushort)c); + Vector256 v2 = Vector256.Create((ushort)c2); + Vector256 v3 = Vector256.Create((ushort)c3); - if (cmp != Vector256.Zero) + do { - uint mask = cmp.ExtractMostSignificantBits() & 0x55555555u; - do + Vector256 vector = Vector256.LoadUnsafe(ref source, offset); + Vector256 v1Eq = Vector256.Equals(vector, v1); + Vector256 v2Eq = Vector256.Equals(vector, v2); + Vector256 v3Eq = Vector256.Equals(vector, v3); + Vector256 cmp = (v1Eq | v2Eq | v3Eq).AsByte(); + + if (cmp != Vector256.Zero) { - uint bitPos = (uint)BitOperations.TrailingZeroCount(mask) / sizeof(char); - sepListBuilder.Append((int)(offset + bitPos)); - mask = BitOperations.ResetLowestSetBit(mask); - } while (mask != 0); - } + uint mask = cmp.ExtractMostSignificantBits() & 0x55555555; + do + { + uint bitPos = (uint)BitOperations.TrailingZeroCount(mask) / sizeof(char); + sepListBuilder.Append((int)(offset + bitPos)); + mask = BitOperations.ResetLowestSetBit(mask); + } while (mask != 0); + } + + offset += (nuint)Vector256.Count; + } while (offset <= lengthToExamine - (nuint)Vector256.Count); + + // See if we fit another 128 bit vector + if (offset <= lengthToExamine - (nuint)Vector128.Count) + { + Vector128 vector = Vector128.LoadUnsafe(ref source, offset); + Vector128 v1Eq = Vector128.Equals(vector, v1.GetLower()); + Vector128 v2Eq = Vector128.Equals(vector, v2.GetLower()); + Vector128 v3Eq = Vector128.Equals(vector, v3.GetLower()); + Vector128 cmp = (v1Eq | v2Eq | v3Eq).AsByte(); - offset += (nuint)Vector256.Count; + if (cmp != Vector128.Zero) + { + uint mask = cmp.ExtractMostSignificantBits() & 0x5555; + do + { + uint bitPos = (uint)BitOperations.TrailingZeroCount(mask) / sizeof(char); + sepListBuilder.Append((int)(offset + bitPos)); + mask = BitOperations.ResetLowestSetBit(mask); + } while (mask != 0); + } + + offset += (nuint)Vector128.Count; + } } } - - if (Vector128.IsHardwareAccelerated && lengthToExamine >= (nuint)Vector128.Count) + else if (Vector128.IsHardwareAccelerated) { - Vector128 v1 = Vector128.Create((ushort)c); - Vector128 v2 = Vector128.Create((ushort)c2); - Vector128 v3 = Vector128.Create((ushort)c3); - - while (offset <= lengthToExamine - (nuint)Vector128.Count) + if (lengthToExamine >= (nuint)Vector128.Count * 2) { - Vector128 vector = Vector128.LoadUnsafe(ref source, offset); - Vector128 vector1 = Vector128.Equals(vector, v1); - Vector128 vector2 = Vector128.Equals(vector, v2); - Vector128 vector3 = Vector128.Equals(vector, v3); - Vector128 cmp = (vector1 | vector2 | vector3).AsByte(); + Vector128 v1 = Vector128.Create((ushort)c); + Vector128 v2 = Vector128.Create((ushort)c2); + Vector128 v3 = Vector128.Create((ushort)c3); - if (cmp != Vector128.Zero) + do { - uint mask = cmp.ExtractMostSignificantBits() & 0x5555u; - do + Vector128 vector = Vector128.LoadUnsafe(ref source, offset); + Vector128 v1Eq = Vector128.Equals(vector, v1); + Vector128 v2Eq = Vector128.Equals(vector, v2); + Vector128 v3Eq = Vector128.Equals(vector, v3); + Vector128 cmp = (v1Eq | v2Eq | v3Eq).AsByte(); + + if (cmp != Vector128.Zero) { - uint bitPos = (uint)BitOperations.TrailingZeroCount(mask) / sizeof(char); - sepListBuilder.Append((int)(offset + bitPos)); - mask = BitOperations.ResetLowestSetBit(mask); - } while (mask != 0); - } + uint mask = cmp.ExtractMostSignificantBits() & 0x5555; + do + { + uint bitPos = (uint)BitOperations.TrailingZeroCount(mask) / sizeof(char); + sepListBuilder.Append((int)(offset + bitPos)); + mask = BitOperations.ResetLowestSetBit(mask); + } while (mask != 0); + } - offset += (nuint)Vector128.Count; + offset += (nuint)Vector128.Count; + } while (offset <= lengthToExamine - (nuint)Vector128.Count); } } - for (int j = (int)offset; j < Length; j++) + while (offset < lengthToExamine) { - char curr = this[j]; + char curr = (char)Unsafe.Add(ref source, (nint)offset); if (curr == c || curr == c2 || curr == c3) { - sepListBuilder.Append(j); + sepListBuilder.Append((int)offset); } + offset++; } } diff --git a/src/libraries/System.Runtime/tests/System/String.SplitTests.cs b/src/libraries/System.Runtime/tests/System/String.SplitTests.cs index a5228d1e5c1be0..1785e9141cead7 100644 --- a/src/libraries/System.Runtime/tests/System/String.SplitTests.cs +++ b/src/libraries/System.Runtime/tests/System/String.SplitTests.cs @@ -530,6 +530,7 @@ public static void SplitNullCharArraySeparator_BindsToCharArrayOverload() [InlineData("this, is, a, string, with some spaces", new[] { ',', 's', 'a' }, M, StringSplitOptions.RemoveEmptyEntries, new[] { "thi", " i", " ", " ", "tring", " with ", "ome ", "p", "ce" })] [InlineData("this, is, a, string, with some spaces", new[] { ',', 's', 'a' }, M, StringSplitOptions.TrimEntries, new[] { "thi", "", "i", "", "", "", "", "tring", "with", "ome", "p", "ce", "" })] [InlineData("this, is, a, string, with some spaces", new[] { ',', 's', 'a' }, M, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries, new[] { "thi", "i", "tring", "with", "ome", "p", "ce" })] + [InlineData("this, is, a, very long string, with some spaces, commas and more spaces", new[] { ',', 's' }, M, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries, new[] { "thi", "i", "a", "very long", "tring", "with", "ome", "pace", "comma", "and more", "pace" })] public static void SplitCharArraySeparator(string value, char[] separators, int count, StringSplitOptions options, string[] expected) { Assert.Equal(expected, value.Split(separators, count, options)); From dcadf059087dbb660b6ed72a4bbf12266295a645 Mon Sep 17 00:00:00 2001 From: yesmey Date: Sun, 13 Feb 2022 22:30:31 +0100 Subject: [PATCH 6/7] Revert the AVX version --- .../src/System/String.Manipulation.cs | 117 ++++++------------ 1 file changed, 38 insertions(+), 79 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs index 9e1d4ded387cbd..d909802ba0aa9b 100644 --- a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs @@ -1615,7 +1615,20 @@ private void MakeSeparatorList(ReadOnlySpan separators, ref ValueListBuild sep0 = separators[0]; sep1 = separators.Length > 1 ? separators[1] : sep0; sep2 = separators.Length > 2 ? separators[2] : sep1; - MakeSeparatorListVectorized(ref sepListBuilder, sep0, sep1, sep2); + if (Vector128.IsHardwareAccelerated && Length >= Vector128.Count * 2) + { + MakeSeparatorListVectorized(ref sepListBuilder, sep0, sep1, sep2); + return; + } + + for (int i = 0; i < Length; i++) + { + char c = this[i]; + if (c == sep0 || c == sep1 || c == sep2) + { + sepListBuilder.Append(i); + } + } } // Handle > 3 separators with a probabilistic map, ala IndexOfAny. @@ -1644,95 +1657,41 @@ private void MakeSeparatorList(ReadOnlySpan separators, ref ValueListBuild private void MakeSeparatorListVectorized(ref ValueListBuilder sepListBuilder, char c, char c2, char c3) { + if (!Vector128.IsHardwareAccelerated) + { + throw new PlatformNotSupportedException(); + } + nuint offset = 0; nuint lengthToExamine = (nuint)(uint)Length; ref ushort source = ref Unsafe.As(ref _firstChar); - if (Vector256.IsHardwareAccelerated) - { - if (lengthToExamine >= (nuint)Vector256.Count) - { - Vector256 v1 = Vector256.Create((ushort)c); - Vector256 v2 = Vector256.Create((ushort)c2); - Vector256 v3 = Vector256.Create((ushort)c3); - - do - { - Vector256 vector = Vector256.LoadUnsafe(ref source, offset); - Vector256 v1Eq = Vector256.Equals(vector, v1); - Vector256 v2Eq = Vector256.Equals(vector, v2); - Vector256 v3Eq = Vector256.Equals(vector, v3); - Vector256 cmp = (v1Eq | v2Eq | v3Eq).AsByte(); + Vector128 v1 = Vector128.Create((ushort)c); + Vector128 v2 = Vector128.Create((ushort)c2); + Vector128 v3 = Vector128.Create((ushort)c3); - if (cmp != Vector256.Zero) - { - uint mask = cmp.ExtractMostSignificantBits() & 0x55555555; - do - { - uint bitPos = (uint)BitOperations.TrailingZeroCount(mask) / sizeof(char); - sepListBuilder.Append((int)(offset + bitPos)); - mask = BitOperations.ResetLowestSetBit(mask); - } while (mask != 0); - } - - offset += (nuint)Vector256.Count; - } while (offset <= lengthToExamine - (nuint)Vector256.Count); - - // See if we fit another 128 bit vector - if (offset <= lengthToExamine - (nuint)Vector128.Count) - { - Vector128 vector = Vector128.LoadUnsafe(ref source, offset); - Vector128 v1Eq = Vector128.Equals(vector, v1.GetLower()); - Vector128 v2Eq = Vector128.Equals(vector, v2.GetLower()); - Vector128 v3Eq = Vector128.Equals(vector, v3.GetLower()); - Vector128 cmp = (v1Eq | v2Eq | v3Eq).AsByte(); - - if (cmp != Vector128.Zero) - { - uint mask = cmp.ExtractMostSignificantBits() & 0x5555; - do - { - uint bitPos = (uint)BitOperations.TrailingZeroCount(mask) / sizeof(char); - sepListBuilder.Append((int)(offset + bitPos)); - mask = BitOperations.ResetLowestSetBit(mask); - } while (mask != 0); - } - - offset += (nuint)Vector128.Count; - } - } - } - else if (Vector128.IsHardwareAccelerated) + while (offset <= lengthToExamine - (nuint)Vector128.Count) { - if (lengthToExamine >= (nuint)Vector128.Count * 2) - { - Vector128 v1 = Vector128.Create((ushort)c); - Vector128 v2 = Vector128.Create((ushort)c2); - Vector128 v3 = Vector128.Create((ushort)c3); + Vector128 vector = Vector128.LoadUnsafe(ref source, offset); + Vector128 v1Eq = Vector128.Equals(vector, v1); + Vector128 v2Eq = Vector128.Equals(vector, v2); + Vector128 v3Eq = Vector128.Equals(vector, v3); + Vector128 cmp = (v1Eq | v2Eq | v3Eq).AsByte(); + if (cmp != Vector128.Zero) + { + // Skip every other bit + uint mask = cmp.ExtractMostSignificantBits() & 0x5555; do { - Vector128 vector = Vector128.LoadUnsafe(ref source, offset); - Vector128 v1Eq = Vector128.Equals(vector, v1); - Vector128 v2Eq = Vector128.Equals(vector, v2); - Vector128 v3Eq = Vector128.Equals(vector, v3); - Vector128 cmp = (v1Eq | v2Eq | v3Eq).AsByte(); - - if (cmp != Vector128.Zero) - { - uint mask = cmp.ExtractMostSignificantBits() & 0x5555; - do - { - uint bitPos = (uint)BitOperations.TrailingZeroCount(mask) / sizeof(char); - sepListBuilder.Append((int)(offset + bitPos)); - mask = BitOperations.ResetLowestSetBit(mask); - } while (mask != 0); - } - - offset += (nuint)Vector128.Count; - } while (offset <= lengthToExamine - (nuint)Vector128.Count); + uint bitPos = (uint)BitOperations.TrailingZeroCount(mask) / sizeof(char); + sepListBuilder.Append((int)(offset + bitPos)); + mask = BitOperations.ResetLowestSetBit(mask); + } while (mask != 0); } + + offset += (nuint)Vector128.Count; } while (offset < lengthToExamine) From 51b102ba44980a3f545947a81238b320a00b0e83 Mon Sep 17 00:00:00 2001 From: yesmey Date: Wed, 23 Mar 2022 22:00:44 +0100 Subject: [PATCH 7/7] Address feedback --- .../src/System/String.Manipulation.cs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs index 779e0b0807a415..d883782dbe5a3b 100644 --- a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs @@ -1609,7 +1609,7 @@ private void MakeSeparatorList(ReadOnlySpan separators, ref ValueListBuild } // Special-case the common cases of 1, 2, and 3 separators, with manual comparisons against each separator. - else if ((uint)separators.Length <= (uint)3) + else if (separators.Length <= 3) { char sep0, sep1, sep2; sep0 = separators[0]; @@ -1657,11 +1657,15 @@ private void MakeSeparatorList(ReadOnlySpan separators, ref ValueListBuild private void MakeSeparatorListVectorized(ref ValueListBuilder sepListBuilder, char c, char c2, char c3) { + // Redundant test so we won't prejit remainder of this method + // on platforms where it is not supported if (!Vector128.IsHardwareAccelerated) { throw new PlatformNotSupportedException(); } + Debug.Assert(Length >= Vector128.Count); + nuint offset = 0; nuint lengthToExamine = (nuint)(uint)Length; @@ -1671,7 +1675,7 @@ private void MakeSeparatorListVectorized(ref ValueListBuilder sepListBuilde Vector128 v2 = Vector128.Create((ushort)c2); Vector128 v3 = Vector128.Create((ushort)c3); - while (offset <= lengthToExamine - (nuint)Vector128.Count) + do { Vector128 vector = Vector128.LoadUnsafe(ref source, offset); Vector128 v1Eq = Vector128.Equals(vector, v1); @@ -1692,11 +1696,11 @@ private void MakeSeparatorListVectorized(ref ValueListBuilder sepListBuilde } offset += (nuint)Vector128.Count; - } + } while (offset <= lengthToExamine - (nuint)Vector128.Count); while (offset < lengthToExamine) { - char curr = (char)Unsafe.Add(ref source, (nint)offset); + char curr = (char)Unsafe.Add(ref source, offset); if (curr == c || curr == c2 || curr == c3) { sepListBuilder.Append((int)offset);