diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
index 29d02837403b8b..7ed83bee772fa5 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -490,29 +490,25 @@ private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter w
// Emit the code for whatever find mode has been determined.
switch (regexTree.FindOptimizations.FindMode)
{
- case FindNextStartingPositionMode.LeadingPrefix_LeftToRight:
- Debug.Assert(!string.IsNullOrEmpty(regexTree.FindOptimizations.LeadingPrefix));
- EmitIndexOf_LeftToRight(regexTree.FindOptimizations.LeadingPrefix);
+ case FindNextStartingPositionMode.LeadingString_LeftToRight:
+ case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
+ EmitIndexOf_LeftToRight();
break;
- case FindNextStartingPositionMode.LeadingPrefix_RightToLeft:
- Debug.Assert(!string.IsNullOrEmpty(regexTree.FindOptimizations.LeadingPrefix));
- EmitIndexOf_RightToLeft(regexTree.FindOptimizations.LeadingPrefix);
+ case FindNextStartingPositionMode.LeadingString_RightToLeft:
+ EmitIndexOf_RightToLeft();
break;
case FindNextStartingPositionMode.LeadingSet_LeftToRight:
- case FindNextStartingPositionMode.FixedSets_LeftToRight:
- Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 });
+ case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight:
EmitFixedSet_LeftToRight();
break;
case FindNextStartingPositionMode.LeadingSet_RightToLeft:
- Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 });
EmitFixedSet_RightToLeft();
break;
case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight:
- Debug.Assert(regexTree.FindOptimizations.LiteralAfterLoop is not null);
EmitLiteralAfterAtomicLoop();
break;
@@ -712,12 +708,40 @@ bool EmitAnchors()
return false;
}
- // Emits a case-sensitive prefix search for a string at the beginning of the pattern.
- void EmitIndexOf_LeftToRight(string prefix)
+ // Emits a case-sensitive left-to-right search for a substring.
+ void EmitIndexOf_LeftToRight()
{
- writer.WriteLine($"// The pattern begins with a literal {Literal(prefix)}. Find the next occurrence.");
+                // Work out which literal to search for and, for a fixed-distance literal, how far into the
+                // pattern it sits. The defaults describe a literal at the very start of the pattern.
+                RegexFindOptimizations opts = regexTree.FindOptimizations;
+
+                string substring = "";
+                string offset = "";
+                string offsetDescription = "at the beginning of the pattern";
+
+                switch (opts.FindMode)
+                {
+                    case FindNextStartingPositionMode.LeadingString_LeftToRight:
+                        substring = opts.LeadingPrefix;
+                        Debug.Assert(!string.IsNullOrEmpty(substring));
+                        break;
+
+                    case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
+                        Debug.Assert(!string.IsNullOrEmpty(opts.FixedDistanceLiteral.String));
+                        substring = opts.FixedDistanceLiteral.String;
+                        if (opts.FixedDistanceLiteral is { Distance: > 0 } literal)
+                        {
+                            // The emitted search starts at "pos + Distance" rather than at "pos".
+                            offset = $" + {literal.Distance}";
+
+                            // No leading space here: the comment template that consumes this text already
+                            // puts a space between the literal and the description.
+                            offsetDescription = $"at index {literal.Distance} in the pattern";
+                        }
+                        break;
+
+                    default:
+                        Debug.Fail($"Unexpected mode: {opts.FindMode}");
+                        break;
+                }
+
+ writer.WriteLine($"// The pattern has the literal {Literal(substring)} {offsetDescription}. Find the next occurrence.");
writer.WriteLine($"// If it can't be found, there's no match.");
- writer.WriteLine($"int i = inputSpan.Slice(pos).IndexOf({Literal(prefix)});");
+ writer.WriteLine($"int i = inputSpan.Slice(pos{offset}).IndexOf({Literal(substring)});");
using (EmitBlock(writer, "if (i >= 0)"))
{
writer.WriteLine("base.runtextpos = pos + i;");
@@ -725,9 +749,11 @@ void EmitIndexOf_LeftToRight(string prefix)
}
}
- // Emits a case-sensitive right-to-left prefix search for a string at the beginning of the pattern.
- void EmitIndexOf_RightToLeft(string prefix)
+ // Emits a case-sensitive right-to-left search for a substring.
+ void EmitIndexOf_RightToLeft()
{
+ string prefix = regexTree.FindOptimizations.LeadingPrefix;
+
writer.WriteLine($"// The pattern begins with a literal {Literal(prefix)}. Find the next occurrence right-to-left.");
writer.WriteLine($"// If it can't be found, there's no match.");
writer.WriteLine($"pos = inputSpan.Slice(0, pos).LastIndexOf({Literal(prefix)});");
@@ -742,6 +768,8 @@ void EmitIndexOf_RightToLeft(string prefix)
// and potentially other sets at other fixed positions in the pattern.
void EmitFixedSet_LeftToRight()
{
+ Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 });
+
List<(char[]? Chars, string Set, int Distance)>? sets = regexTree.FindOptimizations.FixedDistanceSets;
(char[]? Chars, string Set, int Distance) primarySet = sets![0];
const int MaxSets = 4;
@@ -865,6 +893,8 @@ void EmitFixedSet_LeftToRight()
// (Currently that position will always be a distance of 0, meaning the start of the pattern itself.)
void EmitFixedSet_RightToLeft()
{
+ Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 });
+
(char[]? Chars, string Set, int Distance) set = regexTree.FindOptimizations.FixedDistanceSets![0];
Debug.Assert(set.Distance == 0);
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index 213782d2eb485c..d35de929762f84 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -1434,6 +1434,11 @@ private static RegexCharClass ParseRecursive(string charClass, int start)
return ranges;
}
+ /// <summary>Cache of character class strings for single ASCII characters.</summary>
+ private static readonly string[] s_asciiStrings = new string[128];
+ /// <summary>Cache of character class strings for pairs of upper/lower-case ASCII letters.</summary>
+ private static readonly string[] s_asciiLetterPairStrings = new string[26];
+
/// Creates a set string for a single character.
/// The character for which to create the set.
/// The create set string.
@@ -1450,44 +1455,32 @@ internal static unsafe string CharsToStringClass(ReadOnlySpan chars)
}
#endif
- // If there aren't any chars, just return an empty class.
- if (chars.Length == 0)
+ switch (chars.Length)
{
- return EmptyClass;
- }
+ case 0:
+ // If there aren't any chars, just return an empty class.
+ return EmptyClass;
- if (chars.Length == 2)
- {
- switch (chars[0], chars[1])
- {
- case ('A', 'a'): case ('a', 'A'): return "\0\x0004\0ABab";
- case ('B', 'b'): case ('b', 'B'): return "\0\x0004\0BCbc";
- case ('C', 'c'): case ('c', 'C'): return "\0\x0004\0CDcd";
- case ('D', 'd'): case ('d', 'D'): return "\0\x0004\0DEde";
- case ('E', 'e'): case ('e', 'E'): return "\0\x0004\0EFef";
- case ('F', 'f'): case ('f', 'F'): return "\0\x0004\0FGfg";
- case ('G', 'g'): case ('g', 'G'): return "\0\x0004\0GHgh";
- case ('H', 'h'): case ('h', 'H'): return "\0\x0004\0HIhi";
- // 'I' and 'i' are missing since depending on the cultuure they may
- // have additional mappings.
- case ('J', 'j'): case ('j', 'J'): return "\0\x0004\0JKjk";
- // 'K' and 'k' are missing since their mapping also includes Kelvin K.
- case ('L', 'l'): case ('l', 'L'): return "\0\x0004\0LMlm";
- case ('M', 'm'): case ('m', 'M'): return "\0\x0004\0MNmn";
- case ('N', 'n'): case ('n', 'N'): return "\0\x0004\0NOno";
- case ('O', 'o'): case ('o', 'O'): return "\0\x0004\0OPop";
- case ('P', 'p'): case ('p', 'P'): return "\0\x0004\0PQpq";
- case ('Q', 'q'): case ('q', 'Q'): return "\0\x0004\0QRqr";
- case ('R', 'r'): case ('r', 'R'): return "\0\x0004\0RSrs";
- case ('S', 's'): case ('s', 'S'): return "\0\x0004\0STst";
- case ('T', 't'): case ('t', 'T'): return "\0\x0004\0TUtu";
- case ('U', 'u'): case ('u', 'U'): return "\0\x0004\0UVuv";
- case ('V', 'v'): case ('v', 'V'): return "\0\x0004\0VWvw";
- case ('W', 'w'): case ('w', 'W'): return "\0\x0004\0WXwx";
- case ('X', 'x'): case ('x', 'X'): return "\0\x0004\0XYxy";
- case ('Y', 'y'): case ('y', 'Y'): return "\0\x0004\0YZyz";
- case ('Z', 'z'): case ('z', 'Z'): return "\0\x0004\0Z[z{";
- }
+ case 1:
+ // Special-case ASCII characters to avoid the computation/allocation in this very common case.
+ if (chars[0] < 128)
+ {
+ string[] asciiStrings = s_asciiStrings;
+ if (chars[0] < asciiStrings.Length)
+ {
+ return asciiStrings[chars[0]] ??= $"\0\u0002\0{chars[0]}{(char)(chars[0] + 1)}";
+ }
+ }
+ break;
+
+ case 2:
+ // Special-case cased ASCII letter pairs to avoid the computation/allocation in this very common case.
+ int masked0 = chars[0] | 0x20;
+ if ((uint)(masked0 - 'a') <= 'z' - 'a' && masked0 == (chars[1] | 0x20))
+ {
+ return s_asciiLetterPairStrings[masked0 - 'a'] ??= $"\0\u0004\0{(char)(masked0 & ~0x20)}{(char)((masked0 & ~0x20) + 1)}{(char)masked0}{(char)(masked0 + 1)}";
+ }
+ break;
}
// Count how many characters there actually are. All but the very last possible
@@ -1514,8 +1507,8 @@ internal static unsafe string CharsToStringClass(ReadOnlySpan chars)
// Fill in the set string
span[FlagsIndex] = (char)0;
- span[CategoryLengthIndex] = (char)0;
span[SetLengthIndex] = (char)(span.Length - SetStartIndex);
+ span[CategoryLengthIndex] = (char)0;
int i = SetStartIndex;
foreach (char c in chars)
{
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index a4fe958ab3a552..de6f322ac10357 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -417,29 +417,25 @@ protected void EmitTryFindNextPossibleStartingPosition()
// Either anchors weren't specified, or they don't completely root all matches to a specific location.
switch (_regexTree.FindOptimizations.FindMode)
{
- case FindNextStartingPositionMode.LeadingPrefix_LeftToRight:
- Debug.Assert(!string.IsNullOrEmpty(_regexTree.FindOptimizations.LeadingPrefix));
- EmitIndexOf_LeftToRight(_regexTree.FindOptimizations.LeadingPrefix);
+ case FindNextStartingPositionMode.LeadingString_LeftToRight:
+ case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
+ EmitIndexOf_LeftToRight();
break;
- case FindNextStartingPositionMode.LeadingPrefix_RightToLeft:
- Debug.Assert(!string.IsNullOrEmpty(_regexTree.FindOptimizations.LeadingPrefix));
- EmitIndexOf_RightToLeft(_regexTree.FindOptimizations.LeadingPrefix);
+ case FindNextStartingPositionMode.LeadingString_RightToLeft:
+ EmitIndexOf_RightToLeft();
break;
case FindNextStartingPositionMode.LeadingSet_LeftToRight:
- case FindNextStartingPositionMode.FixedSets_LeftToRight:
- Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 });
+ case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight:
EmitFixedSet_LeftToRight();
break;
case FindNextStartingPositionMode.LeadingSet_RightToLeft:
- Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 });
EmitFixedSet_RightToLeft();
break;
case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight:
- Debug.Assert(_regexTree.FindOptimizations.LiteralAfterLoop is not null);
EmitLiteralAfterAtomicLoop();
break;
@@ -707,16 +703,27 @@ bool EmitAnchors()
return false;
}
- // Emits a case-sensitive prefix search for a string at the beginning of the pattern.
- void EmitIndexOf_LeftToRight(string prefix)
+ // Emits a case-sensitive left-to-right search for a substring.
+ void EmitIndexOf_LeftToRight()
{
+ RegexFindOptimizations opts = _regexTree.FindOptimizations;
+ Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight);
+
using RentedLocalBuilder i = RentInt32Local();
// int i = inputSpan.Slice(pos).IndexOf(prefix);
Ldloca(inputSpan);
Ldloc(pos);
+ if (opts.FindMode == FindNextStartingPositionMode.FixedDistanceString_LeftToRight &&
+ opts.FixedDistanceLiteral is { Distance: > 0 } literal)
+ {
+ Ldc(literal.Distance);
+ Add();
+ }
Call(s_spanSliceIntMethod);
- Ldstr(prefix);
+ Ldstr(opts.FindMode == FindNextStartingPositionMode.LeadingString_LeftToRight ?
+ opts.LeadingPrefix :
+ opts.FixedDistanceLiteral.String!);
Call(s_stringAsSpanMethod);
Call(s_spanIndexOfSpan);
Stloc(i);
@@ -737,9 +744,12 @@ void EmitIndexOf_LeftToRight(string prefix)
Ret();
}
- // Emits a case-sensitive right-to-left prefix search for a string at the beginning of the pattern.
- void EmitIndexOf_RightToLeft(string prefix)
+ // Emits a case-sensitive right-to-left search for a substring.
+ void EmitIndexOf_RightToLeft()
{
+ string prefix = _regexTree.FindOptimizations.LeadingPrefix;
+ Debug.Assert(!string.IsNullOrEmpty(prefix));
+
// pos = inputSpan.Slice(0, pos).LastIndexOf(prefix);
Ldloca(inputSpan);
Ldc(0);
@@ -770,6 +780,8 @@ void EmitIndexOf_RightToLeft(string prefix)
// and potentially other sets at other fixed positions in the pattern.
void EmitFixedSet_LeftToRight()
{
+ Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 });
+
List<(char[]? Chars, string Set, int Distance)>? sets = _regexTree.FindOptimizations.FixedDistanceSets;
(char[]? Chars, string Set, int Distance) primarySet = sets![0];
const int MaxSets = 4;
@@ -967,6 +979,8 @@ void EmitFixedSet_LeftToRight()
// (Currently that position will always be a distance of 0, meaning the start of the pattern itself.)
void EmitFixedSet_RightToLeft()
{
+ Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 });
+
(char[]? Chars, string Set, int Distance) set = _regexTree.FindOptimizations.FixedDistanceSets![0];
Debug.Assert(set.Distance == 0);
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
index f06e1ec46142d5..51d427699c237e 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
@@ -83,8 +83,8 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
{
LeadingPrefix = prefix;
FindMode = _rightToLeft ?
- FindNextStartingPositionMode.LeadingPrefix_RightToLeft :
- FindNextStartingPositionMode.LeadingPrefix_LeftToRight;
+ FindNextStartingPositionMode.LeadingString_RightToLeft :
+ FindNextStartingPositionMode.LeadingString_LeftToRight;
return;
}
@@ -121,8 +121,8 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
{
// The set contains one and only one character, meaning every match starts
// with the same literal value (potentially case-insensitive). Search for that.
- FixedDistanceLiteral = (chars[0], 0);
- FindMode = FindNextStartingPositionMode.LeadingLiteral_RightToLeft;
+ FixedDistanceLiteral = (chars[0], null, 0);
+ FindMode = FindNextStartingPositionMode.LeadingChar_RightToLeft;
}
else
{
@@ -140,46 +140,60 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
// We're now left-to-right only and looking for sets.
- // As a backup, see if we can find a literal after a leading atomic loop. That might be better than whatever sets we find, so
- // we want to know whether we have one in our pocket before deciding whether to use a leading set.
- (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);
-
// Build up a list of all of the sets that are a fixed distance from the start of the expression.
List<(char[]? Chars, string Set, int Distance)>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, thorough: !interpreter);
Debug.Assert(fixedDistanceSets is null || fixedDistanceSets.Count != 0);
+ // See if we can make a string of at least two characters long out of those sets. We should have already caught
+ // one at the beginning of the pattern, but there may be one hiding at a non-zero fixed distance into the pattern.
+ if (fixedDistanceSets is not null &&
+ FindFixedDistanceString(fixedDistanceSets) is (string String, int Distance) bestFixedDistanceString)
+ {
+ FindMode = FindNextStartingPositionMode.FixedDistanceString_LeftToRight;
+ FixedDistanceLiteral = ('\0', bestFixedDistanceString.String, bestFixedDistanceString.Distance);
+ return;
+ }
+
+ // As a backup, see if we can find a literal after a leading atomic loop. That might be better than whatever sets we find, so
+ // we want to know whether we have one in our pocket before deciding whether to use a leading set (we'll prefer a leading
+ // set if it's something for which we can vectorize a search).
+ (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);
+
// If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support a vectorized
// search and we did successfully find a literal after an atomic loop we could search instead, we prefer the vectorizable search.
- if (fixedDistanceSets is not null &&
- (fixedDistanceSets[0].Chars is not null || literalAfterLoop is null))
+ if (fixedDistanceSets is not null)
{
- // Determine whether to do searching based on one or more sets or on a single literal. Compiled engines
- // don't need to special-case literals as they already do codegen to create the optimal lookup based on
- // the set's characteristics.
- if (!compiled &&
- fixedDistanceSets.Count == 1 &&
- fixedDistanceSets[0].Chars is { Length: 1 })
+ RegexPrefixAnalyzer.SortFixedDistanceSetsByQuality(fixedDistanceSets);
+ if (fixedDistanceSets[0].Chars is not null || literalAfterLoop is null)
{
- FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], fixedDistanceSets[0].Distance);
- FindMode = FindNextStartingPositionMode.FixedLiteral_LeftToRight;
- }
- else
- {
- // Limit how many sets we use to avoid doing lots of unnecessary work. The list was already
- // sorted from best to worst, so just keep the first ones up to our limit.
- const int MaxSetsToUse = 3; // arbitrary tuned limit
- if (fixedDistanceSets.Count > MaxSetsToUse)
+ // Determine whether to do searching based on one or more sets or on a single literal. Compiled engines
+ // don't need to special-case literals as they already do codegen to create the optimal lookup based on
+ // the set's characteristics.
+ if (!compiled &&
+ fixedDistanceSets.Count == 1 &&
+ fixedDistanceSets[0].Chars is { Length: 1 })
{
- fixedDistanceSets.RemoveRange(MaxSetsToUse, fixedDistanceSets.Count - MaxSetsToUse);
+ FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], null, fixedDistanceSets[0].Distance);
+ FindMode = FindNextStartingPositionMode.FixedDistanceChar_LeftToRight;
}
+ else
+ {
+ // Limit how many sets we use to avoid doing lots of unnecessary work. The list was already
+ // sorted from best to worst, so just keep the first ones up to our limit.
+ const int MaxSetsToUse = 3; // arbitrary tuned limit
+ if (fixedDistanceSets.Count > MaxSetsToUse)
+ {
+ fixedDistanceSets.RemoveRange(MaxSetsToUse, fixedDistanceSets.Count - MaxSetsToUse);
+ }
- // Store the sets, and compute which mode to use.
- FixedDistanceSets = fixedDistanceSets;
- FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ? FindNextStartingPositionMode.LeadingSet_LeftToRight
- : FindNextStartingPositionMode.FixedSets_LeftToRight;
- _asciiLookups = new uint[fixedDistanceSets.Count][];
+ // Store the sets, and compute which mode to use.
+ FixedDistanceSets = fixedDistanceSets;
+ FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ? FindNextStartingPositionMode.LeadingSet_LeftToRight
+ : FindNextStartingPositionMode.FixedDistanceSets_LeftToRight;
+ _asciiLookups = new uint[fixedDistanceSets.Count][];
+ }
+ return;
}
- return;
}
// If we found a literal we can search for after a leading set loop, use it.
@@ -216,7 +230,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
public string LeadingPrefix { get; } = string.Empty;
/// When in fixed distance literal mode, gets the literal and how far it is from the start of the pattern.
- public (char Literal, int Distance) FixedDistanceLiteral { get; }
+ public (char Char, string? String, int Distance) FixedDistanceLiteral { get; }
/// When in fixed distance set mode, gets the set and how far it is from the start of the pattern.
/// The case-insensitivity of the 0th entry will always match the mode selected, but subsequent entries may not.
@@ -225,6 +239,62 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
/// When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop.
public (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? LiteralAfterLoop { get; }
+        /// <summary>Analyzes a list of fixed-distance sets to extract a case-sensitive string at a fixed distance.</summary>
+        /// <param name="fixedDistanceSets">
+        /// The sets to analyze. The list is re-sorted in place by ascending distance as a side effect.
+        /// </param>
+        /// <returns>
+        /// The longest string of at least two characters that can be formed from single-character sets at
+        /// consecutive distances, paired with the distance of its first character; null if there is none.
+        /// When several runs tie for longest, the one found last (at the greatest distance) is returned.
+        /// </returns>
+        private static (string String, int Distance)? FindFixedDistanceString(List<(char[]? Chars, string Set, int Distance)> fixedDistanceSets)
+        {
+            (string String, int Distance)? best = null;
+
+            // A result string must be at least two characters in length; therefore we require at least that many sets.
+            if (fixedDistanceSets.Count >= 2)
+            {
+                // We're walking the sets from beginning to end, so we need them sorted by distance.
+                fixedDistanceSets.Sort((s1, s2) => s1.Distance.CompareTo(s2.Distance));
+
+                Span<char> scratch = stackalloc char[64];
+                var vsb = new ValueStringBuilder(scratch);
+
+                // Looking for strings of length >= 2. The loop deliberately runs one index past the end of
+                // the list so that a run reaching the final set is flushed by the same code path.
+                int start = -1;
+                for (int i = 0; i < fixedDistanceSets.Count + 1; i++)
+                {
+                    char[]? chars = i < fixedDistanceSets.Count ? fixedDistanceSets[i].Chars : null;
+                    bool invalidChars = chars is not { Length: 1 };
+
+                    // If the current set ends a sequence (or we've walked off the end), see whether
+                    // what we've gathered constitutes a valid string, and if it's better than the
+                    // best we've already seen, store it. Regardless, reset the sequence in order
+                    // to continue analyzing.
+                    // (invalidChars is always true one past the end, so the indexers below are never
+                    // evaluated with an out-of-range index.)
+                    if (invalidChars ||
+                        (i > 0 && fixedDistanceSets[i].Distance != fixedDistanceSets[i - 1].Distance + 1))
+                    {
+                        // Each set in a run contributes exactly one character, so i - start is the run's length.
+                        if (start != -1 && i - start >= (best is null ? 2 : best.Value.String.Length))
+                        {
+                            best = (vsb.ToString(), fixedDistanceSets[start].Distance);
+                        }
+
+                        vsb = new ValueStringBuilder(scratch);
+                        start = -1;
+                        if (invalidChars)
+                        {
+                            continue;
+                        }
+                    }
+
+                    // The current set holds exactly one character: begin a new run here if none is in
+                    // progress, then extend the run with that character.
+                    if (start == -1)
+                    {
+                        start = i;
+                    }
+
+                    Debug.Assert(chars is { Length: 1 });
+                    vsb.Append(chars[0]);
+                }
+
+                vsb.Dispose();
+            }
+
+            return best;
+        }
+
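/// <summary>Try to advance to the next starting position that might be a location for a match.</summary>
/// <param name="textSpan">The text to search.</param>
/// <param name="pos">The position in <paramref name="textSpan"/>. This is updated with the found position.</param>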
@@ -379,7 +449,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos
// There's a case-sensitive prefix. Search for it with ordinal IndexOf.
- case FindNextStartingPositionMode.LeadingPrefix_LeftToRight:
+ case FindNextStartingPositionMode.LeadingString_LeftToRight:
{
int i = textSpan.Slice(pos).IndexOf(LeadingPrefix.AsSpan());
if (i >= 0)
@@ -392,7 +462,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos
return false;
}
- case FindNextStartingPositionMode.LeadingPrefix_RightToLeft:
+ case FindNextStartingPositionMode.LeadingString_RightToLeft:
{
int i = textSpan.Slice(0, pos).LastIndexOf(LeadingPrefix.AsSpan());
if (i >= 0)
@@ -407,9 +477,9 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos
// There's a literal at the beginning of the pattern. Search for it.
- case FindNextStartingPositionMode.LeadingLiteral_RightToLeft:
+ case FindNextStartingPositionMode.LeadingChar_RightToLeft:
{
- int i = textSpan.Slice(0, pos).LastIndexOf(FixedDistanceLiteral.Literal);
+ int i = textSpan.Slice(0, pos).LastIndexOf(FixedDistanceLiteral.Char);
if (i >= 0)
{
pos = i + 1;
@@ -474,11 +544,26 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos
// There's a literal at a fixed offset from the beginning of the pattern. Search for it.
- case FindNextStartingPositionMode.FixedLiteral_LeftToRight:
+ case FindNextStartingPositionMode.FixedDistanceChar_LeftToRight:
+ {
+ Debug.Assert(FixedDistanceLiteral.Distance <= MinRequiredLength);
+
+ int i = textSpan.Slice(pos + FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.Char);
+ if (i >= 0)
+ {
+ pos += i;
+ return true;
+ }
+
+ pos = textSpan.Length;
+ return false;
+ }
+
+ case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
{
Debug.Assert(FixedDistanceLiteral.Distance <= MinRequiredLength);
- int i = textSpan.Slice(pos + FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.Literal);
+ int i = textSpan.Slice(pos + FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.String.AsSpan());
if (i >= 0)
{
pos += i;
@@ -491,7 +576,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos
// There are one or more sets at fixed offsets from the start of the pattern.
- case FindNextStartingPositionMode.FixedSets_LeftToRight:
+ case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight:
{
List<(char[]? Chars, string Set, int Distance)> sets = FixedDistanceSets!;
(char[]? primaryChars, string primarySet, int primaryDistance) = sets[0];
@@ -649,25 +734,27 @@ internal enum FindNextStartingPositionMode
TrailingAnchor_FixedLength_LeftToRight_EndZ,
/// A multi-character substring at the beginning of the pattern.
- LeadingPrefix_LeftToRight,
+ LeadingString_LeftToRight,
/// A multi-character substring at the beginning of the right-to-left pattern.
- LeadingPrefix_RightToLeft,
+ LeadingString_RightToLeft,
/// A set starting the pattern.
LeadingSet_LeftToRight,
/// A set starting the right-to-left pattern.
LeadingSet_RightToLeft,
- /// A single character at a fixed distance from the start of the right-to-left pattern.
- LeadingLiteral_RightToLeft,
+ /// A single character at the start of the right-to-left pattern.
+ LeadingChar_RightToLeft,
/// A single character at a fixed distance from the start of the pattern.
- FixedLiteral_LeftToRight,
+ FixedDistanceChar_LeftToRight,
+ /// A multi-character case-sensitive string at a fixed distance from the start of the pattern.
+ FixedDistanceString_LeftToRight,
/// One or more sets at a fixed distance from the start of the pattern.
- FixedSets_LeftToRight,
+ FixedDistanceSets_LeftToRight,
- /// A literal after a non-overlapping set loop at the start of the pattern.
+ /// A literal (single character, multi-char string, or set with small number of characters) after a non-overlapping set loop at the start of the pattern.
LiteralAfterLoop_LeftToRight,
/// Nothing to search for. Nop.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
index 0a0530e33cb790..ed98d39454ac72 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@@ -226,49 +226,6 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
}
}
- // Finally, try to move the "best" results to be earlier. "best" here are ones we're able to search
- // for the fastest and that have the best chance of matching as few false positives as possible.
- results.Sort((s1, s2) =>
- {
- if (s1.Chars is not null && s2.Chars is not null)
- {
- // Then of the ones that are the same length, prefer those with less frequent values. The frequency is
- // only an approximation, used as a tie-breaker when we'd otherwise effectively be picking randomly. True
- // frequencies will vary widely based on the actual data being searched, the language of the data, etc.
- int c = SumFrequencies(s1.Chars).CompareTo(SumFrequencies(s2.Chars));
- if (c != 0)
- {
- return c;
- }
-
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- static float SumFrequencies(char[] chars)
- {
- float sum = 0;
- foreach (char c in chars)
- {
- // Lookup each character in the table. For values > 255, this will end up truncating
- // and thus we'll get skew in the data. It's already a gross approximation, though,
- // and it is primarily meant for disambiguation of ASCII letters.
- sum += s_frequency[(byte)c];
- }
- return sum;
- }
- }
- else if (s1.Chars is not null)
- {
- // If s1 has chars and s2 doesn't, then s1 has fewer chars.
- return -1;
- }
- else if (s2.Chars is not null)
- {
- // If s2 has chars and s1 doesn't, then s2 has fewer chars.
- return 1;
- }
-
- return s1.Distance.CompareTo(s2.Distance);
- });
-
return results;
// Starting from the specified root node, populates results with any characters at a fixed distance
@@ -478,6 +435,51 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in
}
}
+        /// <summary>Sorts a set of fixed-distance set results from best to worst quality.</summary>
+        /// <param name="results">The fixed-distance sets to reorder in place.</param>
+        public static void SortFixedDistanceSetsByQuality(List<(char[]? Chars, string Set, int Distance)> results)
+        {
+            // "Best" entries are the ones we can search for the fastest and that have the best chance of
+            // producing as few false positives as possible. The ordering applied is:
+            //   1. entries that have a character list come before entries that do not;
+            //   2. between two entries with character lists, the lower summed frequency comes first;
+            //   3. anything still tied is ordered by ascending distance.
+            results.Sort((left, right) =>
+            {
+                char[]? leftChars = left.Chars;
+                char[]? rightChars = right.Chars;
+
+                if (leftChars is null || rightChars is null)
+                {
+                    // At most one side has a character list; that side, if any, has fewer chars and wins.
+                    if (leftChars is not null)
+                    {
+                        return -1;
+                    }
+
+                    if (rightChars is not null)
+                    {
+                        return 1;
+                    }
+                }
+                else
+                {
+                    // Both sides have character lists: prefer the one with less frequent values. The
+                    // frequency is only an approximation, used as a tie-breaker when we'd otherwise
+                    // effectively be picking randomly. True frequencies will vary widely based on the
+                    // actual data being searched, the language of the data, etc.
+                    int byFrequency = SumFrequencies(leftChars).CompareTo(SumFrequencies(rightChars));
+                    if (byFrequency != 0)
+                    {
+                        return byFrequency;
+                    }
+                }
+
+                return left.Distance.CompareTo(right.Distance);
+
+                [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                static float SumFrequencies(char[] chars)
+                {
+                    float total = 0;
+                    for (int i = 0; i < chars.Length; i++)
+                    {
+                        // Look up each character in the table. For values > 255, the cast truncates
+                        // and thus we get skew in the data. It's already a gross approximation, though,
+                        // and it is primarily meant for disambiguation of ASCII letters.
+                        total += s_frequency[(byte)chars[i]];
+                    }
+                    return total;
+                }
+            });
+        }
+
///
/// Computes a character class for the first character in tree. This uses a more robust algorithm
/// than is used by TryFindFixedLiterals and thus can find starting sets it couldn't. For example,
diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs
index 15e83bc2c3d87d..32db3f3ca0834b 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs
@@ -58,7 +58,7 @@ public void LeadingAnchor_LeftToRight(string pattern, RegexOptions options, int
[InlineData(@"abc\z", RegexOptions.None, (int)FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_End, 3, (int)RegexNodeKind.End)]
[InlineData(@"abc\Z", RegexOptions.None, (int)FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ, 3, (int)RegexNodeKind.EndZ)]
[InlineData(@"abc$", RegexOptions.None, (int)FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ, 3, (int)RegexNodeKind.EndZ)]
- [InlineData(@"a{4,10}$", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, 10, (int)RegexNodeKind.EndZ)]
+ [InlineData(@"a{4,10}$", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, 10, (int)RegexNodeKind.EndZ)]
[InlineData(@"(abc|defg){1,2}\z", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, 8, (int)RegexNodeKind.End)]
public void TrailingAnchor(string pattern, RegexOptions options, int expectedMode, int expectedLength, int trailingAnchor)
{
@@ -69,28 +69,28 @@ public void TrailingAnchor(string pattern, RegexOptions options, int expectedMod
}
[Theory]
- [InlineData(@"ab", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "ab")]
- [InlineData(@"ab", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "ab")]
- [InlineData(@"(a)(bc)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "abc")]
- [InlineData(@"(a)(bc)", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "bc")]
- [InlineData(@"a{10}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "aaaaaaaaaa")]
- [InlineData(@"a{10}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "aaaaaaaaaa")]
- [InlineData(@"(?>a{10,20})", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "aaaaaaaaaa")]
- [InlineData(@"(?>a{10,20})", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "aaaaaaaaaa")]
- [InlineData(@"a{3,5}?", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "aaa")]
- [InlineData(@"a{3,5}?", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "aaa")]
- [InlineData(@"ab{5}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "abbbbb")]
- [InlineData(@"ab{5}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "abbbbb")]
- [InlineData(@"ab\w", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "ab")]
- [InlineData(@"\wab", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "ab")]
- [InlineData(@"(ab){3}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "ababab")]
- [InlineData(@"(ab){3}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "ab")]
- [InlineData(@"(ab){2,4}(de){4,}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "abab")]
- [InlineData(@"(ab){2,4}(de){4,}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "de")]
- [InlineData(@"ab|(abc)|(abcd)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "ab")]
- [InlineData(@"ab|(abc)|(abcd)", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "ab")]
- [InlineData(@"ab(?=cd)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "ab")]
- [InlineData(@"ab(?=cd)", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "ab")]
+ [InlineData(@"ab", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")]
+ [InlineData(@"ab", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
+ [InlineData(@"(a)(bc)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")]
+ [InlineData(@"(a)(bc)", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "bc")]
+ [InlineData(@"a{10}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "aaaaaaaaaa")]
+ [InlineData(@"a{10}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "aaaaaaaaaa")]
+ [InlineData(@"(?>a{10,20})", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "aaaaaaaaaa")]
+ [InlineData(@"(?>a{10,20})", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "aaaaaaaaaa")]
+ [InlineData(@"a{3,5}?", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "aaa")]
+ [InlineData(@"a{3,5}?", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "aaa")]
+ [InlineData(@"ab{5}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abbbbb")]
+ [InlineData(@"ab{5}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "abbbbb")]
+ [InlineData(@"ab\w", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")]
+ [InlineData(@"\wab", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
+ [InlineData(@"(ab){3}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ababab")]
+ [InlineData(@"(ab){3}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
+ [InlineData(@"(ab){2,4}(de){4,}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abab")]
+ [InlineData(@"(ab){2,4}(de){4,}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "de")]
+ [InlineData(@"ab|(abc)|(abcd)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")]
+ [InlineData(@"ab|(abc)|(abcd)", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
+ [InlineData(@"ab(?=cd)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")]
+ [InlineData(@"ab(?=cd)", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
public void LeadingPrefix(string pattern, RegexOptions options, int expectedMode, string expectedPrefix)
{
RegexFindOptimizations opts = ComputeOptimizations(pattern, options);
@@ -128,6 +128,19 @@ public void LiteralAfterLoop(string pattern, RegexOptions options, int expectedM
Assert.Equal(expectedChar, opts.LiteralAfterLoop.Value.Literal.Char);
}
+ // Checks, for each pattern below, that the computed optimizations select the expected find mode
+ // and that FixedDistanceLiteral reports the expected string and distance.
+ [Theory]
+ [InlineData(@".ab", RegexOptions.None, (int)FindNextStartingPositionMode.FixedDistanceString_LeftToRight, "ab", 1)]
+ [InlineData(@".ab\w\w\wcdef\w\w\w\w\wghijklmnopq\w\w\w", RegexOptions.None, (int)FindNextStartingPositionMode.FixedDistanceString_LeftToRight, "ghijklmnopq", 15)]
+ [InlineData(@"a[Bb]c[Dd]ef", RegexOptions.None, (int)FindNextStartingPositionMode.FixedDistanceString_LeftToRight, "ef", 4)]
+ [InlineData(@"a[Bb]cd[Ee]fgh[Ii]", RegexOptions.None, (int)FindNextStartingPositionMode.FixedDistanceString_LeftToRight, "fgh", 5)]
+ public void FixedDistanceString(string pattern, RegexOptions options, int expectedMode, string expectedString, int distance)
+ {
+     // The inline data supplies the mode as an int; convert it back to the enum before comparing.
+     var expectedFindMode = (FindNextStartingPositionMode)expectedMode;
+
+     RegexFindOptimizations optimizations = ComputeOptimizations(pattern, options);
+
+     // Mode is asserted first, then the string and distance of the fixed-distance literal.
+     Assert.Equal(expectedFindMode, optimizations.FindMode);
+     Assert.Equal(expectedString, optimizations.FixedDistanceLiteral.String);
+     Assert.Equal(distance, optimizations.FixedDistanceLiteral.Distance);
+ }
+
private static RegexFindOptimizations ComputeOptimizations(string pattern, RegexOptions options)
{
RegexTree tree = RegexParser.Parse(pattern, options, CultureInfo.InvariantCulture);