diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 29d02837403b8b..7ed83bee772fa5 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -490,29 +490,25 @@ private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter w // Emit the code for whatever find mode has been determined. switch (regexTree.FindOptimizations.FindMode) { - case FindNextStartingPositionMode.LeadingPrefix_LeftToRight: - Debug.Assert(!string.IsNullOrEmpty(regexTree.FindOptimizations.LeadingPrefix)); - EmitIndexOf_LeftToRight(regexTree.FindOptimizations.LeadingPrefix); + case FindNextStartingPositionMode.LeadingString_LeftToRight: + case FindNextStartingPositionMode.FixedDistanceString_LeftToRight: + EmitIndexOf_LeftToRight(); break; - case FindNextStartingPositionMode.LeadingPrefix_RightToLeft: - Debug.Assert(!string.IsNullOrEmpty(regexTree.FindOptimizations.LeadingPrefix)); - EmitIndexOf_RightToLeft(regexTree.FindOptimizations.LeadingPrefix); + case FindNextStartingPositionMode.LeadingString_RightToLeft: + EmitIndexOf_RightToLeft(); break; case FindNextStartingPositionMode.LeadingSet_LeftToRight: - case FindNextStartingPositionMode.FixedSets_LeftToRight: - Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight: EmitFixedSet_LeftToRight(); break; case FindNextStartingPositionMode.LeadingSet_RightToLeft: - Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); EmitFixedSet_RightToLeft(); break; case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight: - Debug.Assert(regexTree.FindOptimizations.LiteralAfterLoop is not null); EmitLiteralAfterAtomicLoop(); break; @@ -712,12 +708,40 @@ bool EmitAnchors() return false; } - // Emits a 
case-sensitive prefix search for a string at the beginning of the pattern. - void EmitIndexOf_LeftToRight(string prefix) + // Emits a case-sensitive left-to-right search for a substring. + void EmitIndexOf_LeftToRight() { - writer.WriteLine($"// The pattern begins with a literal {Literal(prefix)}. Find the next occurrence."); + RegexFindOptimizations opts = regexTree.FindOptimizations; + + string substring = ""; + string offset = ""; + string offsetDescription = "at the beginning of the pattern"; + + switch (opts.FindMode) + { + case FindNextStartingPositionMode.LeadingString_LeftToRight: + substring = regexTree.FindOptimizations.LeadingPrefix; + Debug.Assert(!string.IsNullOrEmpty(substring)); + break; + + case FindNextStartingPositionMode.FixedDistanceString_LeftToRight: + Debug.Assert(!string.IsNullOrEmpty(regexTree.FindOptimizations.FixedDistanceLiteral.String)); + substring = regexTree.FindOptimizations.FixedDistanceLiteral.String; + if (regexTree.FindOptimizations.FixedDistanceLiteral is { Distance: > 0 } literal) + { + offset = $" + {literal.Distance}"; + offsetDescription = $" at index {literal.Distance} in the pattern"; + } + break; + + default: + Debug.Fail($"Unexpected mode: {opts.FindMode}"); + break; + } + + writer.WriteLine($"// The pattern has the literal {Literal(substring)} {offsetDescription}. Find the next occurrence."); writer.WriteLine($"// If it can't be found, there's no match."); - writer.WriteLine($"int i = inputSpan.Slice(pos).IndexOf({Literal(prefix)});"); + writer.WriteLine($"int i = inputSpan.Slice(pos{offset}).IndexOf({Literal(substring)});"); using (EmitBlock(writer, "if (i >= 0)")) { writer.WriteLine("base.runtextpos = pos + i;"); @@ -725,9 +749,11 @@ void EmitIndexOf_LeftToRight(string prefix) } } - // Emits a case-sensitive right-to-left prefix search for a string at the beginning of the pattern. - void EmitIndexOf_RightToLeft(string prefix) + // Emits a case-sensitive right-to-left search for a substring. 
+ void EmitIndexOf_RightToLeft() { + string prefix = regexTree.FindOptimizations.LeadingPrefix; + writer.WriteLine($"// The pattern begins with a literal {Literal(prefix)}. Find the next occurrence right-to-left."); writer.WriteLine($"// If it can't be found, there's no match."); writer.WriteLine($"pos = inputSpan.Slice(0, pos).LastIndexOf({Literal(prefix)});"); @@ -742,6 +768,8 @@ void EmitIndexOf_RightToLeft(string prefix) // and potentially other sets at other fixed positions in the pattern. void EmitFixedSet_LeftToRight() { + Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + List<(char[]? Chars, string Set, int Distance)>? sets = regexTree.FindOptimizations.FixedDistanceSets; (char[]? Chars, string Set, int Distance) primarySet = sets![0]; const int MaxSets = 4; @@ -865,6 +893,8 @@ void EmitFixedSet_LeftToRight() // (Currently that position will always be a distance of 0, meaning the start of the pattern itself.) void EmitFixedSet_RightToLeft() { + Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + (char[]? Chars, string Set, int Distance) set = regexTree.FindOptimizations.FixedDistanceSets![0]; Debug.Assert(set.Distance == 0); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 213782d2eb485c..d35de929762f84 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -1434,6 +1434,11 @@ private static RegexCharClass ParseRecursive(string charClass, int start) return ranges; } + /// Cache of character class strings for single ASCII characters. 
+ private static readonly string[] s_asciiStrings = new string[128]; + /// Cache of character class strings for pairs of upper/lower-case ASCII letters. + private static readonly string[] s_asciiLetterPairStrings = new string[26]; + /// Creates a set string for a single character. /// The character for which to create the set. /// The create set string. @@ -1450,44 +1455,32 @@ internal static unsafe string CharsToStringClass(ReadOnlySpan chars) } #endif - // If there aren't any chars, just return an empty class. - if (chars.Length == 0) + switch (chars.Length) { - return EmptyClass; - } + case 0: + // If there aren't any chars, just return an empty class. + return EmptyClass; - if (chars.Length == 2) - { - switch (chars[0], chars[1]) - { - case ('A', 'a'): case ('a', 'A'): return "\0\x0004\0ABab"; - case ('B', 'b'): case ('b', 'B'): return "\0\x0004\0BCbc"; - case ('C', 'c'): case ('c', 'C'): return "\0\x0004\0CDcd"; - case ('D', 'd'): case ('d', 'D'): return "\0\x0004\0DEde"; - case ('E', 'e'): case ('e', 'E'): return "\0\x0004\0EFef"; - case ('F', 'f'): case ('f', 'F'): return "\0\x0004\0FGfg"; - case ('G', 'g'): case ('g', 'G'): return "\0\x0004\0GHgh"; - case ('H', 'h'): case ('h', 'H'): return "\0\x0004\0HIhi"; - // 'I' and 'i' are missing since depending on the cultuure they may - // have additional mappings. - case ('J', 'j'): case ('j', 'J'): return "\0\x0004\0JKjk"; - // 'K' and 'k' are missing since their mapping also includes Kelvin K. 
- case ('L', 'l'): case ('l', 'L'): return "\0\x0004\0LMlm"; - case ('M', 'm'): case ('m', 'M'): return "\0\x0004\0MNmn"; - case ('N', 'n'): case ('n', 'N'): return "\0\x0004\0NOno"; - case ('O', 'o'): case ('o', 'O'): return "\0\x0004\0OPop"; - case ('P', 'p'): case ('p', 'P'): return "\0\x0004\0PQpq"; - case ('Q', 'q'): case ('q', 'Q'): return "\0\x0004\0QRqr"; - case ('R', 'r'): case ('r', 'R'): return "\0\x0004\0RSrs"; - case ('S', 's'): case ('s', 'S'): return "\0\x0004\0STst"; - case ('T', 't'): case ('t', 'T'): return "\0\x0004\0TUtu"; - case ('U', 'u'): case ('u', 'U'): return "\0\x0004\0UVuv"; - case ('V', 'v'): case ('v', 'V'): return "\0\x0004\0VWvw"; - case ('W', 'w'): case ('w', 'W'): return "\0\x0004\0WXwx"; - case ('X', 'x'): case ('x', 'X'): return "\0\x0004\0XYxy"; - case ('Y', 'y'): case ('y', 'Y'): return "\0\x0004\0YZyz"; - case ('Z', 'z'): case ('z', 'Z'): return "\0\x0004\0Z[z{"; - } + case 1: + // Special-case ASCII characters to avoid the computation/allocation in this very common case. + if (chars[0] < 128) + { + string[] asciiStrings = s_asciiStrings; + if (chars[0] < asciiStrings.Length) + { + return asciiStrings[chars[0]] ??= $"\0\u0002\0{chars[0]}{(char)(chars[0] + 1)}"; + } + } + break; + + case 2: + // Special-case cased ASCII letter pairs to avoid the computation/allocation in this very common case. + int masked0 = chars[0] | 0x20; + if ((uint)(masked0 - 'a') <= 'z' - 'a' && masked0 == (chars[1] | 0x20)) + { + return s_asciiLetterPairStrings[masked0 - 'a'] ??= $"\0\u0004\0{(char)(masked0 & ~0x20)}{(char)((masked0 & ~0x20) + 1)}{(char)masked0}{(char)(masked0 + 1)}"; + } + break; } // Count how many characters there actually are. 
All but the very last possible @@ -1514,8 +1507,8 @@ internal static unsafe string CharsToStringClass(ReadOnlySpan chars) // Fill in the set string span[FlagsIndex] = (char)0; - span[CategoryLengthIndex] = (char)0; span[SetLengthIndex] = (char)(span.Length - SetStartIndex); + span[CategoryLengthIndex] = (char)0; int i = SetStartIndex; foreach (char c in chars) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index a4fe958ab3a552..de6f322ac10357 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -417,29 +417,25 @@ protected void EmitTryFindNextPossibleStartingPosition() // Either anchors weren't specified, or they don't completely root all matches to a specific location. 
switch (_regexTree.FindOptimizations.FindMode) { - case FindNextStartingPositionMode.LeadingPrefix_LeftToRight: - Debug.Assert(!string.IsNullOrEmpty(_regexTree.FindOptimizations.LeadingPrefix)); - EmitIndexOf_LeftToRight(_regexTree.FindOptimizations.LeadingPrefix); + case FindNextStartingPositionMode.LeadingString_LeftToRight: + case FindNextStartingPositionMode.FixedDistanceString_LeftToRight: + EmitIndexOf_LeftToRight(); break; - case FindNextStartingPositionMode.LeadingPrefix_RightToLeft: - Debug.Assert(!string.IsNullOrEmpty(_regexTree.FindOptimizations.LeadingPrefix)); - EmitIndexOf_RightToLeft(_regexTree.FindOptimizations.LeadingPrefix); + case FindNextStartingPositionMode.LeadingString_RightToLeft: + EmitIndexOf_RightToLeft(); break; case FindNextStartingPositionMode.LeadingSet_LeftToRight: - case FindNextStartingPositionMode.FixedSets_LeftToRight: - Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight: EmitFixedSet_LeftToRight(); break; case FindNextStartingPositionMode.LeadingSet_RightToLeft: - Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); EmitFixedSet_RightToLeft(); break; case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight: - Debug.Assert(_regexTree.FindOptimizations.LiteralAfterLoop is not null); EmitLiteralAfterAtomicLoop(); break; @@ -707,16 +703,27 @@ bool EmitAnchors() return false; } - // Emits a case-sensitive prefix search for a string at the beginning of the pattern. - void EmitIndexOf_LeftToRight(string prefix) + // Emits a case-sensitive left-to-right search for a substring. 
+ void EmitIndexOf_LeftToRight() { + RegexFindOptimizations opts = _regexTree.FindOptimizations; + Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight); + using RentedLocalBuilder i = RentInt32Local(); // int i = inputSpan.Slice(pos).IndexOf(prefix); Ldloca(inputSpan); Ldloc(pos); + if (opts.FindMode == FindNextStartingPositionMode.FixedDistanceString_LeftToRight && + opts.FixedDistanceLiteral is { Distance: > 0 } literal) + { + Ldc(literal.Distance); + Add(); + } Call(s_spanSliceIntMethod); - Ldstr(prefix); + Ldstr(opts.FindMode == FindNextStartingPositionMode.LeadingString_LeftToRight ? + opts.LeadingPrefix : + opts.FixedDistanceLiteral.String!); Call(s_stringAsSpanMethod); Call(s_spanIndexOfSpan); Stloc(i); @@ -737,9 +744,12 @@ void EmitIndexOf_LeftToRight(string prefix) Ret(); } - // Emits a case-sensitive right-to-left prefix search for a string at the beginning of the pattern. - void EmitIndexOf_RightToLeft(string prefix) + // Emits a case-sensitive right-to-left search for a substring. + void EmitIndexOf_RightToLeft() { + string prefix = _regexTree.FindOptimizations.LeadingPrefix; + Debug.Assert(!string.IsNullOrEmpty(prefix)); + // pos = inputSpan.Slice(0, pos).LastIndexOf(prefix); Ldloca(inputSpan); Ldc(0); @@ -770,6 +780,8 @@ void EmitIndexOf_RightToLeft(string prefix) // and potentially other sets at other fixed positions in the pattern. void EmitFixedSet_LeftToRight() { + Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + List<(char[]? Chars, string Set, int Distance)>? sets = _regexTree.FindOptimizations.FixedDistanceSets; (char[]? Chars, string Set, int Distance) primarySet = sets![0]; const int MaxSets = 4; @@ -967,6 +979,8 @@ void EmitFixedSet_LeftToRight() // (Currently that position will always be a distance of 0, meaning the start of the pattern itself.) 
void EmitFixedSet_RightToLeft() { + Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + (char[]? Chars, string Set, int Distance) set = _regexTree.FindOptimizations.FixedDistanceSets![0]; Debug.Assert(set.Distance == 0); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index f06e1ec46142d5..51d427699c237e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -83,8 +83,8 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) { LeadingPrefix = prefix; FindMode = _rightToLeft ? - FindNextStartingPositionMode.LeadingPrefix_RightToLeft : - FindNextStartingPositionMode.LeadingPrefix_LeftToRight; + FindNextStartingPositionMode.LeadingString_RightToLeft : + FindNextStartingPositionMode.LeadingString_LeftToRight; return; } @@ -121,8 +121,8 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) { // The set contains one and only one character, meaning every match starts // with the same literal value (potentially case-insensitive). Search for that. - FixedDistanceLiteral = (chars[0], 0); - FindMode = FindNextStartingPositionMode.LeadingLiteral_RightToLeft; + FixedDistanceLiteral = (chars[0], null, 0); + FindMode = FindNextStartingPositionMode.LeadingChar_RightToLeft; } else { @@ -140,46 +140,60 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) // We're now left-to-right only and looking for sets. - // As a backup, see if we can find a literal after a leading atomic loop. That might be better than whatever sets we find, so - // we want to know whether we have one in our pocket before deciding whether to use a leading set. 
- (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root); - // Build up a list of all of the sets that are a fixed distance from the start of the expression. List<(char[]? Chars, string Set, int Distance)>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, thorough: !interpreter); Debug.Assert(fixedDistanceSets is null || fixedDistanceSets.Count != 0); + // See if we can make a string of at least two characters long out of those sets. We should have already caught + // one at the beginning of the pattern, but there may be one hiding at a non-zero fixed distance into the pattern. + if (fixedDistanceSets is not null && + FindFixedDistanceString(fixedDistanceSets) is (string String, int Distance) bestFixedDistanceString) + { + FindMode = FindNextStartingPositionMode.FixedDistanceString_LeftToRight; + FixedDistanceLiteral = ('\0', bestFixedDistanceString.String, bestFixedDistanceString.Distance); + return; + } + + // As a backup, see if we can find a literal after a leading atomic loop. That might be better than whatever sets we find, so + // we want to know whether we have one in our pocket before deciding whether to use a leading set (we'll prefer a leading + // set if it's something for which we can vectorize a search). + (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root); + // If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support a vectorized // search and we did successfully find a literal after an atomic loop we could search instead, we prefer the vectorizable search. 
- if (fixedDistanceSets is not null && - (fixedDistanceSets[0].Chars is not null || literalAfterLoop is null)) + if (fixedDistanceSets is not null) { - // Determine whether to do searching based on one or more sets or on a single literal. Compiled engines - // don't need to special-case literals as they already do codegen to create the optimal lookup based on - // the set's characteristics. - if (!compiled && - fixedDistanceSets.Count == 1 && - fixedDistanceSets[0].Chars is { Length: 1 }) + RegexPrefixAnalyzer.SortFixedDistanceSetsByQuality(fixedDistanceSets); + if (fixedDistanceSets[0].Chars is not null || literalAfterLoop is null) { - FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], fixedDistanceSets[0].Distance); - FindMode = FindNextStartingPositionMode.FixedLiteral_LeftToRight; - } - else - { - // Limit how many sets we use to avoid doing lots of unnecessary work. The list was already - // sorted from best to worst, so just keep the first ones up to our limit. - const int MaxSetsToUse = 3; // arbitrary tuned limit - if (fixedDistanceSets.Count > MaxSetsToUse) + // Determine whether to do searching based on one or more sets or on a single literal. Compiled engines + // don't need to special-case literals as they already do codegen to create the optimal lookup based on + // the set's characteristics. + if (!compiled && + fixedDistanceSets.Count == 1 && + fixedDistanceSets[0].Chars is { Length: 1 }) { - fixedDistanceSets.RemoveRange(MaxSetsToUse, fixedDistanceSets.Count - MaxSetsToUse); + FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], null, fixedDistanceSets[0].Distance); + FindMode = FindNextStartingPositionMode.FixedDistanceChar_LeftToRight; } + else + { + // Limit how many sets we use to avoid doing lots of unnecessary work. The list was already + // sorted from best to worst, so just keep the first ones up to our limit. 
+ const int MaxSetsToUse = 3; // arbitrary tuned limit + if (fixedDistanceSets.Count > MaxSetsToUse) + { + fixedDistanceSets.RemoveRange(MaxSetsToUse, fixedDistanceSets.Count - MaxSetsToUse); + } - // Store the sets, and compute which mode to use. - FixedDistanceSets = fixedDistanceSets; - FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ? FindNextStartingPositionMode.LeadingSet_LeftToRight - : FindNextStartingPositionMode.FixedSets_LeftToRight; - _asciiLookups = new uint[fixedDistanceSets.Count][]; + // Store the sets, and compute which mode to use. + FixedDistanceSets = fixedDistanceSets; + FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ? FindNextStartingPositionMode.LeadingSet_LeftToRight + : FindNextStartingPositionMode.FixedDistanceSets_LeftToRight; + _asciiLookups = new uint[fixedDistanceSets.Count][]; + } + return; } - return; } // If we found a literal we can search for after a leading set loop, use it. @@ -216,7 +230,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) public string LeadingPrefix { get; } = string.Empty; /// When in fixed distance literal mode, gets the literal and how far it is from the start of the pattern. - public (char Literal, int Distance) FixedDistanceLiteral { get; } + public (char Char, string? String, int Distance) FixedDistanceLiteral { get; } /// When in fixed distance set mode, gets the set and how far it is from the start of the pattern. /// The case-insensitivity of the 0th entry will always match the mode selected, but subsequent entries may not. @@ -225,6 +239,62 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) /// When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop. public (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? 
LiteralAfterLoop { get; } + /// Analyzes a list of fixed-distance sets to extract a case-sensitive string at a fixed distance. + private static (string String, int Distance)? FindFixedDistanceString(List<(char[]? Chars, string Set, int Distance)> fixedDistanceSets) + { + (string String, int Distance)? best = null; + + // A result string must be at least two characters in length; therefore we require at least that many sets. + if (fixedDistanceSets.Count >= 2) + { + // We're walking the sets from beginning to end, so we need them sorted by distance. + fixedDistanceSets.Sort((s1, s2) => s1.Distance.CompareTo(s2.Distance)); + + Span scratch = stackalloc char[64]; + var vsb = new ValueStringBuilder(scratch); + + // Looking for strings of length >= 2 + int start = -1; + for (int i = 0; i < fixedDistanceSets.Count + 1; i++) + { + char[]? chars = i < fixedDistanceSets.Count ? fixedDistanceSets[i].Chars : null; + bool invalidChars = chars is not { Length: 1 }; + + // If the current set ends a sequence (or we've walked off the end), see whether + // what we've gathered constitutes a valid string, and if it's better than the + // best we've already seen, store it. Regardless, reset the sequence in order + // to continue analyzing. + if (invalidChars || + (i > 0 && fixedDistanceSets[i].Distance != fixedDistanceSets[i - 1].Distance + 1)) + { + if (start != -1 && i - start >= (best is null ? 2 : best.Value.String.Length)) + { + best = (vsb.ToString(), fixedDistanceSets[start].Distance); + } + + vsb = new ValueStringBuilder(scratch); + start = -1; + if (invalidChars) + { + continue; + } + } + + if (start == -1) + { + start = i; + } + + Debug.Assert(chars is { Length: 1 }); + vsb.Append(chars[0]); + } + + vsb.Dispose(); + } + + return best; + } + /// Try to advance to the next starting position that might be a location for a match. /// The text to search. /// The position in . This is updated with the found position. 
@@ -379,7 +449,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos // There's a case-sensitive prefix. Search for it with ordinal IndexOf. - case FindNextStartingPositionMode.LeadingPrefix_LeftToRight: + case FindNextStartingPositionMode.LeadingString_LeftToRight: { int i = textSpan.Slice(pos).IndexOf(LeadingPrefix.AsSpan()); if (i >= 0) @@ -392,7 +462,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos return false; } - case FindNextStartingPositionMode.LeadingPrefix_RightToLeft: + case FindNextStartingPositionMode.LeadingString_RightToLeft: { int i = textSpan.Slice(0, pos).LastIndexOf(LeadingPrefix.AsSpan()); if (i >= 0) @@ -407,9 +477,9 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos // There's a literal at the beginning of the pattern. Search for it. - case FindNextStartingPositionMode.LeadingLiteral_RightToLeft: + case FindNextStartingPositionMode.LeadingChar_RightToLeft: { - int i = textSpan.Slice(0, pos).LastIndexOf(FixedDistanceLiteral.Literal); + int i = textSpan.Slice(0, pos).LastIndexOf(FixedDistanceLiteral.Char); if (i >= 0) { pos = i + 1; @@ -474,11 +544,26 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos // There's a literal at a fixed offset from the beginning of the pattern. Search for it. 
- case FindNextStartingPositionMode.FixedLiteral_LeftToRight: + case FindNextStartingPositionMode.FixedDistanceChar_LeftToRight: + { + Debug.Assert(FixedDistanceLiteral.Distance <= MinRequiredLength); + + int i = textSpan.Slice(pos + FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.Char); + if (i >= 0) + { + pos += i; + return true; + } + + pos = textSpan.Length; + return false; + } + + case FindNextStartingPositionMode.FixedDistanceString_LeftToRight: { Debug.Assert(FixedDistanceLiteral.Distance <= MinRequiredLength); - int i = textSpan.Slice(pos + FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.Literal); + int i = textSpan.Slice(pos + FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.String.AsSpan()); if (i >= 0) { pos += i; @@ -491,7 +576,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos // There are one or more sets at fixed offsets from the start of the pattern. - case FindNextStartingPositionMode.FixedSets_LeftToRight: + case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight: { List<(char[]? Chars, string Set, int Distance)> sets = FixedDistanceSets!; (char[]? primaryChars, string primarySet, int primaryDistance) = sets[0]; @@ -649,25 +734,27 @@ internal enum FindNextStartingPositionMode TrailingAnchor_FixedLength_LeftToRight_EndZ, /// A multi-character substring at the beginning of the pattern. - LeadingPrefix_LeftToRight, + LeadingString_LeftToRight, /// A multi-character substring at the beginning of the right-to-left pattern. - LeadingPrefix_RightToLeft, + LeadingString_RightToLeft, /// A set starting the pattern. LeadingSet_LeftToRight, /// A set starting the right-to-left pattern. LeadingSet_RightToLeft, - /// A single character at a fixed distance from the start of the right-to-left pattern. - LeadingLiteral_RightToLeft, + /// A single character at the start of the right-to-left pattern. 
+ LeadingChar_RightToLeft, /// A single character at a fixed distance from the start of the pattern. - FixedLiteral_LeftToRight, + FixedDistanceChar_LeftToRight, + /// A multi-character case-sensitive string at a fixed distance from the start of the pattern. + FixedDistanceString_LeftToRight, /// One or more sets at a fixed distance from the start of the pattern. - FixedSets_LeftToRight, + FixedDistanceSets_LeftToRight, - /// A literal after a non-overlapping set loop at the start of the pattern. + /// A literal (single character, multi-char string, or set with small number of characters) after a non-overlapping set loop at the start of the pattern. LiteralAfterLoop_LeftToRight, /// Nothing to search for. Nop. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 0a0530e33cb790..ed98d39454ac72 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -226,49 +226,6 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) } } - // Finally, try to move the "best" results to be earlier. "best" here are ones we're able to search - // for the fastest and that have the best chance of matching as few false positives as possible. - results.Sort((s1, s2) => - { - if (s1.Chars is not null && s2.Chars is not null) - { - // Then of the ones that are the same length, prefer those with less frequent values. The frequency is - // only an approximation, used as a tie-breaker when we'd otherwise effectively be picking randomly. True - // frequencies will vary widely based on the actual data being searched, the language of the data, etc. 
- int c = SumFrequencies(s1.Chars).CompareTo(SumFrequencies(s2.Chars)); - if (c != 0) - { - return c; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static float SumFrequencies(char[] chars) - { - float sum = 0; - foreach (char c in chars) - { - // Lookup each character in the table. For values > 255, this will end up truncating - // and thus we'll get skew in the data. It's already a gross approximation, though, - // and it is primarily meant for disambiguation of ASCII letters. - sum += s_frequency[(byte)c]; - } - return sum; - } - } - else if (s1.Chars is not null) - { - // If s1 has chars and s2 doesn't, then s1 has fewer chars. - return -1; - } - else if (s2.Chars is not null) - { - // If s2 has chars and s1 doesn't, then s2 has fewer chars. - return 1; - } - - return s1.Distance.CompareTo(s2.Distance); - }); - return results; // Starting from the specified root node, populates results with any characters at a fixed distance @@ -478,6 +435,51 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in } } + /// Sorts a set of fixed-distance set results from best to worst quality. + public static void SortFixedDistanceSetsByQuality(List<(char[]? Chars, string Set, int Distance)> results) => + // Finally, try to move the "best" results to be earlier. "best" here are ones we're able to search + // for the fastest and that have the best chance of matching as few false positives as possible. + results.Sort((s1, s2) => + { + if (s1.Chars is not null && s2.Chars is not null) + { + // Then of the ones that are the same length, prefer those with less frequent values. The frequency is + // only an approximation, used as a tie-breaker when we'd otherwise effectively be picking randomly. True + // frequencies will vary widely based on the actual data being searched, the language of the data, etc. 
+ int c = SumFrequencies(s1.Chars).CompareTo(SumFrequencies(s2.Chars)); + if (c != 0) + { + return c; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static float SumFrequencies(char[] chars) + { + float sum = 0; + foreach (char c in chars) + { + // Lookup each character in the table. For values > 255, this will end up truncating + // and thus we'll get skew in the data. It's already a gross approximation, though, + // and it is primarily meant for disambiguation of ASCII letters. + sum += s_frequency[(byte)c]; + } + return sum; + } + } + else if (s1.Chars is not null) + { + // If s1 has chars and s2 doesn't, then s1 has fewer chars. + return -1; + } + else if (s2.Chars is not null) + { + // If s2 has chars and s1 doesn't, then s2 has fewer chars. + return 1; + } + + return s1.Distance.CompareTo(s2.Distance); + }); + /// /// Computes a character class for the first character in tree. This uses a more robust algorithm /// than is used by TryFindFixedLiterals and thus can find starting sets it couldn't. 
For example, diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs index 15e83bc2c3d87d..32db3f3ca0834b 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs @@ -58,7 +58,7 @@ public void LeadingAnchor_LeftToRight(string pattern, RegexOptions options, int [InlineData(@"abc\z", RegexOptions.None, (int)FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_End, 3, (int)RegexNodeKind.End)] [InlineData(@"abc\Z", RegexOptions.None, (int)FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ, 3, (int)RegexNodeKind.EndZ)] [InlineData(@"abc$", RegexOptions.None, (int)FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ, 3, (int)RegexNodeKind.EndZ)] - [InlineData(@"a{4,10}$", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, 10, (int)RegexNodeKind.EndZ)] + [InlineData(@"a{4,10}$", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, 10, (int)RegexNodeKind.EndZ)] [InlineData(@"(abc|defg){1,2}\z", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, 8, (int)RegexNodeKind.End)] public void TrailingAnchor(string pattern, RegexOptions options, int expectedMode, int expectedLength, int trailingAnchor) { @@ -69,28 +69,28 @@ public void TrailingAnchor(string pattern, RegexOptions options, int expectedMod } [Theory] - [InlineData(@"ab", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "ab")] - [InlineData(@"ab", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "ab")] - [InlineData(@"(a)(bc)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "abc")] - 
[InlineData(@"(a)(bc)", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "bc")] - [InlineData(@"a{10}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "aaaaaaaaaa")] - [InlineData(@"a{10}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "aaaaaaaaaa")] - [InlineData(@"(?>a{10,20})", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "aaaaaaaaaa")] - [InlineData(@"(?>a{10,20})", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "aaaaaaaaaa")] - [InlineData(@"a{3,5}?", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "aaa")] - [InlineData(@"a{3,5}?", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "aaa")] - [InlineData(@"ab{5}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "abbbbb")] - [InlineData(@"ab{5}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "abbbbb")] - [InlineData(@"ab\w", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "ab")] - [InlineData(@"\wab", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "ab")] - [InlineData(@"(ab){3}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "ababab")] - [InlineData(@"(ab){3}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "ab")] - [InlineData(@"(ab){2,4}(de){4,}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "abab")] - [InlineData(@"(ab){2,4}(de){4,}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "de")] - [InlineData(@"ab|(abc)|(abcd)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "ab")] - [InlineData(@"ab|(abc)|(abcd)", RegexOptions.RightToLeft, 
(int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "ab")] - [InlineData(@"ab(?=cd)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingPrefix_LeftToRight, "ab")] - [InlineData(@"ab(?=cd)", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingPrefix_RightToLeft, "ab")] + [InlineData(@"ab", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")] + [InlineData(@"ab", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")] + [InlineData(@"(a)(bc)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")] + [InlineData(@"(a)(bc)", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "bc")] + [InlineData(@"a{10}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "aaaaaaaaaa")] + [InlineData(@"a{10}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "aaaaaaaaaa")] + [InlineData(@"(?>a{10,20})", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "aaaaaaaaaa")] + [InlineData(@"(?>a{10,20})", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "aaaaaaaaaa")] + [InlineData(@"a{3,5}?", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "aaa")] + [InlineData(@"a{3,5}?", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "aaa")] + [InlineData(@"ab{5}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abbbbb")] + [InlineData(@"ab{5}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "abbbbb")] + [InlineData(@"ab\w", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")] + [InlineData(@"\wab", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")] + [InlineData(@"(ab){3}", 
RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ababab")]
+        [InlineData(@"(ab){3}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
+        [InlineData(@"(ab){2,4}(de){4,}", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abab")]
+        [InlineData(@"(ab){2,4}(de){4,}", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "de")]
+        [InlineData(@"ab|(abc)|(abcd)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")]
+        [InlineData(@"ab|(abc)|(abcd)", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
+        [InlineData(@"ab(?=cd)", RegexOptions.None, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")]
+        [InlineData(@"ab(?=cd)", RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
         public void LeadingPrefix(string pattern, RegexOptions options, int expectedMode, string expectedPrefix)
         {
             RegexFindOptimizations opts = ComputeOptimizations(pattern, options);
@@ -128,6 +128,23 @@ public void LiteralAfterLoop(string pattern, RegexOptions options, int expectedM
             Assert.Equal(expectedChar, opts.LiteralAfterLoop.Value.Literal.Char);
         }
 
+        // Checks that each pattern yields the expected find mode and that the computed
+        // FixedDistanceLiteral carries the expected string at the expected distance.
+        [Theory]
+        [InlineData(@".ab", RegexOptions.None, (int)FindNextStartingPositionMode.FixedDistanceString_LeftToRight, "ab", 1)]
+        [InlineData(@".ab\w\w\wcdef\w\w\w\w\wghijklmnopq\w\w\w", RegexOptions.None, (int)FindNextStartingPositionMode.FixedDistanceString_LeftToRight, "ghijklmnopq", 15)]
+        [InlineData(@"a[Bb]c[Dd]ef", RegexOptions.None, (int)FindNextStartingPositionMode.FixedDistanceString_LeftToRight, "ef", 4)]
+        [InlineData(@"a[Bb]cd[Ee]fgh[Ii]", RegexOptions.None, (int)FindNextStartingPositionMode.FixedDistanceString_LeftToRight, "fgh", 5)]
+        public void FixedDistanceString(string pattern, RegexOptions options, int expectedMode, string expectedString, int distance)
+        {
+            RegexFindOptimizations findOptimizations = ComputeOptimizations(pattern, options);
+            FindNextStartingPositionMode expectedFindMode = (FindNextStartingPositionMode)expectedMode;
+
+            Assert.Equal(expectedFindMode, findOptimizations.FindMode);
+            Assert.Equal(expectedString, findOptimizations.FixedDistanceLiteral.String);
+            Assert.Equal(distance, findOptimizations.FixedDistanceLiteral.Distance);
+        }
+
         private static RegexFindOptimizations ComputeOptimizations(string pattern, RegexOptions options)
         {
             RegexTree tree = RegexParser.Parse(pattern, options, CultureInfo.InvariantCulture);