From 136c5f8223313fd131753229e4a52a96c85efe7c Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 19 Apr 2022 16:05:16 -0400 Subject: [PATCH 1/3] Pass more information about execution mode to RegexRunners Some engines, in particular NonBacktracking, are relatively pay-for-play, in that the more information you need, the more processing they do. However, we currently don't pass enough information down to the RegexRunner to allow the engine to take full advantage of that. Today NonBacktracking can short-circuit its evaluation if it's told IsMatch is being used, but with the new EnumerateMatches and Count (and some uses of Replace), it's still gathering up all of the capture information even though that capture information will be ignored. This commit introduces a new RegexRunnerMode enum that lets us pass down to the engine exactly what portion of the information is needed, allowing it to avoid unnecessary work. Relatedly, we can reduce the amount of work performed by Match.Tidy: if the capture information won't be used, there's no point in fixing up the positions. As part of this, I noticed we have a race condition in the new EnumerateMatches. We want to extract the index, length, and new text position from the Match object in order to populate the enumerator and result structs, but today we're doing so after the runner is returned to the cache. That means another thread could come along and start using that same Match object while we're still using it in the EnumerateMatches call. The fix is to extract the data from the Match before returning the runner. 
--- .../src/System.Text.RegularExpressions.csproj | 1 + .../System/Text/RegularExpressions/Match.cs | 45 +++-- .../RegularExpressions/MatchCollection.cs | 2 +- .../Text/RegularExpressions/Regex.Count.cs | 4 +- .../Regex.EnumerateMatches.cs | 8 +- .../Text/RegularExpressions/Regex.Match.cs | 12 +- .../Text/RegularExpressions/Regex.Replace.cs | 4 +- .../Text/RegularExpressions/Regex.Split.cs | 8 +- .../System/Text/RegularExpressions/Regex.cs | 184 ++++++++---------- .../RegularExpressions/RegexReplacement.cs | 7 +- .../Text/RegularExpressions/RegexRunner.cs | 13 +- .../RegularExpressions/RegexRunnerMode.cs | 21 ++ .../Symbolic/SymbolicRegexMatcher.cs | 37 ++-- .../Symbolic/SymbolicRegexRunnerFactory.cs | 6 +- 14 files changed, 185 insertions(+), 167 deletions(-) create mode 100644 src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index ad33ecdd61174f..225dcb01ae43e5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -47,6 +47,7 @@ + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs index 882ff118a8ef39..a20c50984d82d4 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs @@ -106,7 +106,7 @@ public Match NextMatch() Regex? r = _regex; Debug.Assert(Text != null); return r != null ? - r.RunSingleMatch(false, Length, Text, _textbeg, _textend - _textbeg, _textpos)! 
: + r.RunSingleMatch(RegexRunnerMode.CapturesRequired, Length, Text, _textbeg, _textend - _textbeg, _textpos)! : this; } @@ -272,39 +272,44 @@ internal int MatchLength(int cap) } /// Tidy the match so that it can be used as an immutable result - internal void Tidy(int textpos, int beginningOfSpanSlice) + internal void Tidy(int textpos, int beginningOfSpanSlice, RegexRunnerMode mode) { + Debug.Assert(mode != RegexRunnerMode.Existence); + int[] matchcount = _matchcount; - int[][] matches = _matches; + _capcount = matchcount[0]; // used to indicate Success + _textpos = textpos; // used to determine where to perform next match - _textpos = textpos; - _capcount = matchcount[0]; + int[][] matches = _matches; int[] interval = matches[0]; - Index = interval[0]; - Length = interval[1]; - if (_balancing) - { - TidyBalancing(); - } + Length = interval[1]; // the length of the match + Index = interval[0] + beginningOfSpanSlice; // the index of the match, adjusted for input slicing // At this point the Match is consistent for handing back to a caller, with regards to the span that was processed. // However, the caller may have actually provided a string, and may have done so with a non-zero beginning. // In such a case, all offsets need to be shifted by beginning, e.g. if beginning is 5 and a capture occurred at // offset 17, that 17 offset needs to be increased to 22 to account for the fact that it was actually 17 from the // beginning, which the implementation saw as 0 but which from the caller's perspective was 5. 
- Debug.Assert(!_balancing); - if (beginningOfSpanSlice != 0) + if (mode == RegexRunnerMode.CapturesRequired) { - Index += beginningOfSpanSlice; - for (int groupNumber = 0; groupNumber < matches.Length; groupNumber++) + if (_balancing) + { + TidyBalancing(); + } + Debug.Assert(!_balancing); + + if (beginningOfSpanSlice != 0) { - int[] captures = matches[groupNumber]; - if (captures is not null) + for (int groupNumber = 0; groupNumber < matches.Length; groupNumber++) { - int capturesLength = matchcount[groupNumber] * 2; // each capture has an offset and a length - for (int c = 0; c < capturesLength; c += 2) + int[] captures = matches[groupNumber]; + if (captures is not null) { - captures[c] += beginningOfSpanSlice; + int capturesLength = matchcount[groupNumber] * 2; // each capture has an offset and a length + for (int c = 0; c < capturesLength; c += 2) + { + captures[c] += beginningOfSpanSlice; + } } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/MatchCollection.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/MatchCollection.cs index 39007829a8907c..67475f5fe8c883 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/MatchCollection.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/MatchCollection.cs @@ -89,7 +89,7 @@ public virtual Match this[int i] Match match; do { - match = _regex.RunSingleMatch(false, _prevlen, _input, 0, _input.Length, _startat)!; + match = _regex.RunSingleMatch(RegexRunnerMode.CapturesRequired, _prevlen, _input, 0, _input.Length, _startat)!; if (!match.Success) { _done = true; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Count.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Count.cs index 1fecf8efc0c167..52f81301ef8e68 100644 --- 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Count.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Count.cs @@ -24,7 +24,7 @@ public int Count(string input) { count++; return true; - }, reuseMatchObject: true); + }, RegexRunnerMode.BoundsRequired, reuseMatchObject: true); return count; } @@ -42,7 +42,7 @@ public int Count(ReadOnlySpan input) { count++; return true; - }, reuseMatchObject: true); + }, RegexRunnerMode.BoundsRequired, reuseMatchObject: true); return count; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.EnumerateMatches.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.EnumerateMatches.cs index 44a0cca9e6d8db..a2e16b81e16ef6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.EnumerateMatches.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.EnumerateMatches.cs @@ -126,15 +126,15 @@ internal ValueMatchEnumerator(Regex regex, ReadOnlySpan input, int startAt /// public bool MoveNext() { - Match? 
match = _regex.RunSingleMatch(quick: false, _prevLen, _input, _startAt); - Debug.Assert(match != null, "Match shouldn't be null because we passed quick = false."); - if (match != RegularExpressions.Match.Empty) + (bool Success, int Index, int Length, int TextPosition) match = _regex.RunSingleMatch(RegexRunnerMode.BoundsRequired, _prevLen, _input, _startAt); + if (match.Success) { _current = new ValueMatch(match.Index, match.Length); - _startAt = match._textpos; + _startAt = match.TextPosition; _prevLen = match.Length; return true; } + return false; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs index a821c0590a6222..4027e51d72891d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs @@ -77,7 +77,7 @@ public bool IsMatch(string input) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return RunSingleMatch(quick: true, -1, input, 0, input.Length, RightToLeft ? input.Length : 0) is null; + return RunSingleMatch(RegexRunnerMode.Existence, -1, input, 0, input.Length, RightToLeft ? input.Length : 0) is null; } /// @@ -87,7 +87,7 @@ public bool IsMatch(string input) /// if the regular expression finds a match; otherwise, . /// A time-out ocurred. public bool IsMatch(ReadOnlySpan input) => - RunSingleMatch(quick: true, -1, input, RightToLeft ? input.Length : 0) is null; + RunSingleMatch(RegexRunnerMode.Existence, -1, input, RightToLeft ? 
input.Length : 0).Success; /// /// Searches the input string for one or more matches using the previous pattern and options, @@ -100,7 +100,7 @@ public bool IsMatch(string input, int startat) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return RunSingleMatch(quick: true, -1, input, 0, input.Length, startat) is null; + return RunSingleMatch(RegexRunnerMode.Existence, -1, input, 0, input.Length, startat) is null; } /// @@ -132,7 +132,7 @@ public Match Match(string input) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return RunSingleMatch(quick: false, -1, input, 0, input.Length, RightToLeft ? input.Length : 0)!; + return RunSingleMatch(RegexRunnerMode.CapturesRequired, -1, input, 0, input.Length, RightToLeft ? input.Length : 0)!; } /// @@ -146,7 +146,7 @@ public Match Match(string input, int startat) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return RunSingleMatch(quick: false, -1, input, 0, input.Length, startat)!; + return RunSingleMatch(RegexRunnerMode.CapturesRequired, -1, input, 0, input.Length, startat)!; } /// @@ -159,7 +159,7 @@ public Match Match(string input, int beginning, int length) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return RunSingleMatch(quick: false, -1, input, beginning, length, RightToLeft ? beginning + length : beginning)!; + return RunSingleMatch(RegexRunnerMode.CapturesRequired, -1, input, beginning, length, RightToLeft ? 
beginning + length : beginning)!; } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs index b29317f6ef3877..2bae094f3e64ec 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs @@ -182,7 +182,7 @@ private static string Replace(MatchEvaluator evaluator, Regex regex, string inpu state.prevat = match.Index + match.Length; state.segments.Add(state.evaluator(match).AsMemory()); return --state.count != 0; - }, reuseMatchObject: false); + }, RegexRunnerMode.CapturesRequired, reuseMatchObject: false); if (state.segments.Count == 0) { @@ -201,7 +201,7 @@ private static string Replace(MatchEvaluator evaluator, Regex regex, string inpu state.prevat = match.Index; state.segments.Add(state.evaluator(match).AsMemory()); return --state.count != 0; - }, reuseMatchObject: false); + }, RegexRunnerMode.CapturesRequired, reuseMatchObject: false); if (state.segments.Count == 0) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs index 359f48e8b89ccf..c6f5c0aea0009c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs @@ -100,12 +100,12 @@ private static string[] Split(Regex regex, string input, int count, int startat) { if (match.IsMatched(i)) { - state.results.Add(match.Groups[i].ToString()); + state.results.Add(match.Groups[i].Value); } } return --state.count != 0; - }, reuseMatchObject: true); + }, RegexRunnerMode.CapturesRequired, reuseMatchObject: true); if 
(state.results.Count == 0) { @@ -128,12 +128,12 @@ private static string[] Split(Regex regex, string input, int count, int startat) { if (match.IsMatched(i)) { - state.results.Add(match.Groups[i].ToString()); + state.results.Add(match.Groups[i].Value); } } return --state.count != 0; - }, reuseMatchObject: true); + }, RegexRunnerMode.CapturesRequired, reuseMatchObject: true); if (state.results.Count == 0) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index f148f6660b747c..3ca623966766c6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -363,7 +363,7 @@ protected void InitializeReferences() } /// Internal worker which will scan the passed in string for a match. Used by public APIs. - internal Match? RunSingleMatch(bool quick, int prevlen, string input, int beginning, int length, int startat) + internal Match? RunSingleMatch(RegexRunnerMode mode, int prevlen, string input, int beginning, int length, int startat) { if ((uint)startat > (uint)input.Length) { @@ -380,22 +380,28 @@ protected void InitializeReferences() runner.InitializeTimeout(internalMatchTimeout); runner.runtext = input; ReadOnlySpan span = input.AsSpan(beginning, length); - runner.InitializeForScan(this, span, startat - beginning, quick); - - int stoppos = RightToLeft ? 0 : span.Length; + runner.InitializeForScan(this, span, startat - beginning, mode); // If previous match was empty or failed, advance by one before matching. if (prevlen == 0) { + int stoppos = span.Length; + int bump = 1; + if (RightToLeft) + { + stoppos = 0; + bump = -1; + } + if (runner.runtextstart == stoppos) { return RegularExpressions.Match.Empty; } - runner.runtextpos += RightToLeft ? 
-1 : 1; + runner.runtextpos += bump; } - return ScanInternal(quick, input, beginning, runner, span, returnNullIfQuick: true); + return ScanInternal(mode, reuseMatchObject: mode == RegexRunnerMode.Existence, input, beginning, runner, span, returnNullIfReuseMatchObject: true); } finally { @@ -405,8 +411,10 @@ protected void InitializeReferences() } /// Internal worker which will scan the passed in span for a match. Used by public APIs. - internal Match? RunSingleMatch(bool quick, int prevlen, ReadOnlySpan input, int startat) + internal (bool Success, int Index, int Length, int TextPosition) RunSingleMatch(RegexRunnerMode mode, int prevlen, ReadOnlySpan input, int startat) { + Debug.Assert(mode <= RegexRunnerMode.BoundsRequired); + // startat parameter is always either 0 or input.Length since public API for IsMatch doesn't have an overload // that takes in startat. Debug.Assert(startat <= input.Length); @@ -415,7 +423,7 @@ protected void InitializeReferences() try { runner.InitializeTimeout(internalMatchTimeout); - runner.InitializeForScan(this, input, startat, quick); + runner.InitializeForScan(this, input, startat, mode); // If previous match was empty or failed, advance by one before matching. if (prevlen == 0) @@ -424,7 +432,7 @@ protected void InitializeReferences() { if (runner.runtextstart == 0) { - return RegularExpressions.Match.Empty; + return (false, 0, 0, 0); } runner.runtextpos--; } @@ -432,7 +440,7 @@ protected void InitializeReferences() { if (runner.runtextstart == input.Length) { - return RegularExpressions.Match.Empty; + return (false, 0, 0, 0); } runner.runtextpos++; } @@ -443,17 +451,18 @@ protected void InitializeReferences() // If runmatch is null it means that an override of Scan didn't implement it correctly, so we will // let this null ref since there are lots of ways where you can end up in a erroneous state. 
Match match = runner.runmatch!; - if (match!.FoundMatch) + if (match.FoundMatch) { - if (quick) + if (mode == RegexRunnerMode.Existence) { - return null; + return (true, 0, 0, 0); } - match.Tidy(runner.runtextpos, 0); - return match; + + match.Tidy(runner.runtextpos, 0, mode); + return (true, match.Index, match.Length, match._textpos); } - return RegularExpressions.Match.Empty; + return (false, 0, 0, 0); } finally { @@ -462,84 +471,53 @@ protected void InitializeReferences() } /// Internal worker which will scan the passed in string for all matches, and will call for each match found. - internal void RunAllMatchesWithCallback(string input, int startat, ref TState state, MatchCallback callback, bool reuseMatchObject) - { - Debug.Assert((uint)startat <= (uint)input.Length); + internal void RunAllMatchesWithCallback(string? input, int startat, ref TState state, MatchCallback callback, RegexRunnerMode mode, bool reuseMatchObject) => + RunAllMatchesWithCallback(input, (ReadOnlySpan)input, startat, ref state, callback, mode, reuseMatchObject); - RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); - try - { - // For the string overload, we need to set runtext before starting the match attempts. - runner.runtext = input; - RunAllMatchesWithCallbackHelper(input, startat, ref state, callback, runner, usingStringOverload: true, reuseMatchObject); - } - finally - { - runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache. - _runner = runner; - } - } + internal void RunAllMatchesWithCallback(ReadOnlySpan input, int startat, ref TState state, MatchCallback callback, RegexRunnerMode mode, bool reuseMatchObject) => + RunAllMatchesWithCallback(inputString: null, input, startat, ref state, callback, mode, reuseMatchObject); - /// Internal worker which will scan the passed in string for all matches, and will call for each match found. 
- internal void RunAllMatchesWithCallback(ReadOnlySpan input, int startat, ref TState state, MatchCallback callback, bool reuseMatchObject) + private void RunAllMatchesWithCallback(string? inputString, ReadOnlySpan inputSpan, int startat, ref TState state, MatchCallback callback, RegexRunnerMode mode, bool reuseMatchObject) { - Debug.Assert((uint)startat <= (uint)input.Length); + Debug.Assert(inputString is null || inputSpan.SequenceEqual(inputString)); + Debug.Assert((uint)startat <= (uint)inputSpan.Length); RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { - RunAllMatchesWithCallbackHelper(input, startat, ref state, callback, runner, usingStringOverload: false, reuseMatchObject); - } - finally - { - _runner = runner; - } - } + runner.runtext = inputString; + runner.InitializeTimeout(internalMatchTimeout); + int runtextpos = startat; - /// - /// Helper method used by and - /// which loops to find - /// all matches on the passed in and calls for each match found. - /// - private void RunAllMatchesWithCallbackHelper(ReadOnlySpan input, int startat, ref TState state, MatchCallback callback, RegexRunner runner, bool usingStringOverload, bool reuseMatchObject) - { - runner.InitializeTimeout(internalMatchTimeout); - int runtextpos = startat; - while (true) - { - runner.InitializeForScan(this, input, startat, false); - runner.runtextpos = runtextpos; + while (true) + { + runner.InitializeForScan(this, inputSpan, startat, mode); + runner.runtextpos = runtextpos; - int stoppos = RightToLeft ? 0 : input.Length; + // We get the Match by calling Scan. 'input' parameter is used to set the Match text which is only relevant if we are using the Run string + // overload, as APIs that call the span overload (like Count) don't require match.Text to be set, so we pass null in that case. + Match? 
match = ScanInternal(mode, reuseMatchObject, inputString, 0, runner, inputSpan, returnNullIfReuseMatchObject: false); + Debug.Assert(match is not null); + + // If we failed to match again, we're done. + if (!match.Success) + { + break; + } - // We get the Match by calling Scan. 'input' parameter is used to set the Match text which is only relevante if we are using the Run string - // overload, as APIs that call the span overload (like Count) don't require match.Text to be set, so we pass null in that case. - Match? match = ScanInternal(reuseMatchObject, input: usingStringOverload ? runner.runtext : null, 0, runner, input, returnNullIfQuick: false); - Debug.Assert(match is not null); + // We got a match. Call the callback function with the match and prepare for next iteration. - // if we got a match, then call the callback function with the match and prepare for next iteration. - if (match.Success) - { if (!reuseMatchObject) { // We're not reusing match objects, so null out our field reference to the instance. - // It'll be recreated the next time one is needed. + // It'll be recreated the next time one is needed. reuseMatchObject will be false + // when the callback may expose the Match object to user code. runner.runmatch = null; } if (!callback(ref state, match)) { // If the callback returns false, we're done. - - if (usingStringOverload && reuseMatchObject) - { - // We're reusing the single match instance and we were called via the string overload - // which would have set the match's text, so clear it out as well. - // We don't do this if we're not reusing instances, as in that case we're - // dropping the whole reference to the match, and we no longer own the instance - // having handed it out to the callback. - match.Text = null; - } return; } @@ -547,42 +525,39 @@ private void RunAllMatchesWithCallbackHelper(ReadOnlySpan input, i // the current position, just as Match.NextMatch() would pass in _textpos as textstart. 
runtextpos = startat = runner.runtextpos; - // Reset state for another iteration. - runner.runtrackpos = runner.runtrack!.Length; - runner.runstackpos = runner.runstack!.Length; - runner.runcrawlpos = runner.runcrawl!.Length; - if (match.Length == 0) { - if (runner.runtextpos == stoppos) + int stoppos = inputSpan.Length; + int bump = 1; + if (RightToLeft) + { + stoppos = 0; + bump = -1; + } + + if (runtextpos == stoppos) { - if (usingStringOverload && reuseMatchObject) - { - // See above comment. - match.Text = null; - } return; } - runtextpos += RightToLeft ? -1 : 1; + runtextpos += bump; } - // Loop around to perform next match from where we left off. - continue; - } - else - { - // We failed to match at this position. If we're at the stopping point, we're done. - if (runner.runtextpos == stoppos) - { - return; - } + // Reset state for another iteration. + runner.runtrackpos = runner.runtrack!.Length; + runner.runstackpos = runner.runstack!.Length; + runner.runcrawlpos = runner.runcrawl!.Length; } } + finally + { + runner.runtext = null; // drop reference to string to avoid keeping it alive in a cache. + _runner = runner; + } } /// Helper method used by RunSingleMatch and RunAllMatchesWithCallback which calls runner.Scan to find a match on the passed in span. - private static Match? ScanInternal(bool quick, string? input, int beginning, RegexRunner runner, ReadOnlySpan span, bool returnNullIfQuick) + private static Match? ScanInternal(RegexRunnerMode mode, bool reuseMatchObject, string? input, int beginning, RegexRunner runner, ReadOnlySpan span, bool returnNullIfReuseMatchObject) { runner.Scan(span); @@ -592,26 +567,31 @@ private void RunAllMatchesWithCallbackHelper(ReadOnlySpan input, i // If we got a match, do some cleanup and return it, or return null if quick is true; if (match.FoundMatch) { - if (!quick) + if (!reuseMatchObject) { - // We're about to return the Match object. Store the input into it and remove it from the runner. 
+ // The match object is only reusable in very specific circumstances where the internal caller + // extracts only the matching information (e.g. bounds) it needs from the Match object, so + // in such situations we don't need to fill in the input value, and because it's being reused, + // we don't want to null it out in the runner. If, however, the match object isn't going to + // be reused, then we do need to finish populating it with the input text, and we do want to + // remove it from the runner so that no one else touches the object once we give it back. match.Text = input; runner.runmatch = null; } - else if (returnNullIfQuick) + else if (returnNullIfReuseMatchObject) { match.Text = null; return null; } - match.Tidy(runner.runtextpos, beginning); + match.Tidy(runner.runtextpos, beginning, mode); return match; } // We failed to match, so we will return Match.Empty which means we can reuse runmatch object. // We do however need to clear its Text in case it was set, so as to not keep it alive in some cache. 
- runner.runmatch!.Text = null; + match.Text = null; return RegularExpressions.Match.Empty; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs index 52fed822d1e123..72b8e5b4a96d18 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs @@ -23,6 +23,7 @@ internal sealed class RegexReplacement private readonly string[] _strings; // table of string constants private readonly int[] _rules; // negative -> group #, positive -> string # + private bool _hasBackreferences; // true if the replacement has any backreferences; otherwise, false /// /// Since RegexReplacement shares the same parser as Regex, @@ -72,6 +73,7 @@ public RegexReplacement(string rep, RegexNode concat, Hashtable _caps) } rules.Append(-Specials - 1 - slot); + _hasBackreferences = true; break; default: @@ -109,7 +111,6 @@ public RegexReplacement(string rep, RegexNode concat, Hashtable _caps) public static RegexReplacement GetOrCreate(WeakReference replRef, string replacement, Hashtable caps, int capsize, Hashtable capnames, RegexOptions roptions) { - if (!replRef.TryGetTarget(out RegexReplacement? repl) || !repl.Pattern.Equals(replacement)) { repl = RegexParser.ParseReplacement(replacement, roptions, caps, capsize, capnames); @@ -220,7 +221,7 @@ public string Replace(Regex regex, string input, int count, int startat) state.prevat = match.Index + match.Length; state.thisRef.ReplacementImpl(ref state.segments, match); return --state.count != 0; - }, reuseMatchObject: true); + }, _hasBackreferences ? 
RegexRunnerMode.CapturesRequired : RegexRunnerMode.BoundsRequired, reuseMatchObject: true); if (state.segments.Count == 0) { @@ -239,7 +240,7 @@ public string Replace(Regex regex, string input, int count, int startat) state.prevat = match.Index; state.thisRef.ReplacementImplRTL(ref state.segments, match); return --state.count != 0; - }, reuseMatchObject: true); + }, _hasBackreferences ? RegexRunnerMode.CapturesRequired : RegexRunnerMode.BoundsRequired, reuseMatchObject: true); if (state.segments.Count == 0) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index f9bd0c8b3024d4..293f375767cfb5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -62,8 +62,7 @@ public abstract class RegexRunner protected internal Match? runmatch; // result object protected internal Regex? runregex; // regex object - // TODO: Expose something as protected internal: https://github.com/dotnet/runtime/issues/59629 - private protected bool quick; // false if match details matter, true if only the fact that match occurred matters + private protected RegexRunnerMode _mode; // the mode in which the runner is currently operating private int _timeout; // timeout in milliseconds private bool _checkTimeout; @@ -135,10 +134,12 @@ protected internal virtual void Scan(ReadOnlySpan text) { InitializeTimeout(timeout); + RegexRunnerMode mode = quick ? 
RegexRunnerMode.Existence : RegexRunnerMode.CapturesRequired; + // We set runtext before calling InitializeForScan so that runmatch object is initialized with the text runtext = text; - InitializeForScan(regex, text, textstart, quick); + InitializeForScan(regex, text, textstart, mode); // InitializeForScan will default runtextstart and runtextend to 0 and length of string // since it is configured to work over a sliced portion of text so we adjust those values. @@ -178,7 +179,7 @@ protected internal virtual void Scan(ReadOnlySpan text) } runmatch = null; - match.Tidy(runtextpos, 0); + match.Tidy(runtextpos, 0, mode); } else { @@ -240,11 +241,11 @@ private Match InternalScan(Regex regex, int textbeg, int textend) } } - internal void InitializeForScan(Regex regex, ReadOnlySpan text, int textstart, bool quick) + internal void InitializeForScan(Regex regex, ReadOnlySpan text, int textstart, RegexRunnerMode mode) { // Store remaining arguments into fields now that we're going to start the scan. // These are referenced by the derived runner. - this.quick = quick; + _mode = mode; runregex = regex; runtextstart = textstart; runtextbeg = 0; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs new file mode 100644 index 00000000000000..42f0bd2c51e2ca --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs @@ -0,0 +1,21 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +namespace System.Text.RegularExpressions +{ + /// Represents the mode of execution for a RegexRunner. + internal enum RegexRunnerMode + { + /// The runner need only determine whether the input has a match; no additional information is required. + /// This mode is used by Regex.IsMatch. 
+ Existence, + + /// The runner needs to determine the next location and length of a match in the input; no additional information is required. + /// This mode is used by Regex.Count, Regex.EnumerateMatches, and Regex.Replace (when the replacement doesn't involve backreferences). + BoundsRequired, + + /// The runner needs to determine the next location and length of a match in the input, as well as the full details on all captures. + /// This mode is used by Regex.Match, Regex.Matches, Regex.Split, and Regex.Replace (when the replacement involves backreferences). + CapturesRequired, + } +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 90aa3e2611f3c6..74e82e9fffa2fd 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -317,11 +317,11 @@ private void CheckTimeout(long timeoutOccursAt) } /// Find a match. - /// Whether to return once we know there's a match without determining where exactly it matched. + /// The mode of execution based on the regex operation being performed. /// The input span /// The position to start search in the input span. /// Per thread data reused between calls. 
- public SymbolicMatch FindMatch(bool isMatch, ReadOnlySpan input, int startat, PerThreadData perThreadData) + public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, int startat, PerThreadData perThreadData) { Debug.Assert(startat >= 0 && startat <= input.Length, $"{nameof(startat)} == {startat}, {nameof(input.Length)} == {input.Length}"); Debug.Assert(perThreadData is not null); @@ -341,7 +341,7 @@ public SymbolicMatch FindMatch(bool isMatch, ReadOnlySpan input, int start // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after // the c as the low boundary for the starting position. - int matchEnd = FindEndPosition(input, startat, timeoutOccursAt, isMatch, out int matchStartLowBoundary, out int matchStartLengthMarker, perThreadData); + int matchEnd = FindEndPosition(input, startat, timeoutOccursAt, mode, out int matchStartLowBoundary, out int matchStartLengthMarker, perThreadData); // If there wasn't a match, we're done. if (matchEnd == NoMatchExists) @@ -351,7 +351,7 @@ public SymbolicMatch FindMatch(bool isMatch, ReadOnlySpan input, int start // A match exists. If we don't need further details, because IsMatch was used (and thus we don't // need the exact bounds of the match, captures, etc.), we're done. - if (isMatch) + if (mode == RegexRunnerMode.Existence) { return SymbolicMatch.QuickMatch; } @@ -380,11 +380,11 @@ public SymbolicMatch FindMatch(bool isMatch, ReadOnlySpan input, int start } // Phase 3: - // If there are no subcaptures, the matching process is done. For patterns with subcaptures (captures other - // than the top-level capture for the whole match), we need to do an additional pass to find their bounds. + // If there are no subcaptures (or if they're not needed), the matching process is done. 
For patterns with subcaptures + // (captures other than the top-level capture for the whole match), we need to do an additional pass to find their bounds. // Continuing for the previous example, phase 3 will be executed for the characters inside the match, aaabbbc, // and will associate the one capture (b*) with its match: bbb. - if (!HasSubcaptures) + if (!HasSubcaptures || mode < RegexRunnerMode.CapturesRequired) { return new SymbolicMatch(matchStart, matchEnd - matchStart); } @@ -399,14 +399,14 @@ public SymbolicMatch FindMatch(bool isMatch, ReadOnlySpan input, int start /// The input text. /// The starting position in the input. /// The time at which timeout occurs, if timeouts are being checked. - /// Whether this is an isMatch call. + /// The mode of execution based on the regex operation being performed. /// The last position at which the initial state was visited before the end position was found. /// Length of the match if there's a match; otherwise, -1. /// Per thread data reused between calls. /// /// A one-past-the-end index into input for the preferred match, or first final state position if mode is RegexRunnerMode.Existence, or NoMatchExists if no match exists. /// - private int FindEndPosition(ReadOnlySpan input, int i, long timeoutOccursAt, bool isMatch, out int initialStateIndex, out int matchLength, PerThreadData perThreadData) + private int FindEndPosition(ReadOnlySpan input, int i, long timeoutOccursAt, RegexRunnerMode mode, out int initialStateIndex, out int matchLength, PerThreadData perThreadData) { int endPosition = NoMatchExists; @@ -451,8 +451,8 @@ private int FindEndPosition(ReadOnlySpan input, int i, long timeoutOccursA int newEndPosition; int findResult = currentState.NfaState is not null ?
- FindEndPositionDeltas(builder, inputForInnerLoop, isMatch, ref i, ref currentState, ref matchLength, out newEndPosition) : - FindEndPositionDeltas(builder, inputForInnerLoop, isMatch, ref i, ref currentState, ref matchLength, out newEndPosition); + FindEndPositionDeltas(builder, inputForInnerLoop, mode, ref i, ref currentState, ref matchLength, out newEndPosition) : + FindEndPositionDeltas(builder, inputForInnerLoop, mode, ref i, ref currentState, ref matchLength, out newEndPosition); // If a new end position was found, commit to the matching initial state index if (newEndPosition != -1) @@ -516,7 +516,7 @@ private int FindEndPosition(ReadOnlySpan input, int i, long timeoutOccursA /// 0 if iteration completed because we reached an initial state. /// A negative value if iteration completed because we ran out of input or we failed to transition. /// - private int FindEndPositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, bool isMatch, ref int i, ref CurrentState currentState, ref int matchLength, out int endPosition) + private int FindEndPositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, RegexRunnerMode mode, ref int i, ref CurrentState currentState, ref int matchLength, out int endPosition) where TStateHandler : struct, IStateHandler { // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. @@ -536,18 +536,25 @@ private int FindEndPositionDeltas(SymbolicRegexBuilder buil // use that length to optimize subsequent matching phases. matchLength = TStateHandler.FixedLength(ref state); endPos = pos; - // If this is an isMatch call we are done, since a match is now known to exist. - if (isMatch) + + // A match is known to exist. If that's all we need to know, we're done. + if (mode == RegexRunnerMode.Existence) + { return 1; + } } // If the state is a dead end, such that we can't transition anywhere else, end the search. 
if (TStateHandler.IsDeadend(ref state)) + { return 1; + } // If there is more input available try to transition with the next character. if ((uint)pos >= (uint)input.Length || !TryTakeTransition(builder, input, pos, ref state)) + { return -1; + } // We successfully transitioned, so update our current input index to match. pos++; @@ -556,7 +563,9 @@ private int FindEndPositionDeltas(SymbolicRegexBuilder buil // If it does, we exit out in order to allow our find optimizations to kick in to hopefully more quickly // find the next possible starting location. if (TStateHandler.IsInitialState(ref state)) + { return 0; + } } } finally diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index 4e157a91c0b736..620fa635bf79c9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -58,7 +58,7 @@ internal Runner(SymbolicRegexMatcher matcher) protected internal override void Scan(ReadOnlySpan text) { // Perform the match. - SymbolicMatch pos = _matcher.FindMatch(quick, text, runtextpos, _perThreadData); + SymbolicMatch pos = _matcher.FindMatch(_mode, text, runtextpos, _perThreadData); // Transfer the result back to the RegexRunner state. if (pos.Success) @@ -66,7 +66,7 @@ protected internal override void Scan(ReadOnlySpan text) // If we successfully matched, capture the match, and then jump the current position to the end of the match. 
int start = pos.Index; int end = start + pos.Length; - if (!quick && pos.CaptureStarts != null) + if (_mode == RegexRunnerMode.CapturesRequired && pos.CaptureStarts != null) { Debug.Assert(pos.CaptureEnds != null); Debug.Assert(pos.CaptureStarts.Length == pos.CaptureEnds.Length); @@ -89,7 +89,7 @@ protected internal override void Scan(ReadOnlySpan text) { // If we failed to find a match in the entire remainder of the input, skip the current position to the end. // The calling scan loop will then exit. - runtextpos = runtextend; + runtextpos = text.Length; } } } From 8cfba16c46652136b5369bc2b7ddf38c332b0d31 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 19 Apr 2022 23:23:54 -0400 Subject: [PATCH 2/3] Address PR feedback --- .../src/README.md | 243 ------------------ .../System/Text/RegularExpressions/Match.cs | 2 +- .../Text/RegularExpressions/Regex.Match.cs | 6 +- .../System/Text/RegularExpressions/Regex.cs | 8 +- .../Text/RegularExpressions/RegexRunner.cs | 22 +- .../RegularExpressions/RegexRunnerMode.cs | 2 +- .../Symbolic/SymbolicMatch.cs | 2 +- .../Symbolic/SymbolicRegexMatcher.cs | 8 +- 8 files changed, 20 insertions(+), 273 deletions(-) delete mode 100644 src/libraries/System.Text.RegularExpressions/src/README.md diff --git a/src/libraries/System.Text.RegularExpressions/src/README.md b/src/libraries/System.Text.RegularExpressions/src/README.md deleted file mode 100644 index bdc154e86bd78d..00000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/README.md +++ /dev/null @@ -1,243 +0,0 @@ -# Implementation of System.Text.RegularExpressions - -The implementation uses a typical NFA approach that supports back references. Patterns are parsed into a tree (`RegexTree`), translated into an intermediate representation (`RegexCode`) by a writer (`RegexWriter`), and then either used in an interpreter (`RegexInterpreter`) or compiled to IL which is executed (`CompiledRegexRunner`). 
Both of these derive from `RegexRunner`: in the case of the compiled runner, one must generate them from the `RegexCode` using a factory. - -Regex engines have different features: .NET regular expressions have a couple that others do not have, such as `Capture`s (distinct from `Group`s). It does not support searching UTF-8 text, nor searching a Span over a buffer. - -Unlike some DFA based engines, patterns must be trusted. Text may be untrusted with the use of a timeout to prevent catastrophic backtracking. - -Performance is important and we welcome optimizations so long as they preserve the public contract. - -## Extensibility - -Key types have significant protected (including protected internal) surface area. This is not intended as an general extensibility point, but rather as a detail of implementing saving a compiled regex to disk. Saving to disk is implemented by saving an assembly containing three types, one that derives from each of `Regex`, `RegexRunnerFactory`, and `RegexRunner`. This mechanism accounts for all the protected methods (and even protected fields) on these classes. If we were designing them today, we would likely more carefully limit their public surface, and possibly not rely on derived types. - -Protected members are part of the public API which cannot be broken, so they may potentially make some future optimizations more difficult - or even features - especially the fields. For example the string `runtext` is exposed as a protected internal field on `RegexRunner`. If we wanted to expose the text instead as, for example, a `ReadOnlyMemory`, in order to make it possible to use Regex over other sources beyond string, we would have to find a creative way to preserve compatibility. - -In particular, we must keep this API stable in order to remain compatible with regexes saved by .NET Framework. 
- -`RegexCompiler` is internal, and it is abstract for a different reason: to share implementation between `RegexLWCGCompiler` (used when `RegexOptions.Compiled` is specified to compile the regular expression in memory) and `RegexAssemblyCompiler` (used when `CompileToAssembly` is called to compile the regular expression to an assembly to persist in the file system): it is based around a field of type `System.Reflection.Emit.ILGenerator` and has protected utility methods and fields to work with it. - -## Key types - General - -### Regex (public) - -* Represents an executable regular expression with some utility static methods -* Several protected fields and methods but no derived classes exist in this implementation (see [Extensibility](#Extensibility) section above). -* Constructor sets `RegexCode` using `RegexParser` and `RegexWriter`; then, if `RegexOptions.Compiled`, compiles and holds a `RegexRunnerFactory` and clears `RegexCode`; these steps only need to be done once for this `Regex` object -* Thread-safe: no state changes are visible from concurrent threads after construction -* Various public entry points converge on `Run()` which uses the held `RegexRunner` if any; if none or in use, creates another with the held `RegexRunnerFactory` if any; if none, interprets with held `RegexCode` -* All static methods (such as `Regex.Match`) attempt to find a pre-existing `Regex` object for the requested pattern and options in the `RegexCache`. This is legitimate, since `Regex` options are thread-safe. If there is a cache hit, execution can begin immediately; if not, the cache is populated first. If the caller uses an instance instead of a static method, they are effectively performing the same caching themselves - -### RegexOptions (public) - -* `RightToLeft` is supported throughout, but as the less common case it is less optimized. 
-* `ExplicitCapture` is off by default: this is relevant to performance, as often patterns contain parentheses as a useful grouping mechanism, for example `(something){1,3}` is easier to type than the non capturing form `(?:something){1,3}`. Because explicit capture is off by default, the engine in this case will capture `something` even if it was not needed. -* There are other various options, `CaseInsensitive` in particular is commonly used - -### MatchEvaluator (public) - -### RegexCompilationInfo (public) - -* Parameters to use for regex compilation to disk -* Passed in by app to `Regex.CompileToAssembly(..)` - which is not currently implemented in .NET Core - -## Key types - Parsing - -### RegexParser - -* Converts pattern string to `RegexTree` of `RegexNode`s -* Invoked with `RegexTree Parse(string pattern, RegexOptions options...) {}` -* Also has `Escape(..)` and `Unescape(..)` methods (which serve as the implementation of the public `Regex.Escape/Unescape` methods), and parses into `RegexReplacement`s -* Does a partial prescan to prep capture slots -* As each `RegexNode` is added, it attempts to reduce (optimize) the newly formed subtree. When parsing completes, there is a final optimization of the whole tree. - -### RegexReplacement - -* Parsed replacement pattern -* Created by `RegexParser`, used in `Regex.Replace`/`Match.Result(..)` - -### RegexCharClass - -* Representation of a "character class", which defines what characters should be considered a match. It supports ranges, Unicode categories, and character class subtraction. As part of reduction / optimization of a `RegexNode` as well as during compilation, trivial character classes may be replaced by faster equivalent forms, e.g. replacing a character class that represents just one character with the corresponding "one" `RegexNode`. -* Created by `RegexParser` -* Creates packed string to be held on `RegexNode`. 
During execution, this string is passed to `CharInClass` to determine whether a given character is in the set, although the implementation (in particular in the compiler) may emit faster equivalent checks when possible. -* Has utility methods for examining the packed string, in particular for testing membership of the class (`CharInClass(..)`) - -### RegexNode - -* Node in regex parse tree -* Created by `RegexParser` -* Some nodes represent subsequent optimizations, rather than individual elements of the pattern -* Holds `Children` and `Next`. `Next` ends up pointing to the immediate parent -* Holds char or string (which may be char class), and `M` and `N` constants; these constants are node-specific values, e.g. for a loop they represent minimum and maximum iteration counts, respectively. -* Note: polymorphism was not used here: the interpretation of its fields depends on the integer Type field - -### RegexTree - -* Simple holder for root `RegexNode`, options, and a captures data structure -* Created by `RegexParser` - -### RegexWriter - -* Responsible for translating a `RegexTree` to a `RegexCode` -* Invoked by `Regex` -* Creates itself `RegexCode Write(RegexTree tree){}` - -### RegexFCD - -* Responsible for static pattern prefixes -* Created by `RegexWriter` -* Creates `RegexFC`s -* `FirstChars()` creates `RegexPrefix` from `RegexTree` -* FC means "First chars": not clear what D means... - -### RegexPrefix - -* Literal string that match must begin with - -### RegexBoyerMoore - -* Supports searching the text for literals -* Constructed by `RegexWriter` -* Singleton held on `RegexCode` -* `RegexInterpreter` uses it to perform Boyer-Moore search -* `RegexCompiler` uses the tables from this object, but generates its own code for the Boyer-Moore search - -### RegexCode - -* Abstract representation of the "program" for a particular pattern -* Created by `RegexWriter` -* Code is an array of integers. 
Within the array, op-codes' types are indicated by integer consts analogous to those on `RegexNode`. -* Has several related data structures such as a string table, a captures table, and prefixes - -## Key types - Compilation (if not interpreted) - -### RegexCompiler (public abstract) - -* Responsible for compiling `RegexCode` to a `RegexRunnerFactory` -* Has a utility method `CompileToAssembly` that invokes `RegexParser` and `RegexWriter` directly then uses `RegexAssemblyCompiler` (see note for that type) -* Key protected methods are `GenerateFindFirstChar()` and `GenerateGo()` -* Created and used only from `RegexRunnerFactory Regex.Compile(RegexCode code, RegexOptions options...)` -* Has a factory method `RegexRunnerFactory RegexCompiler.Compile(RegexCode code, RegexOptions options...)` that is implemented with its derived type `RegexLWCGCompiler` - -### RegexLWCGCompiler (is a RegexCompiler) - -* Creates a `CompiledRegexRunnerFactory` using `RegexRunnerFactory FactoryInstanceFromCode(RegexCode .. )` - -### RegexRunnerFactory (public pure abstract) - -* Reusable: creates `RegexRunner`s on demand with `RegexRunner CreateInstance()` -* Not relevant to interpreted mode -* Must be thread-safe, as each `Regex` holds one, and `Regex` is thread-safe - -### CompiledRegexRunnerFactory (is a RegexRunnerFactory) - -* Created by `RegexLWCGCompiler` -* Creates `CompiledRegexRunner` on request - -### RegexAssemblyCompiler - -* Created and used by `RegexCompiler.CompileToAssembly(...)` to write compiled regex to disk: at present, writing to disk is not implemented, because Reflection.Emit does not support it. - -## Key types - Execution - -### RegexRunner (public abstract) - -* Responsible for executing a regular expression: not thread-safe -* Reusable: each call to `Scan(..)` begins a new execution -* Lots of protected members: tracking position, execution stacks, and captures: - * `protected abstract void Go()` - * `protected abstract bool FindFirstChar()` - * `Match? 
Scan(System.Text.RegularExpressions.Regex regex, string text...)` calls `FindFirstChar()` and `Go()` -* Has a "quick" mode that does not instantiate any captures: used by `Regex.IsMatch(..)` which does not expose captures to the caller -* Concrete instances created by `Match? Regex.Run(...)` calling either `RegexRunner CompiledRegexRunnerFactory.CreateInstance()` or newing up a `RegexInterpreter` - -### RegexInterpreter (is a RegexRunner) - -* See above. Note that this is sealed. - -### CompiledRegexRunner (is a RegexRunner) - -* See above. - -## Results - -### Match (public, is a Group) - -* Represents one match of the pattern: there may be several -* Holds a `Regex` in order to call `NextMatch()` -* Created by `RegexRunner` -* `Match` and related objects are not thread-safe, unlike `Regex` itself - -### Group (public, is a Capture) - -* Represents one capturing group from the match -* Simple data holder - -### Capture (public) - -* Represents one of the potentially several captures from a capturing group; this is a .NET-only concept. -* Simple data holder - -### MatchCollection (public) - -* Created by `Regex.Matches` -* Lazily provides `Match`es - -### GroupCollection (public) - -* Created by `Match.Groups` -* Lazily creates `Group`s - -### CaptureCollection (public) - -* Created by `Group.Captures` -* Lazily creates `Capture`s - -### RegexParseException (is a ArgumentException) - -* Thrown when pattern is invalid -* Contains `RegexParseError` - -### RegexMatchTimeoutException (public) - -* Thrown when timeout expires - -## Optimizations - -### Tree optimization - -* Every `RegexNode.AddChild()` calls `Reduce()` to attempt to optimize subtree as it is being assembled, and parsing ends with call to `RegexNode.FinalOptimize()` for some optimizations that require the entire tree. The goal is to make a functionally equivalent tree that can produce a more efficient program. With more detailed analysis of the tree and some creativity, more could be done here. 
- -### Testing character classes - -* Testing a character for membership of a character class can take a significant time in aggregate. Numerous optimizations have been made here. For example, originally it used a binary search, and now it attempts to use a bitmap where possible. More improvements here would likely be worthwhile. - -### Prefix matching - -* If the pattern begins with a literal, `FindFirstChar()` is used to run quickly to the next point in the text that matches that literal or character class, without using the engine. If the literal is a single character, this can use `IndexOf()` which is vectorized; otherwise it uses `RegexBoyerMoore`. Future optimizations could, for example, handle an alternation of leading literals using the Aho-Corasick algorithm; or use `IndexOf` to find a low-probability char before matching the whole literal. These optimizations are likely to most help in the case of a large text, perhaps with few matches, and a pattern with leading large literals or small character class. - -// TODO - more here - -More optimization opportunities are being tracked [in this issue](https://github.com/dotnet/runtime/issues/1349): you are welcome to offer more ideas, or contribute PR's. - -# Tracing and dumping output - -If the engine is built in debug configuration, and `RegexOptions.Debug` is passed, some internal datastructures will be written out with `Debug.Write()`. This includes the pattern itself, then `RegexWriter` will write out the input `RegexTree` with its nodes, and the output `RegexCode`. The `RegexBoyerMoore` dumps its tables - this would likely be relevant only if there was a bug in that class. `RegexRunner`s also dump their state as they execute the pattern. `Match` also has the ability to dump state. - -For example, if you are working to optimize the `RegexTree` generated from a pattern, this can be a convenient way to visualize the tree without concerning yourself with the subsequent execution. 
- -When you compile your test program, `RegexOptions.Debug` may not be visible to the compiler: you can use `(RegexOptions)0x0080` instead. - -# Debugging - -// TODO - -# Profiling and benchmarks - -// TODO - -# Test strategy - -// TODO diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs index a20c50984d82d4..703c43492043b0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs @@ -274,7 +274,7 @@ internal int MatchLength(int cap) /// Tidy the match so that it can be used as an immutable result internal void Tidy(int textpos, int beginningOfSpanSlice, RegexRunnerMode mode) { - Debug.Assert(mode != RegexRunnerMode.Existence); + Debug.Assert(mode != RegexRunnerMode.ExistenceRequired); int[] matchcount = _matchcount; _capcount = matchcount[0]; // used to indicate Success diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs index 4027e51d72891d..9d28c1675172c0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs @@ -77,7 +77,7 @@ public bool IsMatch(string input) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return RunSingleMatch(RegexRunnerMode.Existence, -1, input, 0, input.Length, RightToLeft ? input.Length : 0) is null; + return RunSingleMatch(RegexRunnerMode.ExistenceRequired, -1, input, 0, input.Length, RightToLeft ? input.Length : 0) is null; } /// @@ -87,7 +87,7 @@ public bool IsMatch(string input) /// if the regular expression finds a match; otherwise, . 
/// A time-out occurred. public bool IsMatch(ReadOnlySpan input) => - RunSingleMatch(RegexRunnerMode.Existence, -1, input, RightToLeft ? input.Length : 0).Success; + RunSingleMatch(RegexRunnerMode.ExistenceRequired, -1, input, RightToLeft ? input.Length : 0).Success; /// /// Searches the input string for one or more matches using the previous pattern and options, @@ -100,7 +100,7 @@ public bool IsMatch(string input, int startat) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return RunSingleMatch(RegexRunnerMode.Existence, -1, input, 0, input.Length, startat) is null; + return RunSingleMatch(RegexRunnerMode.ExistenceRequired, -1, input, 0, input.Length, startat) is null; } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 3ca623966766c6..304fb99a5ac733 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -401,7 +401,7 @@ protected void InitializeReferences() runner.runtextpos += bump; } - return ScanInternal(mode, reuseMatchObject: mode == RegexRunnerMode.Existence, input, beginning, runner, span, returnNullIfReuseMatchObject: true); + return ScanInternal(mode, reuseMatchObject: mode == RegexRunnerMode.ExistenceRequired, input, beginning, runner, span, returnNullIfReuseMatchObject: true); } finally { @@ -453,7 +453,7 @@ protected void InitializeReferences() Match match = runner.runmatch!; if (match.FoundMatch) { - if (mode == RegexRunnerMode.Existence) + if (mode == RegexRunnerMode.ExistenceRequired) { return (true, 0, 0, 0); } @@ -494,7 +494,7 @@ private void RunAllMatchesWithCallback(string? inputString, ReadOnlySpan runner.InitializeForScan(this, inputSpan, startat, mode); runner.runtextpos = runtextpos; - // We get the Match by calling Scan.
'input' parameter is used to set the Match text which is only relevante if we are using the Run string + // We get the Match by calling Scan. 'input' parameter is used to set the Match text which is only relevant if we are using the Run string // overload, as APIs that call the span overload (like Count) don't require match.Text to be set, so we pass null in that case. Match? match = ScanInternal(mode, reuseMatchObject, inputString, 0, runner, inputSpan, returnNullIfReuseMatchObject: false); Debug.Assert(match is not null); @@ -564,7 +564,7 @@ private void RunAllMatchesWithCallback(string? inputString, ReadOnlySpan Match? match = runner.runmatch; Debug.Assert(match is not null); - // If we got a match, do some cleanup and return it, or return null if quick is true; + // If we got a match, do some cleanup and return it, or return null if reuseMatchObject and returnNullIfReuseMatchObject are true. if (match.FoundMatch) { if (!reuseMatchObject) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 293f375767cfb5..9ea8bad5a51bc0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -70,21 +70,6 @@ public abstract class RegexRunner protected RegexRunner() { } - /// - /// Scans the string to find the first match. Uses the Match object - /// both to feed text in and as a place to store matches that come out. - /// - /// All the action is in the abstract Go() method defined by subclasses. Our - /// responsibility is to load up the class members (as done here) before - /// calling Go. 
- /// - /// The optimizer can compute a set of candidate starting characters, - /// and we could use a separate method Skip() that will quickly scan past - /// any characters that we know can't match. - /// - protected Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) => - Scan(regex, text, textbeg, textend, textstart, prevlen, quick, regex.MatchTimeout); - protected internal virtual void Scan(ReadOnlySpan text) { // This base implementation is overridden by all of the built-in engines and by all source-generated @@ -126,6 +111,11 @@ protected internal virtual void Scan(ReadOnlySpan text) InternalScan(runregex!, beginning, beginning + text.Length); } + // TODO https://github.com/dotnet/runtime/issues/62573: Obsolete this. + protected Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) => + Scan(regex, text, textbeg, textend, textstart, prevlen, quick, regex.MatchTimeout); + + // TODO https://github.com/dotnet/runtime/issues/62573: Obsolete this. /// /// This method's body is only kept since it is a protected member that could be called by someone outside /// the assembly. @@ -134,7 +124,7 @@ protected internal virtual void Scan(ReadOnlySpan text) { InitializeTimeout(timeout); - RegexRunnerMode mode = quick ? RegexRunnerMode.Existence : RegexRunnerMode.CapturesRequired; + RegexRunnerMode mode = quick ? 
RegexRunnerMode.ExistenceRequired : RegexRunnerMode.CapturesRequired; // We set runtext before calling InitializeForScan so that runmatch object is initialized with the text runtext = text; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs index 42f0bd2c51e2ca..d034bef03ffec0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs @@ -8,7 +8,7 @@ internal enum RegexRunnerMode { /// The runner need only determine whether the input has a match; no additional information is required. /// This mode is used by Regex.IsMatch. - Existence, + ExistenceRequired, /// The runner needs to determine the next location and length of a match in the input; no additional information is required. /// This mode is used by Regex.Count, Regex.EnumerateMatches, and Regex.Replace (when the replacement doesn't involve backreferences). diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicMatch.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicMatch.cs index 2dee258bcd65d4..f7ca11ca359fdd 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicMatch.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicMatch.cs @@ -9,7 +9,7 @@ internal readonly struct SymbolicMatch internal static SymbolicMatch NoMatch => new SymbolicMatch(-1, -1); /// Indicates a match was found but without meaningful details about where. 
- internal static SymbolicMatch QuickMatch => new SymbolicMatch(0, 0); + internal static SymbolicMatch MatchExists => new SymbolicMatch(0, 0); public SymbolicMatch(int index, int length, int[]? captureStarts = null, int[]? captureEnds = null) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 74e82e9fffa2fd..d14edf5e648be1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -349,11 +349,11 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i return SymbolicMatch.NoMatch; } - // A match exists. If we don't need further details, because IsMatch was used (and thus we don't + // A match exists. If we don't need further details, e.g. because IsMatch was used (and thus we don't // need the exact bounds of the match, captures, etc.), we're done. - if (mode == RegexRunnerMode.Existence) + if (mode == RegexRunnerMode.ExistenceRequired) { - return SymbolicMatch.QuickMatch; + return SymbolicMatch.MatchExists; } // Phase 2: @@ -538,7 +538,7 @@ private int FindEndPositionDeltas(SymbolicRegexBuilder buil endPos = pos; // A match is known to exist. If that's all we need to know, we're done. 
- if (mode == RegexRunnerMode.Existence) + if (mode == RegexRunnerMode.ExistenceRequired) { return 1; } From 03ae0f8caa6d129a32aa6a5805675baba6d4ae6b Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Wed, 20 Apr 2022 20:04:47 -0400 Subject: [PATCH 3/3] Address PR feedback --- .../src/System/Text/RegularExpressions/Match.cs | 4 ++-- .../src/System/Text/RegularExpressions/MatchCollection.cs | 2 +- .../src/System/Text/RegularExpressions/Regex.Match.cs | 6 +++--- .../src/System/Text/RegularExpressions/Regex.Replace.cs | 4 ++-- .../src/System/Text/RegularExpressions/Regex.Split.cs | 4 ++-- .../src/System/Text/RegularExpressions/Regex.cs | 8 ++++---- .../System/Text/RegularExpressions/RegexReplacement.cs | 4 ++-- .../src/System/Text/RegularExpressions/RegexRunner.cs | 2 +- .../src/System/Text/RegularExpressions/RegexRunnerMode.cs | 2 +- .../RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 2 +- .../Symbolic/SymbolicRegexRunnerFactory.cs | 2 +- 11 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs index 703c43492043b0..ca41f062605109 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs @@ -106,7 +106,7 @@ public Match NextMatch() Regex? r = _regex; Debug.Assert(Text != null); return r != null ? - r.RunSingleMatch(RegexRunnerMode.CapturesRequired, Length, Text, _textbeg, _textend - _textbeg, _textpos)! : + r.RunSingleMatch(RegexRunnerMode.FullMatchRequired, Length, Text, _textbeg, _textend - _textbeg, _textpos)! : this; } @@ -290,7 +290,7 @@ internal void Tidy(int textpos, int beginningOfSpanSlice, RegexRunnerMode mode) // In such a case, all offsets need to be shifted by beginning, e.g. 
if beginning is 5 and a capture occurred at // offset 17, that 17 offset needs to be increased to 22 to account for the fact that it was actually 17 from the // beginning, which the implementation saw as 0 but which from the caller's perspective was 5. - if (mode == RegexRunnerMode.CapturesRequired) + if (mode == RegexRunnerMode.FullMatchRequired) { if (_balancing) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/MatchCollection.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/MatchCollection.cs index 67475f5fe8c883..bd18aabbb61afb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/MatchCollection.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/MatchCollection.cs @@ -89,7 +89,7 @@ public virtual Match this[int i] Match match; do { - match = _regex.RunSingleMatch(RegexRunnerMode.CapturesRequired, _prevlen, _input, 0, _input.Length, _startat)!; + match = _regex.RunSingleMatch(RegexRunnerMode.FullMatchRequired, _prevlen, _input, 0, _input.Length, _startat)!; if (!match.Success) { _done = true; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs index 9d28c1675172c0..76f8ee6eaf5811 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs @@ -132,7 +132,7 @@ public Match Match(string input) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return RunSingleMatch(RegexRunnerMode.CapturesRequired, -1, input, 0, input.Length, RightToLeft ? input.Length : 0)!; + return RunSingleMatch(RegexRunnerMode.FullMatchRequired, -1, input, 0, input.Length, RightToLeft ? 
input.Length : 0)!; } /// @@ -146,7 +146,7 @@ public Match Match(string input, int startat) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return RunSingleMatch(RegexRunnerMode.CapturesRequired, -1, input, 0, input.Length, startat)!; + return RunSingleMatch(RegexRunnerMode.FullMatchRequired, -1, input, 0, input.Length, startat)!; } /// @@ -159,7 +159,7 @@ public Match Match(string input, int beginning, int length) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return RunSingleMatch(RegexRunnerMode.CapturesRequired, -1, input, beginning, length, RightToLeft ? beginning + length : beginning)!; + return RunSingleMatch(RegexRunnerMode.FullMatchRequired, -1, input, beginning, length, RightToLeft ? beginning + length : beginning)!; } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs index 2bae094f3e64ec..1b630f2daba048 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs @@ -182,7 +182,7 @@ private static string Replace(MatchEvaluator evaluator, Regex regex, string inpu state.prevat = match.Index + match.Length; state.segments.Add(state.evaluator(match).AsMemory()); return --state.count != 0; - }, RegexRunnerMode.CapturesRequired, reuseMatchObject: false); + }, RegexRunnerMode.FullMatchRequired, reuseMatchObject: false); if (state.segments.Count == 0) { @@ -201,7 +201,7 @@ private static string Replace(MatchEvaluator evaluator, Regex regex, string inpu state.prevat = match.Index; state.segments.Add(state.evaluator(match).AsMemory()); return --state.count != 0; - }, RegexRunnerMode.CapturesRequired, reuseMatchObject: false); + }, RegexRunnerMode.FullMatchRequired, reuseMatchObject: false); if 
(state.segments.Count == 0) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs index c6f5c0aea0009c..ecfa8b1b45eb6a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs @@ -105,7 +105,7 @@ private static string[] Split(Regex regex, string input, int count, int startat) } return --state.count != 0; - }, RegexRunnerMode.CapturesRequired, reuseMatchObject: true); + }, RegexRunnerMode.FullMatchRequired, reuseMatchObject: true); if (state.results.Count == 0) { @@ -133,7 +133,7 @@ private static string[] Split(Regex regex, string input, int count, int startat) } return --state.count != 0; - }, RegexRunnerMode.CapturesRequired, reuseMatchObject: true); + }, RegexRunnerMode.FullMatchRequired, reuseMatchObject: true); if (state.results.Count == 0) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 304fb99a5ac733..e118db2c7fa444 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -432,7 +432,7 @@ protected void InitializeReferences() { if (runner.runtextstart == 0) { - return (false, 0, 0, 0); + return (false, -1, -1, -1); } runner.runtextpos--; } @@ -440,7 +440,7 @@ protected void InitializeReferences() { if (runner.runtextstart == input.Length) { - return (false, 0, 0, 0); + return (false, -1, -1, -1); } runner.runtextpos++; } @@ -455,14 +455,14 @@ protected void InitializeReferences() { if (mode == RegexRunnerMode.ExistenceRequired) { - return (true, 0, 0, 0); + return 
(true, -1, -1, -1); } match.Tidy(runner.runtextpos, 0, mode); return (true, match.Index, match.Length, match._textpos); } - return (false, 0, 0, 0); + return (false, -1, -1, -1); } finally { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs index 72b8e5b4a96d18..9b35bafae58a68 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs @@ -221,7 +221,7 @@ public string Replace(Regex regex, string input, int count, int startat) state.prevat = match.Index + match.Length; state.thisRef.ReplacementImpl(ref state.segments, match); return --state.count != 0; - }, _hasBackreferences ? RegexRunnerMode.CapturesRequired : RegexRunnerMode.BoundsRequired, reuseMatchObject: true); + }, _hasBackreferences ? RegexRunnerMode.FullMatchRequired : RegexRunnerMode.BoundsRequired, reuseMatchObject: true); if (state.segments.Count == 0) { @@ -240,7 +240,7 @@ public string Replace(Regex regex, string input, int count, int startat) state.prevat = match.Index; state.thisRef.ReplacementImplRTL(ref state.segments, match); return --state.count != 0; - }, _hasBackreferences ? RegexRunnerMode.CapturesRequired : RegexRunnerMode.BoundsRequired, reuseMatchObject: true); + }, _hasBackreferences ? 
RegexRunnerMode.FullMatchRequired : RegexRunnerMode.BoundsRequired, reuseMatchObject: true); if (state.segments.Count == 0) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 9ea8bad5a51bc0..9e0341301896df 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -124,7 +124,7 @@ protected internal virtual void Scan(ReadOnlySpan text) { InitializeTimeout(timeout); - RegexRunnerMode mode = quick ? RegexRunnerMode.ExistenceRequired : RegexRunnerMode.CapturesRequired; + RegexRunnerMode mode = quick ? RegexRunnerMode.ExistenceRequired : RegexRunnerMode.FullMatchRequired; // We set runtext before calling InitializeForScan so that runmatch object is initialized with the text runtext = text; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs index d034bef03ffec0..a0e0ea00d0a0a3 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerMode.cs @@ -16,6 +16,6 @@ internal enum RegexRunnerMode /// The runner needs to determine the next location and length of a match in the input, as well as the full details on all captures. /// This mode is used by Regex.Match, Regex.Matches, Regex.Split, and Regex.Replace (when the replacement involves backreferences). 
- CapturesRequired, + FullMatchRequired, } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index d14edf5e648be1..5a65c8f5944a16 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -384,7 +384,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // (captures other than the top-level capture for the whole match), we need to do an additional pass to find their bounds. // Continuing for the previous example, phase 3 will be executed for the characters inside the match, aaabbbc, // and will find associate the one capture (b*) with it's match: bbb. - if (!HasSubcaptures || mode < RegexRunnerMode.CapturesRequired) + if (!HasSubcaptures || mode < RegexRunnerMode.FullMatchRequired) { return new SymbolicMatch(matchStart, matchEnd - matchStart); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index 620fa635bf79c9..61087e37dd4790 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -66,7 +66,7 @@ protected internal override void Scan(ReadOnlySpan text) // If we successfully matched, capture the match, and then jump the current position to the end of the match. 
int start = pos.Index; int end = start + pos.Length; - if (_mode == RegexRunnerMode.CapturesRequired && pos.CaptureStarts != null) + if (_mode == RegexRunnerMode.FullMatchRequired && pos.CaptureStarts != null) { Debug.Assert(pos.CaptureEnds != null); Debug.Assert(pos.CaptureStarts.Length == pos.CaptureEnds.Length);