From c1c0fa3987e89589eaed5168b4753f0e5f31e3a8 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Tue, 1 Feb 2022 15:12:53 -0800 Subject: [PATCH 1/6] Adding new API and ensuring new Scan is called during Regex.Run() with all tests passing. --- .../ref/System.Text.RegularExpressions.cs | 11 +++-- .../System/Text/RegularExpressions/Capture.cs | 11 +++++ .../Text/RegularExpressions/Regex.Match.cs | 40 +++++++++++++++++ .../System/Text/RegularExpressions/Regex.cs | 44 ++++++++++++++++++- .../Text/RegularExpressions/RegexRunner.cs | 18 ++++++-- 5 files changed, 117 insertions(+), 7 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs index 251d595081072d..8e688e348d0e2d 100644 --- a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs +++ b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs @@ -171,6 +171,10 @@ public static void CompileToAssembly(System.Text.RegularExpressions.RegexCompila public string GroupNameFromNumber(int i) { throw null; } public int GroupNumberFromName(string name) { throw null; } protected void InitializeReferences() { } + public bool IsMatch(System.ReadOnlySpan input) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string 
pattern, System.Text.RegularExpressions.RegexOptions options, System.TimeSpan matchTimeout) { throw null; } public bool IsMatch(string input) { throw null; } public bool IsMatch(string input, int startat) { throw null; } public static bool IsMatch(string input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; } @@ -330,9 +334,9 @@ protected void DoubleCrawl() { } protected void DoubleStack() { } protected void DoubleTrack() { } protected void EnsureStorage() { } - protected abstract bool FindFirstChar(); - protected abstract void Go(); - protected abstract void InitTrackCount(); + protected virtual bool FindFirstChar() { throw null; } + protected virtual void Go() { throw null; } + protected virtual void InitTrackCount() { throw null; } protected bool IsBoundary(int index, int startpos, int endpos) { throw null; } protected bool IsECMABoundary(int index, int startpos, int endpos) { throw null; } protected bool IsMatched(int cap) { throw null; } @@ -341,6 +345,7 @@ protected void EnsureStorage() { } protected int Popcrawl() { throw null; } protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) { throw null; } protected internal System.Text.RegularExpressions.Match? 
Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, System.TimeSpan timeout) { throw null; } + protected internal virtual void Scan(System.Text.RegularExpressions.Regex regex, System.ReadOnlySpan text, int textstart, int prevlen, bool quick, System.TimeSpan timeout) { throw null; } protected void TransferCapture(int capnum, int uncapnum, int start, int end) { } protected void Uncapture() { } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs index 81683e09bec1ff..abdb182135d87c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs @@ -19,6 +19,17 @@ internal Capture(string text, int index, int length) /// Returns the position in the original string where the first character of captured substring was found. public int Index { get; private protected set; } + /// + /// This method should only be called when the text for matching was sliced with a different beginning, so the resulting index of + /// the match is not from the start of the text, but instead the start of the slice. This method adds that offset back + /// to account for the original text beginning. + /// + /// The original text's beginning offset. + internal void AddBeginningToIndex(int beginning) + { + Index += beginning; + } + /// Returns the length of the captured substring. 
public int Length { get; private protected set; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs index cd0494c3dc0092..f2a058a4d70924 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs @@ -13,6 +13,15 @@ public partial class Regex public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern) => RegexCache.GetOrAdd(pattern).IsMatch(input); + /// + /// Searches the input span for one or more occurrences of the text supplied in the given pattern. + /// + /// The input span to be searched on. + /// The Regex pattern to be used for matching. + /// if the input matches the pattern, otherwise. + public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern) => + RegexCache.GetOrAdd(pattern).IsMatch(input); + /// /// Searches the input string for one or more occurrences of the text /// supplied in the pattern parameter with matching options supplied in the options @@ -21,9 +30,30 @@ public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Reg public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options) => RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout).IsMatch(input); + /// + /// Searches the input span for one or more occurrences of the text supplied in the given pattern. It uses the passed in options. + /// + /// The input span to be searched on. + /// The Regex pattern to be used for matching. + /// The options to be used for matching + /// if the input matches the pattern, otherwise. 
+ public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options) => + RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout).IsMatch(input); + public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options, TimeSpan matchTimeout) => RegexCache.GetOrAdd(pattern, options, matchTimeout).IsMatch(input); + /// + /// Searches the input span for one or more occurrences of the text supplied in the given pattern under the specified timeout. It uses the passed in options. + /// + /// The input span to be searched on. + /// The Regex pattern to be used for matching. + /// The options to be used for matching + /// Max time to be used for matching before returning. + /// if the input matches the pattern, otherwise. Also returns for time out. + public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options, TimeSpan matchTimeout) => + RegexCache.GetOrAdd(pattern, options, matchTimeout).IsMatch(input); + /// /// Searches the input string for one or more matches using the previous pattern, /// options, and starting position. @@ -38,6 +68,16 @@ public bool IsMatch(string input) return Run(quick: true, -1, input, 0, input.Length, UseOptionR() ? input.Length : 0) is null; } + /// + /// Searches the input span for one or more matches using the previous pattern, + /// options, and starting position. + /// + /// if the input matches the pattern, otherwise. + public bool IsMatch(ReadOnlySpan input) + { + return Run(quick: true, -1, input, 0, input.Length, UseOptionR() ? input.Length : 0) is null; + } + /// /// Searches the input string for one or more matches using the previous pattern and options, /// with a new starting position. 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index d07f3440de4b8a..0cc2bdaabd1c3d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -383,7 +383,49 @@ protected void InitializeReferences() RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { - return runner.Scan(this, input, beginning, beginning + length, startat, prevlen, quick, internalMatchTimeout); + runner.Scan(this, input.AsSpan(beginning, length), startat - beginning, prevlen, quick, internalMatchTimeout); + Match? m = runner.runmatch; + runner.runmatch = null; // Reset runmatch + if (m is not null) + { + if (!quick && m.Text != input) + { + m.Text = input; + } + + // If there was a match and the original text was sliced, then add beggining to the index to get the real + // Index of the match. + if (m.Success && beginning != 0) + { + m.AddBeginningToIndex(beginning); + } + } + return m; + } + finally + { + _runner = runner; + } + } + + internal Match? Run(bool quick, int prevlen, ReadOnlySpan input, int beginning, int length, int startat) + { + if ((uint)startat > (uint)input.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.startat, ExceptionResource.BeginIndexNotNegative); + } + if ((uint)length > (uint)input.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.length, ExceptionResource.LengthNotNegative); + } + + RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); + try + { + runner.Scan(this, input.Slice(beginning, length), startat - beginning, prevlen, quick, internalMatchTimeout); + Match? 
m = runner.runmatch; + runner.runmatch = null; // Reset runmatch + return m; } finally { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 13a9fbf155bb12..6a238d0afd7303 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -88,6 +88,18 @@ protected RegexRunner() { } protected Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) => Scan(regex, text, textbeg, textend, textstart, prevlen, quick, regex.MatchTimeout); + protected internal virtual void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick, TimeSpan timeout) + { + string? s = runtext; + if (text != s) + { + s = text.ToString(); + } + + Match? match = Scan(regex, s, 0, text.Length, textstart, prevlen, quick, timeout); + runmatch = match; + } + protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) { this.quick = quick; @@ -394,21 +406,21 @@ private void DoCheckTimeout() /// then to leave runtextpos at the ending position. It should leave /// runtextpos where it started if there was no match. /// - protected abstract void Go(); + protected virtual void Go() => throw new NotImplementedException(); /// /// The responsibility of FindFirstChar() is to advance runtextpos /// until it is at the next position which is a candidate for the /// beginning of a successful match. 
/// - protected abstract bool FindFirstChar(); + protected virtual bool FindFirstChar() => throw new NotImplementedException(); /// /// InitTrackCount must initialize the runtrackcount field; this is /// used to know how large the initial runtrack and runstack arrays /// must be. /// - protected abstract void InitTrackCount(); + protected virtual void InitTrackCount() => runtrackcount = 0; /// /// Initializes all the data members that are used by Go() From 2d3aef3209e9ffc7c87d3a44ed8357427ebabf22 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Tue, 1 Feb 2022 15:20:04 -0800 Subject: [PATCH 2/6] - Source generator emits new Scan method and switched Go() to return bool. - Added Scan overload to NonBacktracking - Added Scan overload to CompiledEngine which internally calls the generated Go and FFC dynamic methods. - Moved a lot of the initialization that happens as part of Scan to occur before Scan, in order to not require exposing additional APIs to create custom regexRunners - Added Scan overload to Interpreted engine - Moved logic that cleans up the Match object that used to happen at the end of Scan to now happen after Scan() is done. 
--- .../gen/RegexGenerator.Emitter.cs | 133 +++++++++++- .../RegularExpressions/CompiledRegexRunner.cs | 15 +- .../CompiledRegexRunnerFactory.cs | 14 +- .../System/Text/RegularExpressions/Regex.cs | 19 +- .../Text/RegularExpressions/RegexCompiler.cs | 190 +++++++++++++++++- .../RegularExpressions/RegexInterpreter.cs | 67 +++++- .../RegularExpressions/RegexLWCGCompiler.cs | 15 +- .../Text/RegularExpressions/RegexRunner.cs | 87 ++++---- .../Symbolic/SymbolicRegexRunnerFactory.cs | 11 +- 9 files changed, 444 insertions(+), 107 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 1f2a5d7de00a03..d0dd2d8dd25909 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -222,7 +222,15 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri DescribeExpression(writer, rm.Code.Tree.Root.Child(0), " // ", rm.Code); // skip implicit root capture writer.WriteLine(); - writer.WriteLine($" protected override bool FindFirstChar()"); + writer.WriteLine($" protected override void Scan(global::System.Text.RegularExpressions.Regex regex, global::System.ReadOnlySpan text, int textstart, int prevlen, bool quick, global::System.TimeSpan timeout)"); + writer.WriteLine($" {{"); + writer.Indent += 4; + EmitScan(writer, rm, id); + writer.Indent -= 4; + writer.WriteLine($" }}"); + writer.WriteLine(); + + writer.WriteLine($" private bool FindFirstChar(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; RequiredHelperFunctions requiredHelpers = EmitFindFirstChar(writer, rm, id); @@ -233,7 +241,7 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri { writer.WriteLine($" [global::System.Runtime.CompilerServices.SkipLocalsInit]"); } - writer.WriteLine($" protected override void Go()"); + 
writer.WriteLine($" private bool Go(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; requiredHelpers |= EmitGo(writer, rm, id); @@ -299,6 +307,115 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht) } } + private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string id) + { + RegexOptions options = (RegexOptions)rm.Options; + + // Emit locals initialization + //writer.WriteLine("this.quick = quick;"); + //writer.WriteLine("base.runtextpos = textstart;"); + //writer.WriteLine("base.runregex = regex;"); + //writer.WriteLine("base.runtextstart = textstart;"); + //writer.WriteLine("base.runtextbeg = textbeg;"); + //writer.WriteLine("base.runtextend = textend;"); + //writer.WriteLine("bool initialized = false;"); + //writer.WriteLine(); + + //EmitTimeoutHandling(); + //writer.WriteLine(); + + // Source generator doesn't support Right-To-Left so there is no need to add the special bump logic. + // If Right-to-left is ever added to the source generator, then we would need to add the logic to define + // bump, as well as stoppos + Debug.Assert((options & RegexOptions.RightToLeft) == 0); + + EmitPrevLenCheck(); + writer.WriteLine(); + + EmitMainSearchLoop(); + writer.WriteLine(); + + return; + +#pragma warning disable CS8321 // Local function is declared but never used + void EmitTimeoutHandling() + { + writer.WriteLine("// Handle timeout argument"); + writer.WriteLine("_timeout = -1;"); + writer.WriteLine("bool ignoreTimeout = global::System.Text.RegularExpressions.Regex.InfiniteMatchTimeout == timeout;"); + using (EmitBlock(writer, "if (!ignoreTimeout)")) + { + writer.WriteLine("// We are using Environment.TickCount and not Stopwatch for performance reasons."); + writer.WriteLine("// Environment.TickCount is an int that cycles. 
We intentionally let timeoutOccursAt"); + writer.WriteLine("// overflow it will still stay ahead of Environment.TickCount for comparisons made"); + writer.WriteLine("// in DoCheckTimeout()."); + writer.WriteLine("global::System.Text.RegularExpressions.Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected"); + writer.WriteLine("_timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round;"); + writer.WriteLine("_timeoutOccursAt = global::System.Environment.TickCount + _timeout;"); + writer.WriteLine("_timeoutChecksToSkip = TimeoutCheckFrequency;"); + } + } +#pragma warning restore CS8321 // Local function is declared but never used + + void EmitPrevLenCheck() + { + writer.WriteLine("// If previous match was empty or failed, advance by one before matching."); + using (EmitBlock(writer, "if (prevlen == 0)")) + { + using (EmitBlock(writer, "if (textstart == text.Length)")) + { + writer.WriteLine("base.runmatch = global::System.Text.RegularExpressions.Match.Empty;"); + writer.WriteLine("return;"); + } + writer.WriteLine(); + writer.WriteLine("base.runtextpos++;"); + } + } + + void EmitMainSearchLoop() + { + using (EmitBlock(writer, "while (true)")) + { + using (EmitBlock(writer, "if (FindFirstChar(text))")) + { + writer.WriteLine("base.CheckTimeout();"); + writer.WriteLine(); + + writer.WriteLine("// If we got a match, we're done."); + using (EmitBlock(writer, "if (Go(text))")) + { + using (EmitBlock(writer, "if (quick)")) + { + writer.WriteLine("base.runmatch = null;"); + writer.WriteLine("return;"); + } + writer.WriteLine(); + + writer.WriteLine("// base.runmatch!.Tidy(base.runtextpos);"); + writer.WriteLine("return;"); + } + writer.WriteLine(); + + writer.WriteLine("// Reset state for another iteration."); + writer.WriteLine("base.runtrackpos = base.runtrack!.Length;"); + writer.WriteLine("base.runstackpos = base.runstack!.Length;"); + writer.WriteLine("base.runcrawlpos = base.runcrawl!.Length;"); + } + 
writer.WriteLine(); + + writer.WriteLine("// We failed to find a match. If we're at the end of the input, then we are done."); + using (EmitBlock(writer, "if (base.runtextpos == text.Length)")) + { + writer.WriteLine("base.runmatch = global::System.Text.RegularExpressions.Match.Empty;"); + writer.WriteLine("return;"); + } + writer.WriteLine(); + + writer.WriteLine("base.runtextpos++;"); + } + } + } + /// Emits the body of the FindFirstChar override. private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id) { @@ -347,7 +464,6 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ { case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitIndexOf(code.FindOptimizations.LeadingCaseSensitivePrefix); break; @@ -356,13 +472,11 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitFixedSet(); break; case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight_CaseSensitive: Debug.Assert(code.FindOptimizations.LiteralAfterLoop is not null); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitLiteralAfterAtomicLoop(); break; @@ -463,7 +577,6 @@ bool EmitAnchors() // the other anchors, which all skip all subsequent processing if found, with BOL we just use it // to boost our position to the next line, and then continue normally with any searches. 
writer.WriteLine("// Beginning-of-line anchor"); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); additionalDeclarations.Add("int beginning = base.runtextbeg;"); using (EmitBlock(writer, "if (pos > beginning && inputSpan[pos - 1] != '\\n')")) { @@ -763,6 +876,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe writer.WriteLine($"int end = start + {(node.Kind == RegexNodeKind.Multi ? node.Str!.Length : 1)};"); writer.WriteLine("base.Capture(0, start, end);"); writer.WriteLine("base.runtextpos = end;"); + writer.WriteLine("return true;"); return requiredHelpers; case RegexNodeKind.Empty: @@ -770,6 +884,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // source generator and seeing what happens as you add more to expressions. When approaching // it from a learning perspective, this is very common, as it's the empty string you start with. writer.WriteLine("base.Capture(0, base.runtextpos, base.runtextpos);"); + writer.WriteLine("return true;"); return requiredHelpers; } @@ -781,7 +896,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // Declare some locals. 
string sliceSpan = "slice"; - writer.WriteLine("global::System.ReadOnlySpan inputSpan = base.runtext;"); writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;"); writer.WriteLine($"int original_pos = pos;"); bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm); @@ -826,7 +940,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe } writer.WriteLine("base.runtextpos = pos;"); writer.WriteLine("base.Capture(0, original_pos, pos);"); - writer.WriteLine("return;"); + writer.WriteLine("return true;"); writer.WriteLine(); // We only get here in the code if the whole expression fails to match and jumps to @@ -837,6 +951,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe { EmitUncaptureUntil("0"); } + writer.WriteLine("return false;"); // We're done with the match. @@ -846,8 +961,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // And emit any required helpers. if (additionalLocalFunctions.Count != 0) { - writer.WriteLine("return;"); // not strictly necessary, just for readability - foreach (KeyValuePair localFunctions in additionalLocalFunctions.OrderBy(k => k.Key)) { writer.WriteLine(); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs index c9e6419fed87cf..4711932555bc7c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs @@ -5,19 +5,18 @@ namespace System.Text.RegularExpressions { internal sealed class CompiledRegexRunner : RegexRunner { - private readonly Action _goMethod; - private readonly Func _findFirstCharMethod; + private readonly ScanDelegate _scanMethod; - public CompiledRegexRunner(Action go, Func 
findFirstChar, int trackCount) + internal delegate void ScanDelegate(RegexRunner runner, Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick, TimeSpan timeout); + + public CompiledRegexRunner(ScanDelegate scan, int trackCount) { - _goMethod = go; - _findFirstCharMethod = findFirstChar; + _scanMethod = scan; runtrackcount = trackCount; } - protected override void Go() => _goMethod(this); - - protected override bool FindFirstChar() => _findFirstCharMethod(this); + protected internal override void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick, TimeSpan timeout) + => _scanMethod(this, regex, text, textstart, prevlen, quick, timeout); protected override void InitTrackCount() { } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs index 958d5cf3dc16f1..6575a3856e97d9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs @@ -7,25 +7,21 @@ namespace System.Text.RegularExpressions { internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory { - private readonly DynamicMethod _goMethod; - private readonly DynamicMethod _findFirstCharMethod; + private readonly DynamicMethod _scanMethod; private readonly int _trackcount; // Delegates are lazily created to avoid forcing JIT'ing until the regex is actually executed. - private Action? _go; - private Func? _findFirstChar; + private CompiledRegexRunner.ScanDelegate? 
_scan; - public CompiledRegexRunnerFactory(DynamicMethod goMethod, DynamicMethod findFirstCharMethod, int trackcount) + public CompiledRegexRunnerFactory(DynamicMethod scanMethod, int trackcount) { - _goMethod = goMethod; - _findFirstCharMethod = findFirstCharMethod; + _scanMethod = scanMethod; _trackcount = trackcount; } protected internal override RegexRunner CreateInstance() => new CompiledRegexRunner( - _go ??= _goMethod.CreateDelegate>(), - _findFirstChar ??= _findFirstCharMethod.CreateDelegate>(), + _scan ??= _scanMethod.CreateDelegate(), _trackcount); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 0cc2bdaabd1c3d..52983d3ade42da 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -383,14 +383,21 @@ protected void InitializeReferences() RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { - runner.Scan(this, input.AsSpan(beginning, length), startat - beginning, prevlen, quick, internalMatchTimeout); + runner.InitializeTimeout(internalMatchTimeout); + ReadOnlySpan span = input.AsSpan(beginning, length); + runner.InitializeForScan(this, span, 0, span.Length, startat - beginning, quick); + runner.InitializeForGo(); + runner.Scan(this, span, startat - beginning, prevlen, quick, internalMatchTimeout); Match? 
m = runner.runmatch; runner.runmatch = null; // Reset runmatch if (m is not null) { - if (!quick && m.Text != input) + if (!quick) { - m.Text = input; + if (m._matchcount[0] > 0) + m.Tidy(runner.runtextpos); + if (m.Text != input) + m.Text = input; } // If there was a match and the original text was sliced, then add beggining to the index to get the real @@ -422,7 +429,11 @@ protected void InitializeReferences() RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { - runner.Scan(this, input.Slice(beginning, length), startat - beginning, prevlen, quick, internalMatchTimeout); + runner.InitializeTimeout(internalMatchTimeout); + ReadOnlySpan span = input.Slice(beginning, length); + runner.InitializeForScan(this, span, 0, span.Length, startat - beginning, quick); + runner.InitializeForGo(); + runner.Scan(this, span, startat - beginning, prevlen, quick, internalMatchTimeout); Match? m = runner.runmatch; runner.runmatch = null; // Reset runmatch return m; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 96ca0512634625..cf2167e07c1c88 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -20,8 +20,15 @@ internal abstract class RegexCompiler private static readonly FieldInfo s_runtextendField = RegexRunnerField("runtextend"); private static readonly FieldInfo s_runtextstartField = RegexRunnerField("runtextstart"); private static readonly FieldInfo s_runtextposField = RegexRunnerField("runtextpos"); - private static readonly FieldInfo s_runtextField = RegexRunnerField("runtext"); private static readonly FieldInfo s_runstackField = RegexRunnerField("runstack"); + private static readonly FieldInfo s_runmatchField = 
RegexRunnerField("runmatch"); + private static readonly FieldInfo s_runtrackField = RegexRunnerField("runtrack"); + private static readonly FieldInfo s_runtrackposField = RegexRunnerField("runtrackpos"); + private static readonly FieldInfo s_runstackposField = RegexRunnerField("runstackpos"); + private static readonly FieldInfo s_runcrawlField = RegexRunnerField("runcrawl"); + private static readonly FieldInfo s_runcrawlposField = RegexRunnerField("runcrawlpos"); + + private static readonly FieldInfo s_matchMatchCountField = typeof(Match).GetField("_matchcount", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance | BindingFlags.Static)!; private static readonly MethodInfo s_captureMethod = RegexRunnerMethod("Capture"); private static readonly MethodInfo s_transferCaptureMethod = RegexRunnerMethod("TransferCapture"); @@ -42,8 +49,10 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!; private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!; private static readonly MethodInfo s_cultureInfoGetTextInfoMethod = typeof(CultureInfo).GetMethod("get_TextInfo")!; + private static readonly MethodInfo s_regexGetRightToLeft = typeof(Regex).GetMethod("get_RightToLeft")!; private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan).GetMethod("get_Item", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan).GetMethod("get_Length")!; + private static readonly MethodInfo s_matchGetEmptyMethod = typeof(Match).GetMethod("get_Empty")!; private static readonly MethodInfo s_memoryMarshalGetReference = typeof(MemoryMarshal).GetMethod("GetReference", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly 
MethodInfo s_spanIndexOfChar = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfSpan = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); @@ -180,6 +189,9 @@ internal abstract class RegexCompiler /// A macro for _ilg.Emit(OpCodes.Ldarg_0). protected void Ldthis() => _ilg!.Emit(OpCodes.Ldarg_0); + /// A macro for _ilgEmit(OpCodes.Ldarg_1) + private void Ldarg_1() => _ilg!.Emit(OpCodes.Ldarg_1); + /// A macro for Ldthis(); Ldfld(); protected void Ldthisfld(FieldInfo ft) { @@ -271,6 +283,9 @@ private void Mvfldloc(FieldInfo ft, LocalBuilder lt) private void Switch(Label[] table) => _ilg!.Emit(OpCodes.Switch, table); + /// Declares a local bool. + private LocalBuilder DeclareBool() => _ilg!.DeclareLocal(typeof(bool)); + /// Declares a local int. private LocalBuilder DeclareInt32() => _ilg!.DeclareLocal(typeof(int)); @@ -388,11 +403,10 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or // Load necessary locals // int pos = base.runtextpos; // int end = base.runtextend; - // ReadOnlySpan inputSpan = base.runtext.AsSpan(); + // ReadOnlySpan inputSpan = input; Mvfldloc(s_runtextposField, pos); Mvfldloc(s_runtextendField, end); - Ldthisfld(s_runtextField); - Call(s_stringAsSpanMethod); + Ldarg_1(); Stloc(inputSpan); // Generate length check. If the input isn't long enough to possibly match, fail quickly. 
@@ -1098,10 +1112,9 @@ protected void EmitGo() // CultureInfo culture = CultureInfo.CurrentCulture; // only if the whole expression or any subportion is ignoring case, and we're not using invariant InitializeCultureForGoIfNecessary(); - // ReadOnlySpan inputSpan = base.runtext.AsSpan(); + // ReadOnlySpan inputSpan = input; // int end = base.runtextend; - Ldthisfld(s_runtextField); - Call(s_stringAsSpanMethod); + Ldarg_1(); Stloc(inputSpan); Mvfldloc(s_runtextendField, end); @@ -3925,6 +3938,169 @@ void EmitStackPop() } } + protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMethod) + { + LocalBuilder bump = DeclareInt32(); + LocalBuilder stoppos = DeclareInt32(); + + Label notRightToLeft = DefineLabel(); + // int bump = 1 + Ldc(1); + Stloc(bump); + + // int stoppos = text.Length (get_Length is an instance call on the span argument, so its address is loaded) + _ilg!.Emit(OpCodes.Ldarga_S, 2); + Call(s_spanGetLengthMethod); + Stloc(stoppos); + + // if (regex.RightToLeft) + // { + Ldarg_1(); + Callvirt(s_regexGetRightToLeft); + BrfalseFar(notRightToLeft); + + // bump = -1; + // stoppos = 0; + // } + Ldc(-1); + Stloc(bump); + Ldc(0); + Stloc(stoppos); + MarkLabel(notRightToLeft); + + // if (prevlen == 0) -- prevlen is a by-value int argument: load its value (Ldarg_S), not its address (Ldarga_S), or the comparison with 0 never succeeds + // { + _ilg!.Emit(OpCodes.Ldarg_S, 4); + Ldc(0); + Ceq(); + Label prevelenIsNotZero = DefineLabel(); + BrfalseFar(prevelenIsNotZero); + + // if (textstart == stoppos) + // { + _ilg!.Emit(OpCodes.Ldarg_3); + Ldloc(stoppos); + Ceq(); + Label textstartNotEqualToStoppos = DefineLabel(); + BrfalseFar(textstartNotEqualToStoppos); + + // runmatch = Match.Empty; + // return; + Ldthis(); + Call(s_matchGetEmptyMethod); + Stfld(s_runmatchField); + Label returnLabel = DefineLabel(); + BrFar(returnLabel); + MarkLabel(textstartNotEqualToStoppos); + + // runtextpos += bump; (advance in the scan direction so a right-to-left scan moves backward) + Ldthis(); + Ldthisfld(s_runtextposField); + Ldloc(bump); + Add(); + Stfld(s_runtextposField); + MarkLabel(prevelenIsNotZero); + + // while (true) + Label whileLoopEnd = DefineLabel(); + Label whileLoopBody = DefineLabel(); + MarkLabel(whileLoopBody); + + // if
(FindFirstChar(text)) + Ldthis(); + _ilg!.Emit(OpCodes.Ldarg_2); + Call(findFirstCharMethod); + Label afterFindFirstCharLabel = DefineLabel(); + BrfalseFar(afterFindFirstCharLabel); + + // CheckTimeout(); + Ldthis(); + Call(s_checkTimeoutMethod); + + // Go(text); + Ldthis(); + _ilg!.Emit(OpCodes.Ldarg_2); + Call(goMethod); + + // if (runmatch!._matchcount[0] > 0) + Ldthisfld(s_runmatchField); + _ilg!.Emit(OpCodes.Ldfld, s_matchMatchCountField); + Ldc(0); + LdelemI4(); + Ldc(0); + _ilg!.Emit(OpCodes.Cgt); + Label afterSuccessMatchLabel = DefineLabel(); + BrfalseFar(afterSuccessMatchLabel); + + // if (quick) + _ilg!.Emit(OpCodes.Ldarg_S, 5); + Label afterQuickCheckLabel = DefineLabel(); + BrfalseFar(afterQuickCheckLabel); + + // runmatch = null; + Ldthis(); + _ilg!.Emit(OpCodes.Ldnull); + Stfld(s_runmatchField); + + // return; + MarkLabel(afterQuickCheckLabel); + BrFar(returnLabel); + + // Reset state for another iteration: runtrackpos = runtrack!.Length; + MarkLabel(afterSuccessMatchLabel); + Ldthis(); + Ldthisfld(s_runtrackField); + Ldlen(); + _ilg!.Emit(OpCodes.Conv_I4); + Stfld(s_runtrackposField); + + // runstackpos = runstack!.Length; + Ldthis(); + Ldthisfld(s_runstackField); + Ldlen(); + _ilg!.Emit(OpCodes.Conv_I4); + Stfld(s_runstackposField); + + // runcrawlpos = runcrawl!.Length; + Ldthis(); + Ldthisfld(s_runcrawlField); + Ldlen(); + _ilg!.Emit(OpCodes.Conv_I4); + Stfld(s_runcrawlposField); + + // if (runtextpos == stoppos) + MarkLabel(afterFindFirstCharLabel); + Ldthisfld(s_runtextposField); + Ldloc(stoppos); + Ceq(); + Label incrementRuntextPosLabel = DefineLabel(); + BrfalseFar(incrementRuntextPosLabel); + + // runmatch = Match.Empty; + // return; + Ldthis(); + Call(s_matchGetEmptyMethod); + Stfld(s_runmatchField); + BrFar(returnLabel); + + // runtextpos += bump + MarkLabel(incrementRuntextPosLabel); + Ldthis(); + Ldthisfld(s_runtextposField); + Ldloc(bump); + Add(); + Stfld(s_runtextposField); + + // End loop body.
+ BrFar(whileLoopBody); + MarkLabel(whileLoopEnd); + + // return; + MarkLabel(returnLabel); + _ilg!.Emit(OpCodes.Nop); + Ret(); + } + private void InitializeCultureForGoIfNecessary() { _textInfo = null; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 9f615754a808fd..712e8e512360b4 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -324,15 +324,74 @@ private bool MatchRef(int index, int length, ReadOnlySpan inputSpan) private void Backwardnext() => runtextpos += _rightToLeft ? 1 : -1; - protected override bool FindFirstChar() => - _code.FindOptimizations.TryFindNextStartingPosition(runtext!, ref runtextpos, runtextbeg, runtextstart, runtextend); + protected internal override void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick, TimeSpan timeout) + { + // Configure the additional value to "bump" the position along each time we loop around + // to call FindFirstChar again, as well as the stopping position for the loop. We generally + // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump + // by -1 and stop at textbeg. + int bump = 1, stoppos = text.Length; + if (regex.RightToLeft) + { + bump = -1; + stoppos = 0; + } + + // If previous match was empty or failed, advance by one (in the scan direction) before matching. + if (prevlen == 0) + { + if (textstart == stoppos) + { + runmatch = Match.Empty; + return; + } + + runtextpos += bump; + } + + while (true) + { + if (FindFirstChar(text)) + { + CheckTimeout(); + + Go(text); + + // If we got a match, we're done. + if (runmatch!._matchcount[0] > 0) + { + if (quick) + { + runmatch = null; + } + + return; + } + + // Reset state for another iteration.
+ runtrackpos = runtrack!.Length; + runstackpos = runstack!.Length; + runcrawlpos = runcrawl!.Length; + } + + if (runtextpos == stoppos) + { + runmatch = Match.Empty; + return; + } + + runtextpos += bump; + } + } + + private bool FindFirstChar(ReadOnlySpan inputSpan) => + _code.FindOptimizations.TryFindNextStartingPosition(inputSpan, ref runtextpos, runtextbeg, runtextstart, runtextend); - protected override void Go() + private void Go(ReadOnlySpan inputSpan) { SetOperator((RegexOpcode)_code.Codes[0]); _codepos = 0; int advance = -1; - ReadOnlySpan inputSpan = runtext; while (true) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index 34b7f1b1130592..060750e23d58df 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -24,7 +24,7 @@ internal sealed class RegexLWCGCompiler : RegexCompiler private static readonly bool s_includePatternInName = Environment.GetEnvironmentVariable(IncludePatternInNamesEnvVar) == "1"; /// Parameter types for the generated Go and FindFirstChar methods. - private static readonly Type[] s_paramTypes = new Type[] { typeof(RegexRunner) }; + private static readonly Type[] s_paramTypes = new Type[] { typeof(RegexRunner), typeof(ReadOnlySpan) }; /// Id number to use for the next compiled regex. private static int s_regexCount; @@ -52,17 +52,20 @@ internal sealed class RegexLWCGCompiler : RegexCompiler description = string.Concat("_", pattern.Length > DescriptionLimit ? 
pattern.AsSpan(0, DescriptionLimit) : pattern); } - DynamicMethod findFirstCharMethod = DefineDynamicMethod($"Regex{regexNum}_FindFirstChar{description}", typeof(bool), typeof(CompiledRegexRunner)); + DynamicMethod findFirstCharMethod = DefineDynamicMethod($"Regex{regexNum}_FindFirstChar{description}", typeof(bool), typeof(CompiledRegexRunner), s_paramTypes); EmitFindFirstChar(); - DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", null, typeof(CompiledRegexRunner)); + DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", null, typeof(CompiledRegexRunner), s_paramTypes); EmitGo(); - return new CompiledRegexRunnerFactory(goMethod, findFirstCharMethod, code.TrackCount); + DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(Regex), typeof(ReadOnlySpan), typeof(int), typeof(int), typeof(bool), typeof(TimeSpan) }); + EmitScan(findFirstCharMethod, goMethod); + + return new CompiledRegexRunnerFactory(scanMethod, code.TrackCount); } /// Begins the definition of a new method (no args) with a specified return value. - private DynamicMethod DefineDynamicMethod(string methname, Type? returntype, Type hostType) + private DynamicMethod DefineDynamicMethod(string methname, Type? returntype, Type hostType, Type[] paramTypes) { // We're claiming that these are static methods, but really they are instance methods. // By giving them a parameter which represents "this", we're tricking them into @@ -71,7 +74,7 @@ private DynamicMethod DefineDynamicMethod(string methname, Type? 
returntype, Typ const MethodAttributes Attribs = MethodAttributes.Public | MethodAttributes.Static; const CallingConventions Conventions = CallingConventions.Standard; - var dm = new DynamicMethod(methname, Attribs, Conventions, returntype, s_paramTypes, hostType, skipVisibility: false); + var dm = new DynamicMethod(methname, Attribs, Conventions, returntype, paramTypes, hostType, skipVisibility: false); _ilg = dm.GetILGenerator(); return dm; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 6a238d0afd7303..e8ea102b0421fe 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -104,21 +104,6 @@ protected internal virtual void Scan(Regex regex, ReadOnlySpan text, int t { this.quick = quick; - // Handle timeout argument - _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds - bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout; - if (!ignoreTimeout) - { - // We are using Environment.TickCount and not Stopwatch for performance reasons. - // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt - // overflow it will still stay ahead of Environment.TickCount for comparisons made - // in DoCheckTimeout(). - Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected - _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; - _timeoutOccursAt = Environment.TickCount + _timeout; - _timeoutChecksToSkip = TimeoutCheckFrequency; - } - // Configure the additional value to "bump" the position along each time we loop around // to call FindFirstChar again, as well as the stopping position for the loop. 
We generally // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump @@ -130,10 +115,6 @@ protected internal virtual void Scan(Regex regex, ReadOnlySpan text, int t stoppos = textbeg; } - // Store runtextpos into field, as we may bump it in next check. The remaining arguments - // are stored below once we're past the potential return in the next check. - runtextpos = textstart; - // If previous match was empty or failed, advance by one before matching. if (prevlen == 0) { @@ -145,16 +126,6 @@ protected internal virtual void Scan(Regex regex, ReadOnlySpan text, int t runtextpos += bump; } - // Store remaining arguments into fields now that we're going to start the scan. - // These are referenced by the derived runner. - runregex = regex; - runtext = text; - runtextstart = textstart; - runtextbeg = textbeg; - runtextend = textend; - - // Main loop: FindFirstChar/Go + bump until the ending position. - bool initialized = false; while (true) { // Find the next potential location for a match in the input. @@ -163,18 +134,7 @@ protected internal virtual void Scan(Regex regex, ReadOnlySpan text, int t #endif if (FindFirstChar()) { - if (!ignoreTimeout) - { - DoCheckTimeout(); - } - - // Ensure that the runner is initialized. This includes initializing all of the state in the runner - // that Go might use, such as the backtracking stack, as well as a Match object for it to populate. - if (!initialized) - { - InitializeForGo(); - initialized = true; - } + CheckTimeout(); // See if there's a match at this position. #if DEBUG @@ -219,6 +179,37 @@ protected internal virtual void Scan(Regex regex, ReadOnlySpan text, int t } } + internal void InitializeForScan(Regex regex, ReadOnlySpan text, int textbeg, int textend, int textstart, bool quick) + { + this.quick = quick; + // Store remaining arguments into fields now that we're going to start the scan. + // These are referenced by the derived runner. 
+ runregex = regex; + //runtext = text; + runtextstart = textstart; + runtextbeg = textbeg; + runtextend = textend; + runtextpos = textstart; + } + + internal void InitializeTimeout(TimeSpan timeout) + { + // Handle timeout argument + _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds + bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout; + if (!ignoreTimeout) + { + // We are using Environment.TickCount and not Stopwatch for performance reasons. + // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt + // overflow it will still stay ahead of Environment.TickCount for comparisons made + // in DoCheckTimeout(). + Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected + _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; + _timeoutOccursAt = Environment.TickCount + _timeout; + _timeoutChecksToSkip = TimeoutCheckFrequency; + } + } + /// Enumerates all of the matches with the specified regex, invoking the callback for each. /// /// This optionally repeatedly hands out the same Match instance, updated with new information. @@ -262,7 +253,6 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref runtextbeg = 0; // Main loop: FindFirstChar/Go + bump until the ending position. - bool initialized = false; while (true) { // Find the next potential location for a match in the input. @@ -276,14 +266,6 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref DoCheckTimeout(); } - // Ensure that the runner is initialized. This includes initializing all of the state in the runner - // that Go might use, such as the backtracking stack, as well as a Match object for it to populate. 
- if (!initialized) - { - InitializeForGo(); - initialized = true; - } - #if DEBUG Debug.WriteLineIf(Regex.EnableDebugTracing, $"Calling Go at {nameof(runtextpos)}={runtextpos}"); #endif @@ -303,7 +285,6 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref runmatch = null; } match.Tidy(runtextpos); - initialized = false; if (!callback(ref state, match)) { // If the callback returns false, we're done. @@ -371,7 +352,7 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref } } - protected void CheckTimeout() + protected internal void CheckTimeout() { if (_ignoreTimeout) return; @@ -425,7 +406,7 @@ private void DoCheckTimeout() /// /// Initializes all the data members that are used by Go() /// - private void InitializeForGo() + internal void InitializeForGo() { if (runmatch is null) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index 097a67ccd00326..dd2050848d6925 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -83,14 +83,13 @@ private sealed class Runner : RegexRunner where TSetType : notnull internal Runner(SymbolicRegexMatcher matcher) => _matcher = matcher; - protected override void InitTrackCount() { } // nop, no backtracking - - protected override bool FindFirstChar() => true; // The logic is all in Go. - - protected override void Go() + protected internal override void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick, TimeSpan timeout) { - ReadOnlySpan inputSpan = runtext; + Go(text); + } + private void Go(ReadOnlySpan inputSpan) + { // Perform the match. 
SymbolicMatch pos = _matcher.FindMatch(quick, inputSpan, runtextpos, runtextend); if (pos.Success) From 2960e9935de3b9ae8b9895f20a1d6fb77fc5a11d Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Fri, 4 Feb 2022 09:56:40 -0800 Subject: [PATCH 3/6] Remove timeout from Scan, as well as fix most of the test issues --- .../gen/RegexGenerator.Emitter.cs | 4 +- .../ref/System.Text.RegularExpressions.cs | 4 +- .../System/Text/RegularExpressions/Capture.cs | 12 +-- .../RegularExpressions/CompiledRegexRunner.cs | 6 +- .../System/Text/RegularExpressions/Group.cs | 2 +- .../System/Text/RegularExpressions/Match.cs | 7 +- .../System/Text/RegularExpressions/Regex.cs | 82 ++++++++++++++++++- .../Text/RegularExpressions/RegexCompiler.cs | 20 +++-- .../RegularExpressions/RegexInterpreter.cs | 12 +-- .../RegularExpressions/RegexLWCGCompiler.cs | 2 +- .../Text/RegularExpressions/RegexRunner.cs | 47 ++++++++--- .../Symbolic/SymbolicRegexRunnerFactory.cs | 35 +++++++- 12 files changed, 185 insertions(+), 48 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index d0dd2d8dd25909..ad8db87de00849 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -222,7 +222,7 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri DescribeExpression(writer, rm.Code.Tree.Root.Child(0), " // ", rm.Code); // skip implicit root capture writer.WriteLine(); - writer.WriteLine($" protected override void Scan(global::System.Text.RegularExpressions.Regex regex, global::System.ReadOnlySpan text, int textstart, int prevlen, bool quick, global::System.TimeSpan timeout)"); + writer.WriteLine($" protected override void Scan(global::System.Text.RegularExpressions.Regex regex, global::System.ReadOnlySpan text, int textstart, int prevlen, bool quick)"); 
writer.WriteLine($" {{"); writer.Indent += 4; EmitScan(writer, rm, id); @@ -2261,7 +2261,7 @@ void EmitBoundary(RegexNode node) _ => "base.IsECMABoundary", }; - using (EmitBlock(writer, $"if ({call}(pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}, base.runtextbeg, end))")) + using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}, base.runtextbeg, end))")) { writer.WriteLine($"goto {doneLabel};"); } diff --git a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs index 8e688e348d0e2d..5f5d4b5bf0302c 100644 --- a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs +++ b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs @@ -338,14 +338,16 @@ protected void EnsureStorage() { } protected virtual void Go() { throw null; } protected virtual void InitTrackCount() { throw null; } protected bool IsBoundary(int index, int startpos, int endpos) { throw null; } + protected bool IsBoundary(System.ReadOnlySpan inputSpan, int index, int startpos, int endpos) { throw null; } protected bool IsECMABoundary(int index, int startpos, int endpos) { throw null; } + protected bool IsECMABoundary(System.ReadOnlySpan inputSpan, int index, int startpos, int endpos) { throw null; } protected bool IsMatched(int cap) { throw null; } protected int MatchIndex(int cap) { throw null; } protected int MatchLength(int cap) { throw null; } protected int Popcrawl() { throw null; } protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) { throw null; } protected internal System.Text.RegularExpressions.Match? 
Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, System.TimeSpan timeout) { throw null; } - protected internal virtual void Scan(System.Text.RegularExpressions.Regex regex, System.ReadOnlySpan text, int textstart, int prevlen, bool quick, System.TimeSpan timeout) { throw null; } + protected internal virtual void Scan(System.Text.RegularExpressions.Regex regex, System.ReadOnlySpan text, int textstart, int prevlen, bool quick) { throw null; } protected void TransferCapture(int capnum, int uncapnum, int start, int end) { } protected void Uncapture() { } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs index abdb182135d87c..2eee81fadc66f5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs @@ -9,7 +9,7 @@ namespace System.Text.RegularExpressions /// public class Capture { - internal Capture(string text, int index, int length) + internal Capture(string? text, int index, int length) { Text = text; Index = index; @@ -34,23 +34,23 @@ internal void AddBeginningToIndex(int beginning) public int Length { get; private protected set; } /// The original string - internal string Text { get; set; } + internal string? Text { get; set; } /// Gets the captured substring from the input string. /// The substring that is captured by the match. - public string Value => Text.Substring(Index, Length); + public string Value => Text is string text ? text.Substring(Index, Length) : string.Empty; /// Gets the captured span from the input string. /// The span that is captured by the match. - public ReadOnlySpan ValueSpan => Text.AsSpan(Index, Length); + public ReadOnlySpan ValueSpan => Text is string text ? 
text.AsSpan(Index, Length) : ReadOnlySpan.Empty; /// Returns the substring that was matched. public override string ToString() => Value; /// The substring to the left of the capture - internal ReadOnlyMemory GetLeftSubstring() => Text.AsMemory(0, Index); + internal ReadOnlyMemory GetLeftSubstring() => Text is string text ? text.AsMemory(0, Index) : ReadOnlyMemory.Empty; /// The substring to the right of the capture - internal ReadOnlyMemory GetRightSubstring() => Text.AsMemory(Index + Length, Text.Length - Index - Length); + internal ReadOnlyMemory GetRightSubstring() => Text is string text ? text.AsMemory(Index + Length, Text.Length - Index - Length) : ReadOnlyMemory.Empty; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs index 4711932555bc7c..acd122f8a57e56 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs @@ -7,7 +7,7 @@ internal sealed class CompiledRegexRunner : RegexRunner { private readonly ScanDelegate _scanMethod; - internal delegate void ScanDelegate(RegexRunner runner, Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick, TimeSpan timeout); + internal delegate void ScanDelegate(RegexRunner runner, Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick); public CompiledRegexRunner(ScanDelegate scan, int trackCount) { @@ -15,8 +15,8 @@ public CompiledRegexRunner(ScanDelegate scan, int trackCount) runtrackcount = trackCount; } - protected internal override void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick, TimeSpan timeout) - => _scanMethod(this, regex, text, textstart, prevlen, quick, timeout); + protected internal override void Scan(Regex regex, 
ReadOnlySpan text, int textstart, int prevlen, bool quick) + => _scanMethod(this, regex, text, textstart, prevlen, quick); protected override void InitTrackCount() { } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs index f4b2a7fb2e9804..2c34694f1ecaf5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs @@ -16,7 +16,7 @@ public class Group : Capture internal int _capcount; internal CaptureCollection? _capcoll; - internal Group(string text, int[] caps, int capcount, string name) + internal Group(string? text, int[] caps, int capcount, string name) : base(text, capcount == 0 ? 0 : caps[(capcount - 1) * 2], capcount == 0 ? 0 : caps[(capcount * 2) - 1]) { _caps = caps; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs index 3c67526b40e184..19859fd2f0b2d1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs @@ -50,7 +50,7 @@ public class Match : Group internal bool _balancing; // whether we've done any balancing with this match. If we // have done balancing, we'll need to do extra work in Tidy(). - internal Match(Regex? regex, int capcount, string text, int begpos, int len, int startpos) : + internal Match(Regex? regex, int capcount, string? text, int begpos, int len, int startpos) : base(text, new int[2], 0, "0") { _regex = regex; @@ -66,7 +66,7 @@ internal Match(Regex? regex, int capcount, string text, int begpos, int len, int /// Returns an empty Match object. 
public static Match Empty { get; } = new Match(null, 1, string.Empty, 0, 0, 0); - internal void Reset(Regex regex, string text, int textbeg, int textend, int textstart) + internal void Reset(Regex regex, string? text, int textbeg, int textend, int textstart) { _regex = regex; Text = text; @@ -94,6 +94,7 @@ internal void Reset(Regex regex, string text, int textbeg, int textend, int text public Match NextMatch() { Regex? r = _regex; + Debug.Assert(Text != null); return r != null ? r.Run(false, Length, Text, _textbeg, _textend - _textbeg, _textpos)! : this; @@ -338,7 +339,7 @@ internal sealed class MatchSparse : Match { private new readonly Hashtable _caps; - internal MatchSparse(Regex regex, Hashtable caps, int capcount, string text, int begpos, int len, int startpos) : + internal MatchSparse(Regex regex, Hashtable caps, int capcount, string? text, int begpos, int len, int startpos) : base(regex, capcount, text, begpos, len, startpos) { _caps = caps; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 52983d3ade42da..1e2f176033cfd5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -387,7 +387,7 @@ protected void InitializeReferences() ReadOnlySpan span = input.AsSpan(beginning, length); runner.InitializeForScan(this, span, 0, span.Length, startat - beginning, quick); runner.InitializeForGo(); - runner.Scan(this, span, startat - beginning, prevlen, quick, internalMatchTimeout); + runner.Scan(this, span, startat - beginning, prevlen, quick); Match? 
m = runner.runmatch; runner.runmatch = null; // Reset runmatch if (m is not null) @@ -433,7 +433,7 @@ protected void InitializeReferences() ReadOnlySpan span = input.Slice(beginning, length); runner.InitializeForScan(this, span, 0, span.Length, startat - beginning, quick); runner.InitializeForGo(); - runner.Scan(this, span, startat - beginning, prevlen, quick, internalMatchTimeout); + runner.Scan(this, span, startat - beginning, prevlen, quick); Match? m = runner.runmatch; runner.runmatch = null; // Reset runmatch return m; @@ -451,7 +451,83 @@ internal void Run(string input, int startat, ref TState state, MatchCall RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { - runner.ScanInternal(this, input, startat, ref state, callback, reuseMatchObject, internalMatchTimeout); + runner.InitializeTimeout(internalMatchTimeout); + while (true) + { + runner.InitializeForScan(this, input, 0, input.Length, startat, false); + runner.InitializeForGo(); + runner.Scan(this, input, startat, -1, false); + Match? m = runner.runmatch; + + if (m is not null) + { + if (m._matchcount[0] > 0) + { + if (m.Text != input) + m.Text = input; + if (!reuseMatchObject) + { + // We're not reusing match objects, so null out our field reference to the instance. + // It'll be recreated the next time one is needed. + runner.runmatch = null; + } + m.Tidy(runner.runtextpos); + if (!callback(ref state, m)) + { + // If the callback returns false, we're done. + // Drop reference to text to avoid keeping it alive in a cache. + runner.runtext = null!; + if (reuseMatchObject) + { + // We're reusing the single match instance, so clear out its text as well. + // We don't do this if we're not reusing instances, as in that case we're + // dropping the whole reference to the match, and we no longer own the instance + // having handed it out to the callback. 
+ m.Text = null!; + } + return; + } + + // Now that we've matched successfully, update the starting position to reflect + // the current position, just as Match.NextMatch() would pass in _textpos as textstart. + runner.runtextstart = startat = runner.runtextpos; + + // Reset state for another iteration. + runner.runtrackpos = runner.runtrack!.Length; + runner.runstackpos = runner.runstack!.Length; + runner.runcrawlpos = runner.runcrawl!.Length; + if (m.Length == 0) + { + if (runner.runtextpos == input.Length) + { + // Drop reference to text to avoid keeping it alive in a cache. + runner.runtext = null!; + if (reuseMatchObject) + { + // See above comment. + m.Text = null!; + } + return; + } + + runner.runtextpos += ((this.Options & RegexOptions.RightToLeft) > 0) ? -1 : 1; + } + + // Loop around to perform next match from where we left off. + continue; + } + else + { + // if we are at the end of the input, just return. + if (startat == input.Length) + { + runner.runtext = null; + runner.runmatch = null; + return; + } + } + } + } } finally { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index cf2167e07c1c88..a015206bb23a89 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.Reflection; using System.Reflection.Emit; @@ -36,9 +37,9 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_isMatchedMethod = RegexRunnerMethod("IsMatched"); private static readonly MethodInfo s_matchLengthMethod = RegexRunnerMethod("MatchLength"); private static readonly MethodInfo 
s_matchIndexMethod = RegexRunnerMethod("MatchIndex"); - private static readonly MethodInfo s_isBoundaryMethod = RegexRunnerMethod("IsBoundary"); + private static readonly MethodInfo s_isBoundaryMethod = typeof(RegexRunner).GetMethod("IsBoundary", BindingFlags.NonPublic | BindingFlags.Instance, new[] { typeof(ReadOnlySpan), typeof(int), typeof(int), typeof(int) })!; private static readonly MethodInfo s_isWordCharMethod = RegexRunnerMethod("IsWordChar"); - private static readonly MethodInfo s_isECMABoundaryMethod = RegexRunnerMethod("IsECMABoundary"); + private static readonly MethodInfo s_isECMABoundaryMethod = typeof(RegexRunner).GetMethod("IsECMABoundary", BindingFlags.NonPublic | BindingFlags.Instance, new[] { typeof(ReadOnlySpan), typeof(int), typeof(int), typeof(int) })!; private static readonly MethodInfo s_crawlposMethod = RegexRunnerMethod("Crawlpos"); private static readonly MethodInfo s_charInClassMethod = RegexRunnerMethod("CharInClass"); private static readonly MethodInfo s_checkTimeoutMethod = RegexRunnerMethod("CheckTimeout"); @@ -2311,8 +2312,9 @@ void EmitBoundary(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.Boundary or RegexNodeKind.NonBoundary or RegexNodeKind.ECMABoundary or RegexNodeKind.NonECMABoundary, $"Unexpected type: {node.Kind}"); - // if (!IsBoundary(pos + sliceStaticPos, base.runtextbeg, end)) goto doneLabel; + // if (!IsBoundary(inputSpan, pos + sliceStaticPos, base.runtextbeg, end)) goto doneLabel; Ldthis(); + Ldloc(inputSpan); Ldloc(pos); if (sliceStaticPos > 0) { @@ -3970,33 +3972,33 @@ protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMetho // if (prevlen == 0) // { - _ilg!.Emit(OpCodes.Ldarga_S, 4); + Label prevelenIsNotZero = DefineLabel(); + _ilg!.Emit(OpCodes.Ldarg_S, 4); Ldc(0); Ceq(); - Label prevelenIsNotZero = DefineLabel(); BrfalseFar(prevelenIsNotZero); // if (textstart == stoppos) // { + Label textstartNotEqualToStoppos = DefineLabel(); _ilg!.Emit(OpCodes.Ldarg_3); Ldloc(stoppos); 
Ceq(); - Label textstartNotEqualToStoppos = DefineLabel(); BrfalseFar(textstartNotEqualToStoppos); // runmatch = Match.Empty; // return; + Label returnLabel = DefineLabel(); Ldthis(); Call(s_matchGetEmptyMethod); Stfld(s_runmatchField); - Label returnLabel = DefineLabel(); BrFar(returnLabel); MarkLabel(textstartNotEqualToStoppos); - // runtextpos++; + // runtextpos += bump; Ldthis(); Ldthisfld(s_runtextposField); - Ldc(1); + Ldloc(bump); Add(); Stfld(s_runtextposField); MarkLabel(prevelenIsNotZero); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 712e8e512360b4..fdec70cb0c202a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -324,7 +324,7 @@ private bool MatchRef(int index, int length, ReadOnlySpan inputSpan) private void Backwardnext() => runtextpos += _rightToLeft ? 1 : -1; - protected internal override void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick, TimeSpan timeout) + protected internal override void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick) { // Configure the additional value to "bump" the position along each time we loop around // to call FindFirstChar again, as well as the stopping position for the loop. 
We generally @@ -346,7 +346,7 @@ protected internal override void Scan(Regex regex, ReadOnlySpan text, int return; } - runtextpos++; + runtextpos += bump; } while (true) @@ -770,7 +770,7 @@ private void Go(ReadOnlySpan inputSpan) continue; case RegexOpcode.Boundary: - if (!IsBoundary(runtextpos, runtextbeg, runtextend)) + if (!IsBoundary(inputSpan, runtextpos, runtextbeg, runtextend)) { break; } @@ -778,7 +778,7 @@ private void Go(ReadOnlySpan inputSpan) continue; case RegexOpcode.NonBoundary: - if (IsBoundary(runtextpos, runtextbeg, runtextend)) + if (IsBoundary(inputSpan, runtextpos, runtextbeg, runtextend)) { break; } @@ -786,7 +786,7 @@ private void Go(ReadOnlySpan inputSpan) continue; case RegexOpcode.ECMABoundary: - if (!IsECMABoundary(runtextpos, runtextbeg, runtextend)) + if (!IsECMABoundary(inputSpan, runtextpos, runtextbeg, runtextend)) { break; } @@ -794,7 +794,7 @@ private void Go(ReadOnlySpan inputSpan) continue; case RegexOpcode.NonECMABoundary: - if (IsECMABoundary(runtextpos, runtextbeg, runtextend)) + if (IsECMABoundary(inputSpan, runtextpos, runtextbeg, runtextend)) { break; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index 060750e23d58df..7c0767c02707e8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -58,7 +58,7 @@ internal sealed class RegexLWCGCompiler : RegexCompiler DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", null, typeof(CompiledRegexRunner), s_paramTypes); EmitGo(); - DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(Regex), typeof(ReadOnlySpan), 
typeof(int), typeof(int), typeof(bool), typeof(TimeSpan) }); + DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(Regex), typeof(ReadOnlySpan), typeof(int), typeof(int), typeof(bool) }); EmitScan(findFirstCharMethod, goMethod); return new CompiledRegexRunnerFactory(scanMethod, code.TrackCount); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index e8ea102b0421fe..8c6378ad2e6d7b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -88,15 +88,16 @@ protected RegexRunner() { } protected Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) => Scan(regex, text, textbeg, textend, textstart, prevlen, quick, regex.MatchTimeout); - protected internal virtual void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick, TimeSpan timeout) + protected internal virtual void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick) { string? s = runtext; if (text != s) { s = text.ToString(); + runtext = s; } - Match? match = Scan(regex, s, 0, text.Length, textstart, prevlen, quick, timeout); + Match? match = Scan(regex, s, 0, text.Length, textstart, prevlen, quick, regex.internalMatchTimeout); runmatch = match; } @@ -378,7 +379,9 @@ private void DoCheckTimeout() if (0 > _timeoutOccursAt && 0 < currentMillis) return; - throw new RegexMatchTimeoutException(runtext!, runregex!.pattern!, TimeSpan.FromMilliseconds(_timeout)); + string input = runtext ?? 
string.Empty; + + throw new RegexMatchTimeoutException(input, runregex!.pattern!, TimeSpan.FromMilliseconds(_timeout)); } /// @@ -412,12 +415,12 @@ internal void InitializeForGo() { // Use a hashtabled Match object if the capture numbers are sparse runmatch = runregex!.caps is null ? - new Match(runregex, runregex.capsize, runtext!, runtextbeg, runtextend - runtextbeg, runtextstart) : - new MatchSparse(runregex, runregex.caps, runregex.capsize, runtext!, runtextbeg, runtextend - runtextbeg, runtextstart); + new Match(runregex, runregex.capsize, runtext ?? string.Empty, runtextbeg, runtextend - runtextbeg, runtextstart) : + new MatchSparse(runregex, runregex.caps, runregex.capsize, runtext, runtextbeg, runtextend - runtextbeg, runtextstart); } else { - runmatch.Reset(runregex!, runtext!, runtextbeg, runtextend, runtextstart); + runmatch.Reset(runregex!, runtext, runtextbeg, runtextend, runtextstart); } // Note we test runcrawl, because it is the last one to be allocated @@ -480,8 +483,15 @@ protected void EnsureStorage() /// protected bool IsBoundary(int index, int startpos, int endpos) { - return (index > startpos && RegexCharClass.IsBoundaryWordChar(runtext![index - 1])) != - (index < endpos && RegexCharClass.IsBoundaryWordChar(runtext![index])); + Debug.Assert(runtext != null, "runtext should not be null since this method is only callable by old codegen."); + return (index > startpos && RegexCharClass.IsBoundaryWordChar(runtext[index - 1])) != + (index < endpos && RegexCharClass.IsBoundaryWordChar(runtext[index])); + } + + protected bool IsBoundary(ReadOnlySpan inputSpan, int index, int startpos, int endpos) + { + return (index > startpos && RegexCharClass.IsBoundaryWordChar(inputSpan[index - 1])) != + (index < endpos && RegexCharClass.IsBoundaryWordChar(inputSpan[index])); } /// Called to determine a char's inclusion in the \w set. 
@@ -489,8 +499,15 @@ protected bool IsBoundary(int index, int startpos, int endpos) protected bool IsECMABoundary(int index, int startpos, int endpos) { - return (index > startpos && RegexCharClass.IsECMAWordChar(runtext![index - 1])) != - (index < endpos && RegexCharClass.IsECMAWordChar(runtext![index])); + Debug.Assert(runtext != null, "runtext should not be null since this method is only callable by old codegen."); + return (index > startpos && RegexCharClass.IsECMAWordChar(runtext[index - 1])) != + (index < endpos && RegexCharClass.IsECMAWordChar(runtext[index])); + } + + protected bool IsECMABoundary(ReadOnlySpan inputSpan, int index, int startpos, int endpos) + { + return (index > startpos && RegexCharClass.IsECMAWordChar(inputSpan[index - 1])) != + (index < endpos && RegexCharClass.IsECMAWordChar(inputSpan[index])); } protected static bool CharInSet(char ch, string set, string category) @@ -692,7 +709,10 @@ string DescribeTextPosition() if (runtextpos > runtextbeg) { - sb.Append(RegexCharClass.DescribeChar(runtext![runtextpos - 1])); + if (runtext != null) + { + sb.Append(RegexCharClass.DescribeChar(runtext[runtextpos - 1])); + } } else { @@ -703,7 +723,10 @@ string DescribeTextPosition() for (int i = runtextpos; i < runtextend; i++) { - sb.Append(RegexCharClass.DescribeChar(runtext![i])); + if (runtext != null) + { + sb.Append(RegexCharClass.DescribeChar(runtext[i])); + } } if (sb.Length >= 64) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index dd2050848d6925..6aff09f2ba8974 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -83,9 +83,42 @@ 
private sealed class Runner : RegexRunner where TSetType : notnull internal Runner(SymbolicRegexMatcher matcher) => _matcher = matcher; - protected internal override void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick, TimeSpan timeout) + protected internal override void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick) { + // Configure the additional value to "bump" the position along each time we loop around + // to call FindFirstChar again, as well as the stopping position for the loop. We generally + // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump + // by -1 and stop at textbeg. + int stoppos = text.Length; + if (regex.RightToLeft) + { + stoppos = 0; + } + + // If previous match was empty or failed, advance by one before matching. + if (prevlen == 0) + { + if (textstart == stoppos) + { + runmatch = Match.Empty; + return; + } + + runtextpos += regex.RightToLeft ? -1 : 1; + } + Go(text); + + // If we got a match, we're done. 
+ if (runmatch!._matchcount[0] > 0) + { + if (quick) + { + runmatch = null; + } + } + + return; } private void Go(ReadOnlySpan inputSpan) From d16b8e93ad6f149ccf87c1f58cf0516c302bf51f Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Fri, 4 Feb 2022 10:23:06 -0800 Subject: [PATCH 4/6] Cleaning up some commented out code --- .../gen/RegexGenerator.Emitter.cs | 42 ------------------- .../ref/System.Text.RegularExpressions.cs | 4 +- 2 files changed, 2 insertions(+), 44 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index ad8db87de00849..a7ab5424d9d6ce 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -309,26 +309,6 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht) private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string id) { - RegexOptions options = (RegexOptions)rm.Options; - - // Emit locals initialization - //writer.WriteLine("this.quick = quick;"); - //writer.WriteLine("base.runtextpos = textstart;"); - //writer.WriteLine("base.runregex = regex;"); - //writer.WriteLine("base.runtextstart = textstart;"); - //writer.WriteLine("base.runtextbeg = textbeg;"); - //writer.WriteLine("base.runtextend = textend;"); - //writer.WriteLine("bool initialized = false;"); - //writer.WriteLine(); - - //EmitTimeoutHandling(); - //writer.WriteLine(); - - // Source generator doesn't support Right-To-Left so there is no need to add the sepcial bump logic. 
- // If Right-to-left is ever added to the source generator, then we would need to the logic to define - // bump, as well as stoppos - Debug.Assert((options & RegexOptions.RightToLeft) == 0); - EmitPrevLenCheck(); writer.WriteLine(); @@ -337,26 +317,6 @@ private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string i return; -#pragma warning disable CS8321 // Local function is declared but never used - void EmitTimeoutHandling() - { - writer.WriteLine("// Handle timeout argument"); - writer.WriteLine("_timeout = -1;"); - writer.WriteLine("bool ignoreTimeout = global::System.Text.RegularExpressions.Regex.InfiniteMatchTimeout == timeout;"); - using (EmitBlock(writer, "if (!ignoreTimeout)")) - { - writer.WriteLine("// We are using Environment.TickCount and not Stopwatch for performance reasons."); - writer.WriteLine("// Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt"); - writer.WriteLine("// overflow it will still stay ahead of Environment.TickCount for comparisons made"); - writer.WriteLine("// in DoCheckTimeout()."); - writer.WriteLine("global::System.Text.RegularExpressions.Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected"); - writer.WriteLine("_timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round;"); - writer.WriteLine("_timeoutOccursAt = global::System.Environment.TickCount + _timeout;"); - writer.WriteLine("_timeoutChecksToSkip = TimeoutCheckFrequency;"); - } - } -#pragma warning restore CS8321 // Local function is declared but never used - void EmitPrevLenCheck() { writer.WriteLine("// If previous match was empty or failed, advance by one before matching."); @@ -387,11 +347,9 @@ void EmitMainSearchLoop() using (EmitBlock(writer, "if (quick)")) { writer.WriteLine("base.runmatch = null;"); - writer.WriteLine("return;"); } writer.WriteLine(); - writer.WriteLine("// base.runmatch!.Tidy(base.runtextpos);"); writer.WriteLine("return;"); } 
writer.WriteLine(); diff --git a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs index 5f5d4b5bf0302c..1b7b96774321ed 100644 --- a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs +++ b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs @@ -338,9 +338,9 @@ protected void EnsureStorage() { } protected virtual void Go() { throw null; } protected virtual void InitTrackCount() { throw null; } protected bool IsBoundary(int index, int startpos, int endpos) { throw null; } - protected bool IsBoundary(System.ReadOnlySpan inputSpan, int index, int startpos, int endpos) { throw null; } + protected bool IsBoundary(System.ReadOnlySpan inputSpan, int index, int startpos, int endpos) { throw null; } // -> This is just temporary on the prototype. Method will be emitted by the generator engines protected bool IsECMABoundary(int index, int startpos, int endpos) { throw null; } - protected bool IsECMABoundary(System.ReadOnlySpan inputSpan, int index, int startpos, int endpos) { throw null; } + protected bool IsECMABoundary(System.ReadOnlySpan inputSpan, int index, int startpos, int endpos) { throw null; } // -> This is just temporary on the prototype. Method will be emitted by the generator engines protected bool IsMatched(int cap) { throw null; } protected int MatchIndex(int cap) { throw null; } protected int MatchLength(int cap) { throw null; } From 444c017aead154438f6abdf0ef92fee10bfeb6ea Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Tue, 8 Feb 2022 10:21:15 -0800 Subject: [PATCH 5/6] Making Scan take only a Span and ensuring it no longer needs to touch runmatch.
--- .../gen/RegexGenerator.Emitter.cs | 70 +++------ .../ref/System.Text.RegularExpressions.cs | 2 +- .../RegularExpressions/CompiledRegexRunner.cs | 6 +- .../System/Text/RegularExpressions/Regex.cs | 148 ++++++++++++------ .../Text/RegularExpressions/RegexCompiler.cs | 70 ++------- .../RegularExpressions/RegexInterpreter.cs | 23 +-- .../RegularExpressions/RegexLWCGCompiler.cs | 2 +- .../Text/RegularExpressions/RegexRunner.cs | 19 +-- .../Symbolic/SymbolicRegexRunnerFactory.cs | 36 +---- 9 files changed, 141 insertions(+), 235 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index a7ab5424d9d6ce..5dba985e66c03e 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -222,7 +222,7 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri DescribeExpression(writer, rm.Code.Tree.Root.Child(0), " // ", rm.Code); // skip implicit root capture writer.WriteLine(); - writer.WriteLine($" protected override void Scan(global::System.Text.RegularExpressions.Regex regex, global::System.ReadOnlySpan text, int textstart, int prevlen, bool quick)"); + writer.WriteLine($" protected override void Scan(global::System.ReadOnlySpan text)"); writer.WriteLine($" {{"); writer.Indent += 4; EmitScan(writer, rm, id); @@ -309,68 +309,38 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht) private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string id) { - EmitPrevLenCheck(); - writer.WriteLine(); - - EmitMainSearchLoop(); - writer.WriteLine(); - - return; - - void EmitPrevLenCheck() - { - writer.WriteLine("// If previous match was empty or failed, advance by one before matching."); - using (EmitBlock(writer, "if (prevlen == 0)")) - { - using (EmitBlock(writer, "if (textstart == text.Length)")) 
- { - writer.WriteLine("base.runmatch = global::System.Text.RegularExpressions.Match.Empty;"); - writer.WriteLine("return;"); - } - writer.WriteLine(); - writer.WriteLine("base.runtextpos++;"); - } - } - - void EmitMainSearchLoop() + using (EmitBlock(writer, "while (true)")) { - using (EmitBlock(writer, "while (true)")) + using (EmitBlock(writer, "if (FindFirstChar(text))")) { - using (EmitBlock(writer, "if (FindFirstChar(text))")) + if (rm.MatchTimeout != int.MaxValue) { writer.WriteLine("base.CheckTimeout();"); writer.WriteLine(); - - writer.WriteLine("// If we got a match, we're done."); - using (EmitBlock(writer, "if (Go(text))")) - { - using (EmitBlock(writer, "if (quick)")) - { - writer.WriteLine("base.runmatch = null;"); - } - writer.WriteLine(); - - writer.WriteLine("return;"); - } - writer.WriteLine(); - - writer.WriteLine("// Reset state for another iteration."); - writer.WriteLine("base.runtrackpos = base.runtrack!.Length;"); - writer.WriteLine("base.runstackpos = base.runstack!.Length;"); - writer.WriteLine("base.runcrawlpos = base.runcrawl!.Length;"); } - writer.WriteLine(); - writer.WriteLine("// We failed to find a match. If we're at the end of the input, then we are done."); - using (EmitBlock(writer, "if (base.runtextpos == text.Length)")) + writer.WriteLine("// If we got a match, we're done."); + using (EmitBlock(writer, "if (Go(text))")) { - writer.WriteLine("base.runmatch = global::System.Text.RegularExpressions.Match.Empty;"); writer.WriteLine("return;"); } writer.WriteLine(); - writer.WriteLine("base.runtextpos++;"); + writer.WriteLine("// Reset state for another iteration."); + writer.WriteLine("base.runtrackpos = base.runtrack!.Length;"); + writer.WriteLine("base.runstackpos = base.runstack!.Length;"); + writer.WriteLine("base.runcrawlpos = base.runcrawl!.Length;"); } + writer.WriteLine(); + + writer.WriteLine("// We failed to find a match. 
If we're at the end of the input, then we are done."); + using (EmitBlock(writer, "if (base.runtextpos == text.Length)")) + { + writer.WriteLine("return;"); + } + writer.WriteLine(); + + writer.WriteLine("base.runtextpos++;"); } } diff --git a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs index 1b7b96774321ed..bae661abee0754 100644 --- a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs +++ b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs @@ -347,7 +347,7 @@ protected void EnsureStorage() { } protected int Popcrawl() { throw null; } protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) { throw null; } protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, System.TimeSpan timeout) { throw null; } - protected internal virtual void Scan(System.Text.RegularExpressions.Regex regex, System.ReadOnlySpan text, int textstart, int prevlen, bool quick) { throw null; } + protected internal virtual void Scan(System.ReadOnlySpan text) { throw null; } protected void TransferCapture(int capnum, int uncapnum, int start, int end) { } protected void Uncapture() { } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs index acd122f8a57e56..ecc7cbfadf4a80 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs 
@@ -7,7 +7,7 @@ internal sealed class CompiledRegexRunner : RegexRunner { private readonly ScanDelegate _scanMethod; - internal delegate void ScanDelegate(RegexRunner runner, Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick); + internal delegate void ScanDelegate(RegexRunner runner, ReadOnlySpan text); public CompiledRegexRunner(ScanDelegate scan, int trackCount) { @@ -15,8 +15,8 @@ public CompiledRegexRunner(ScanDelegate scan, int trackCount) runtrackcount = trackCount; } - protected internal override void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick) - => _scanMethod(this, regex, text, textstart, prevlen, quick); + protected internal override void Scan(ReadOnlySpan text) + => _scanMethod(this, text); protected override void InitTrackCount() { } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 1e2f176033cfd5..97556b667028da 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -383,11 +383,47 @@ protected void InitializeReferences() RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { + bool skipScan = false; runner.InitializeTimeout(internalMatchTimeout); ReadOnlySpan span = input.AsSpan(beginning, length); runner.InitializeForScan(this, span, 0, span.Length, startat - beginning, quick); runner.InitializeForGo(); - runner.Scan(this, span, startat - beginning, prevlen, quick); + + int stoppos = RightToLeft ? 0 : span.Length; + + // If previous match was empty or failed, advance by one before matching. 
+ if (prevlen == 0) + { + if (runner.runtextstart == stoppos) + { + skipScan = true; + runner.runmatch = System.Text.RegularExpressions.Match.Empty; + } + + runner.runtextpos += RightToLeft ? -1 : 1; + } + + if (!skipScan) + { + runner.Scan(span); + + // if we got a match, set runmatch to null if quick is true + if (runner.runmatch!._matchcount[0] > 0) + { + if (quick) + { + runner.runmatch = null; + } + } + else + { + if (runner.runtextpos == stoppos) + { + runner.runmatch = System.Text.RegularExpressions.Match.Empty; + } + } + } + Match? m = runner.runmatch; runner.runmatch = null; // Reset runmatch if (m is not null) @@ -433,7 +469,7 @@ protected void InitializeReferences() ReadOnlySpan span = input.Slice(beginning, length); runner.InitializeForScan(this, span, 0, span.Length, startat - beginning, quick); runner.InitializeForGo(); - runner.Scan(this, span, startat - beginning, prevlen, quick); + runner.Scan(span); Match? m = runner.runmatch; runner.runmatch = null; // Reset runmatch return m; @@ -452,79 +488,89 @@ internal void Run(string input, int startat, ref TState state, MatchCall try { runner.InitializeTimeout(internalMatchTimeout); + int runtextpos = startat; while (true) { runner.InitializeForScan(this, input, 0, input.Length, startat, false); + runner.runtextpos = runtextpos; runner.InitializeForGo(); - runner.Scan(this, input, startat, -1, false); + + int stoppos = RightToLeft ? 0 : input.Length; + runner.Scan(input); Match? m = runner.runmatch; - if (m is not null) + // if we got a match, set runmatch to null if quick is true + if (m is not null && m._matchcount[0] > 0) { - if (m._matchcount[0] > 0) + if (m.Text != input) { - if (m.Text != input) - m.Text = input; - if (!reuseMatchObject) + m.Text = input; + } + + if (!reuseMatchObject) + { + // We're not reusing match objects, so null out our field reference to the instance. + // It'll be recreated the next time one is needed. 
+ runner.runmatch = null; + } + m.Tidy(runner.runtextpos); + if (!callback(ref state, m)) + { + // If the callback returns false, we're done. + // Drop reference to text to avoid keeping it alive in a cache. + runner.runtext = null!; + if (reuseMatchObject) { - // We're not reusing match objects, so null out our field reference to the instance. - // It'll be recreated the next time one is needed. - runner.runmatch = null; + // We're reusing the single match instance, so clear out its text as well. + // We don't do this if we're not reusing instances, as in that case we're + // dropping the whole reference to the match, and we no longer own the instance + // having handed it out to the callback. + m.Text = null!; } - m.Tidy(runner.runtextpos); - if (!callback(ref state, m)) + return; + } + + // Now that we've matched successfully, update the starting position to reflect + // the current position, just as Match.NextMatch() would pass in _textpos as textstart. + runtextpos = startat = runner.runtextpos; + + + // Reset state for another iteration. + runner.runtrackpos = runner.runtrack!.Length; + runner.runstackpos = runner.runstack!.Length; + runner.runcrawlpos = runner.runcrawl!.Length; + + if (m.Length == 0) + { + if (runner.runtextpos == stoppos) { - // If the callback returns false, we're done. // Drop reference to text to avoid keeping it alive in a cache. runner.runtext = null!; if (reuseMatchObject) { - // We're reusing the single match instance, so clear out its text as well. - // We don't do this if we're not reusing instances, as in that case we're - // dropping the whole reference to the match, and we no longer own the instance - // having handed it out to the callback. + // See above comment. m.Text = null!; } return; } - // Now that we've matched successfully, update the starting position to reflect - // the current position, just as Match.NextMatch() would pass in _textpos as textstart. 
- runner.runtextstart = startat = runner.runtextpos; - - // Reset state for another iteration. - runner.runtrackpos = runner.runtrack!.Length; - runner.runstackpos = runner.runstack!.Length; - runner.runcrawlpos = runner.runcrawl!.Length; - if (m.Length == 0) - { - if (runner.runtextpos == input.Length) - { - // Drop reference to text to avoid keeping it alive in a cache. - runner.runtext = null!; - if (reuseMatchObject) - { - // See above comment. - m.Text = null!; - } - return; - } - - runner.runtextpos += ((this.Options & RegexOptions.RightToLeft) > 0) ? -1 : 1; - } - - // Loop around to perform next match from where we left off. - continue; + runtextpos += RightToLeft ? -1 : 1; } - else + + // Loop around to perform next match from where we left off. + continue; + } + else + { + // We failed to match at this position. If we're at the stopping point, we're done. + if (runner.runtextpos == stoppos) { - // if we are at the end of the input, just return. - if (startat == input.Length) + runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache + if (runner.runmatch != null) { - runner.runtext = null; - runner.runmatch = null; - return; + runner.runmatch.Text = null!; } + return; } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index a015206bb23a89..4e83fcddda574a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -28,6 +28,7 @@ internal abstract class RegexCompiler private static readonly FieldInfo s_runstackposField = RegexRunnerField("runstackpos"); private static readonly FieldInfo s_runcrawlField = RegexRunnerField("runcrawl"); private static readonly FieldInfo s_runcrawlposField = 
RegexRunnerField("runcrawlpos"); + private static readonly FieldInfo s_runregexField = RegexRunnerField("runregex"); private static readonly FieldInfo s_matchMatchCountField = typeof(Match).GetField("_matchcount", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance | BindingFlags.Static)!; @@ -53,7 +54,6 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_regexGetRightToLeft = typeof(Regex).GetMethod("get_RightToLeft")!; private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan).GetMethod("get_Item", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan).GetMethod("get_Length")!; - private static readonly MethodInfo s_matchGetEmptyMethod = typeof(Match).GetMethod("get_Empty")!; private static readonly MethodInfo s_memoryMarshalGetReference = typeof(MemoryMarshal).GetMethod("GetReference", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfChar = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfSpan = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); @@ -3944,6 +3944,7 @@ protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMetho { LocalBuilder bump = DeclareInt32(); LocalBuilder stoppos = DeclareInt32(); + Label returnLabel = DefineLabel(); Label notRightToLeft = DefineLabel(); // int bump = 1 @@ -3951,13 +3952,13 @@ protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMetho Stloc(bump); // 
int stoppos = text.Length - _ilg!.Emit(OpCodes.Ldarga_S, 2); + _ilg!.Emit(OpCodes.Ldarga_S, 1); Call(s_spanGetLengthMethod); Stloc(stoppos); - // if (regex.RightToLeft) + // if (runregex.RightToLeft) // { - Ldarg_1(); + Ldthisfld(s_runregexField); Callvirt(s_regexGetRightToLeft); BrfalseFar(notRightToLeft); @@ -3970,49 +3971,16 @@ protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMetho Stloc(stoppos); MarkLabel(notRightToLeft); - // if (prevlen == 0) - // { - Label prevelenIsNotZero = DefineLabel(); - _ilg!.Emit(OpCodes.Ldarg_S, 4); - Ldc(0); - Ceq(); - BrfalseFar(prevelenIsNotZero); - - // if (textstart == stoppos) - // { - Label textstartNotEqualToStoppos = DefineLabel(); - _ilg!.Emit(OpCodes.Ldarg_3); - Ldloc(stoppos); - Ceq(); - BrfalseFar(textstartNotEqualToStoppos); - - // runmatch = Match.Empty; - // return; - Label returnLabel = DefineLabel(); - Ldthis(); - Call(s_matchGetEmptyMethod); - Stfld(s_runmatchField); - BrFar(returnLabel); - MarkLabel(textstartNotEqualToStoppos); - - // runtextpos += bump; - Ldthis(); - Ldthisfld(s_runtextposField); - Ldloc(bump); - Add(); - Stfld(s_runtextposField); - MarkLabel(prevelenIsNotZero); - // while (true) Label whileLoopEnd = DefineLabel(); Label whileLoopBody = DefineLabel(); MarkLabel(whileLoopBody); // if (FindFirstChar(text)) + Label afterFindFirstCharLabel = DefineLabel(); Ldthis(); - _ilg!.Emit(OpCodes.Ldarg_2); + Ldarg_1(); Call(findFirstCharMethod); - Label afterFindFirstCharLabel = DefineLabel(); BrfalseFar(afterFindFirstCharLabel); // CheckTimeout(); @@ -4021,31 +3989,19 @@ protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMetho // Go(text); Ldthis(); - _ilg!.Emit(OpCodes.Ldarg_2); + Ldarg_1(); Call(goMethod); // if (runmatch!._matchcount[0] > 0) + // return; + Label afterSuccessMatchLabel = DefineLabel(); Ldthisfld(s_runmatchField); _ilg!.Emit(OpCodes.Ldfld, s_matchMatchCountField); Ldc(0); LdelemI4(); Ldc(0); _ilg!.Emit(OpCodes.Cgt); - Label 
afterSuccessMatchLabel = DefineLabel(); BrfalseFar(afterSuccessMatchLabel); - - // if (quick) - _ilg!.Emit(OpCodes.Ldarg_S, 5); - Label afterQuickCheckLabel = DefineLabel(); - BrfalseFar(afterQuickCheckLabel); - - // runmatch = null; - Ldthis(); - _ilg!.Emit(OpCodes.Ldnull); - Stfld(s_runmatchField); - - // return; - MarkLabel(afterQuickCheckLabel); BrFar(returnLabel); // runtrackpos = runtrack!.Length; @@ -4071,18 +4027,14 @@ protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMetho Stfld(s_runcrawlposField); // if (runtextpos == stoppos) + Label incrementRuntextPosLabel = DefineLabel(); MarkLabel(afterFindFirstCharLabel); Ldthisfld(s_runtextposField); Ldloc(stoppos); Ceq(); - Label incrementRuntextPosLabel = DefineLabel(); BrfalseFar(incrementRuntextPosLabel); - // runmatch = Match.Empty; // return; - Ldthis(); - Call(s_matchGetEmptyMethod); - Stfld(s_runmatchField); BrFar(returnLabel); // runtextpos += bump diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index fdec70cb0c202a..280c94736d46c4 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -324,31 +324,20 @@ private bool MatchRef(int index, int length, ReadOnlySpan inputSpan) private void Backwardnext() => runtextpos += _rightToLeft ? 1 : -1; - protected internal override void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick) + protected internal override void Scan(ReadOnlySpan text) { // Configure the additional value to "bump" the position along each time we loop around // to call FindFirstChar again, as well as the stopping position for the loop. 
We generally // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump // by -1 and stop at textbeg. int bump = 1, stoppos = text.Length; - if (regex.RightToLeft) + Debug.Assert(runregex != null); + if (runregex.RightToLeft) { bump = -1; stoppos = 0; } - // If previous match was empty or failed, advance by one before matching. - if (prevlen == 0) - { - if (textstart == stoppos) - { - runmatch = Match.Empty; - return; - } - - runtextpos += bump; - } - while (true) { if (FindFirstChar(text)) @@ -360,11 +349,6 @@ protected internal override void Scan(Regex regex, ReadOnlySpan text, int // If we got a match, we're done. if (runmatch!._matchcount[0] > 0) { - if (quick) - { - runmatch = null; - } - return; } @@ -376,7 +360,6 @@ protected internal override void Scan(Regex regex, ReadOnlySpan text, int if (runtextpos == stoppos) { - runmatch = Match.Empty; return; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index 7c0767c02707e8..22befb48e48505 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -58,7 +58,7 @@ internal sealed class RegexLWCGCompiler : RegexCompiler DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", null, typeof(CompiledRegexRunner), s_paramTypes); EmitGo(); - DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(Regex), typeof(ReadOnlySpan), typeof(int), typeof(int), typeof(bool) }); + DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(ReadOnlySpan) }); 
EmitScan(findFirstCharMethod, goMethod); return new CompiledRegexRunnerFactory(scanMethod, code.TrackCount); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 8c6378ad2e6d7b..eb8190b32dcc9e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -88,7 +88,7 @@ protected RegexRunner() { } protected Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) => Scan(regex, text, textbeg, textend, textstart, prevlen, quick, regex.MatchTimeout); - protected internal virtual void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick) + protected internal virtual void Scan(ReadOnlySpan text) { string? s = runtext; if (text != s) @@ -97,14 +97,13 @@ protected internal virtual void Scan(Regex regex, ReadOnlySpan text, int t runtext = s; } - Match? match = Scan(regex, s, 0, text.Length, textstart, prevlen, quick, regex.internalMatchTimeout); + Debug.Assert(runregex != null); + Match? match = Scan(runregex, s, 0, text.Length, runtextstart, -1, quick, runregex.internalMatchTimeout); runmatch = match; } protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) { - this.quick = quick; - // Configure the additional value to "bump" the position along each time we loop around // to call FindFirstChar again, as well as the stopping position for the loop. 
We generally // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump @@ -116,17 +115,6 @@ protected internal virtual void Scan(Regex regex, ReadOnlySpan text, int t stoppos = textbeg; } - // If previous match was empty or failed, advance by one before matching. - if (prevlen == 0) - { - if (textstart == stoppos) - { - return Match.Empty; - } - - runtextpos += bump; - } - while (true) { // Find the next potential location for a match in the input. @@ -186,7 +174,6 @@ internal void InitializeForScan(Regex regex, ReadOnlySpan text, int textbe // Store remaining arguments into fields now that we're going to start the scan. // These are referenced by the derived runner. runregex = regex; - //runtext = text; runtextstart = textstart; runtextbeg = textbeg; runtextend = textend; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index 6aff09f2ba8974..4146fbcfbeb889 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Diagnostics; using System.Globalization; using System.Text.RegularExpressions.Symbolic.Unicode; @@ -83,42 +84,9 @@ private sealed class Runner : RegexRunner where TSetType : notnull internal Runner(SymbolicRegexMatcher matcher) => _matcher = matcher; - protected internal override void Scan(Regex regex, ReadOnlySpan text, int textstart, int prevlen, bool quick) + protected internal override void Scan(ReadOnlySpan text) { - // Configure the additional value to "bump" the position along each time we loop around - // to call FindFirstChar again, as well as the stopping position for the loop. We generally - // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump - // by -1 and stop at textbeg. - int stoppos = text.Length; - if (regex.RightToLeft) - { - stoppos = 0; - } - - // If previous match was empty or failed, advance by one before matching. - if (prevlen == 0) - { - if (textstart == stoppos) - { - runmatch = Match.Empty; - return; - } - - runtextpos += regex.RightToLeft ? -1 : 1; - } - Go(text); - - // If we got a match, we're done. 
- if (runmatch!._matchcount[0] > 0) - { - if (quick) - { - runmatch = null; - } - } - - return; } private void Go(ReadOnlySpan inputSpan) From cc926232aa022da65d932907ab6628439369f922 Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Tue, 8 Feb 2022 14:00:01 -0800 Subject: [PATCH 6/6] Address some PR feedback --- .../gen/RegexGenerator.Emitter.cs | 2 +- .../Text/RegularExpressions/CompiledRegexRunner.cs | 2 -- .../src/System/Text/RegularExpressions/RegexRunner.cs | 10 ++++------ 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 5dba985e66c03e..121b2a2f6072bf 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -313,7 +313,7 @@ private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string i { using (EmitBlock(writer, "if (FindFirstChar(text))")) { - if (rm.MatchTimeout != int.MaxValue) + if (rm.MatchTimeout != Timeout.Infinite) { writer.WriteLine("base.CheckTimeout();"); writer.WriteLine(); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs index ecc7cbfadf4a80..4de55562816575 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs @@ -17,7 +17,5 @@ public CompiledRegexRunner(ScanDelegate scan, int trackCount) protected internal override void Scan(ReadOnlySpan text) => _scanMethod(this, text); - - protected override void InitTrackCount() { } } } diff --git
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index eb8190b32dcc9e..ac5fdd16e7a07d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -470,9 +470,8 @@ protected void EnsureStorage() /// protected bool IsBoundary(int index, int startpos, int endpos) { - Debug.Assert(runtext != null, "runtext should not be null since this method is only callable by old codegen."); - return (index > startpos && RegexCharClass.IsBoundaryWordChar(runtext[index - 1])) != - (index < endpos && RegexCharClass.IsBoundaryWordChar(runtext[index])); + return (index > startpos && RegexCharClass.IsBoundaryWordChar(runtext![index - 1])) != + (index < endpos && RegexCharClass.IsBoundaryWordChar(runtext![index])); } protected bool IsBoundary(ReadOnlySpan inputSpan, int index, int startpos, int endpos) @@ -486,9 +485,8 @@ protected bool IsBoundary(ReadOnlySpan inputSpan, int index, int startpos, protected bool IsECMABoundary(int index, int startpos, int endpos) { - Debug.Assert(runtext != null, "runtext should not be null since this method is only callable by old codegen."); - return (index > startpos && RegexCharClass.IsECMAWordChar(runtext[index - 1])) != - (index < endpos && RegexCharClass.IsECMAWordChar(runtext[index])); + return (index > startpos && RegexCharClass.IsECMAWordChar(runtext![index - 1])) != + (index < endpos && RegexCharClass.IsECMAWordChar(runtext![index])); } protected bool IsECMABoundary(ReadOnlySpan inputSpan, int index, int startpos, int endpos)