diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 1f2a5d7de00a03..121b2a2f6072bf 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -222,7 +222,15 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri DescribeExpression(writer, rm.Code.Tree.Root.Child(0), " // ", rm.Code); // skip implicit root capture writer.WriteLine(); - writer.WriteLine($" protected override bool FindFirstChar()"); + writer.WriteLine($" protected override void Scan(global::System.ReadOnlySpan text)"); + writer.WriteLine($" {{"); + writer.Indent += 4; + EmitScan(writer, rm, id); + writer.Indent -= 4; + writer.WriteLine($" }}"); + writer.WriteLine(); + + writer.WriteLine($" private bool FindFirstChar(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; RequiredHelperFunctions requiredHelpers = EmitFindFirstChar(writer, rm, id); @@ -233,7 +241,7 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri { writer.WriteLine($" [global::System.Runtime.CompilerServices.SkipLocalsInit]"); } - writer.WriteLine($" protected override void Go()"); + writer.WriteLine($" private bool Go(global::System.ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 4; requiredHelpers |= EmitGo(writer, rm, id); @@ -299,6 +307,43 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht) } } + private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string id) + { + using (EmitBlock(writer, "while (true)")) + { + using (EmitBlock(writer, "if (FindFirstChar(text))")) + { + if (rm.MatchTimeout != Timeout.Infinite) + { + writer.WriteLine("base.CheckTimeout();"); + writer.WriteLine(); + } + + writer.WriteLine("// If we got a match, we're done."); + using 
(EmitBlock(writer, "if (Go(text))")) + { + writer.WriteLine("return;"); + } + writer.WriteLine(); + + writer.WriteLine("// Reset state for another iteration."); + writer.WriteLine("base.runtrackpos = base.runtrack!.Length;"); + writer.WriteLine("base.runstackpos = base.runstack!.Length;"); + writer.WriteLine("base.runcrawlpos = base.runcrawl!.Length;"); + } + writer.WriteLine(); + + writer.WriteLine("// We failed to find a match. If we're at the end of the input, then we are done."); + using (EmitBlock(writer, "if (base.runtextpos == text.Length)")) + { + writer.WriteLine("return;"); + } + writer.WriteLine(); + + writer.WriteLine("base.runtextpos++;"); + } + } + /// Emits the body of the FindFirstChar override. private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id) { @@ -347,7 +392,6 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ { case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitIndexOf(code.FindOptimizations.LeadingCaseSensitivePrefix); break; @@ -356,13 +400,11 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitFixedSet(); break; case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight_CaseSensitive: Debug.Assert(code.FindOptimizations.LiteralAfterLoop is not null); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); EmitLiteralAfterAtomicLoop(); break; @@ -463,7 +505,6 @@ bool EmitAnchors() 
// the other anchors, which all skip all subsequent processing if found, with BOL we just use it // to boost our position to the next line, and then continue normally with any searches. writer.WriteLine("// Beginning-of-line anchor"); - additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); additionalDeclarations.Add("int beginning = base.runtextbeg;"); using (EmitBlock(writer, "if (pos > beginning && inputSpan[pos - 1] != '\\n')")) { @@ -763,6 +804,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe writer.WriteLine($"int end = start + {(node.Kind == RegexNodeKind.Multi ? node.Str!.Length : 1)};"); writer.WriteLine("base.Capture(0, start, end);"); writer.WriteLine("base.runtextpos = end;"); + writer.WriteLine("return true;"); return requiredHelpers; case RegexNodeKind.Empty: @@ -770,6 +812,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // source generator and seeing what happens as you add more to expressions. When approaching // it from a learning perspective, this is very common, as it's the empty string you start with. writer.WriteLine("base.Capture(0, base.runtextpos, base.runtextpos);"); + writer.WriteLine("return true;"); return requiredHelpers; } @@ -781,7 +824,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // Declare some locals. 
string sliceSpan = "slice"; - writer.WriteLine("global::System.ReadOnlySpan inputSpan = base.runtext;"); writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;"); writer.WriteLine($"int original_pos = pos;"); bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm); @@ -826,7 +868,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe } writer.WriteLine("base.runtextpos = pos;"); writer.WriteLine("base.Capture(0, original_pos, pos);"); - writer.WriteLine("return;"); + writer.WriteLine("return true;"); writer.WriteLine(); // We only get here in the code if the whole expression fails to match and jumps to @@ -837,6 +879,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe { EmitUncaptureUntil("0"); } + writer.WriteLine("return false;"); // We're done with the match. @@ -846,8 +889,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe // And emit any required helpers. if (additionalLocalFunctions.Count != 0) { - writer.WriteLine("return;"); // not strictly necessary, just for readability - foreach (KeyValuePair localFunctions in additionalLocalFunctions.OrderBy(k => k.Key)) { writer.WriteLine(); @@ -2148,7 +2189,7 @@ void EmitBoundary(RegexNode node) _ => "base.IsECMABoundary", }; - using (EmitBlock(writer, $"if ({call}(pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}, base.runtextbeg, end))")) + using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? 
$" + {sliceStaticPos}" : "")}, base.runtextbeg, end))")) { writer.WriteLine($"goto {doneLabel};"); } diff --git a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs index 251d595081072d..bae661abee0754 100644 --- a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs +++ b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs @@ -171,6 +171,10 @@ public static void CompileToAssembly(System.Text.RegularExpressions.RegexCompila public string GroupNameFromNumber(int i) { throw null; } public int GroupNumberFromName(string name) { throw null; } protected void InitializeReferences() { } + public bool IsMatch(System.ReadOnlySpan input) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options) { throw null; } + public static bool IsMatch(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options, System.TimeSpan matchTimeout) { throw null; } public bool IsMatch(string input) { throw null; } public bool IsMatch(string input, int startat) { throw null; } public static bool IsMatch(string input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; } @@ -330,17 +334,20 @@ protected void DoubleCrawl() { } protected void DoubleStack() { } 
protected void DoubleTrack() { } protected void EnsureStorage() { } - protected abstract bool FindFirstChar(); - protected abstract void Go(); - protected abstract void InitTrackCount(); + protected virtual bool FindFirstChar() { throw null; } + protected virtual void Go() { throw null; } + protected virtual void InitTrackCount() { throw null; } protected bool IsBoundary(int index, int startpos, int endpos) { throw null; } + protected bool IsBoundary(System.ReadOnlySpan inputSpan, int index, int startpos, int endpos) { throw null; } // -> This is just temporary on the prototype. Method will be emitted by the generator engines protected bool IsECMABoundary(int index, int startpos, int endpos) { throw null; } + protected bool IsECMABoundary(System.ReadOnlySpan inputSpan, int index, int startpos, int endpos) { throw null; } // -> This is just temporary on the prototype. Method will be emitted by the generator engines protected bool IsMatched(int cap) { throw null; } protected int MatchIndex(int cap) { throw null; } protected int MatchLength(int cap) { throw null; } protected int Popcrawl() { throw null; } protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) { throw null; } protected internal System.Text.RegularExpressions.Match? 
Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, System.TimeSpan timeout) { throw null; } + protected internal virtual void Scan(System.ReadOnlySpan text) { throw null; } protected void TransferCapture(int capnum, int uncapnum, int start, int end) { } protected void Uncapture() { } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs index 81683e09bec1ff..2eee81fadc66f5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs @@ -9,7 +9,7 @@ namespace System.Text.RegularExpressions /// public class Capture { - internal Capture(string text, int index, int length) + internal Capture(string? text, int index, int length) { Text = text; Index = index; @@ -19,27 +19,38 @@ internal Capture(string text, int index, int length) /// Returns the position in the original string where the first character of captured substring was found. public int Index { get; private protected set; } + /// + /// This method should only be called when the text for matching was sliced with a different beginning, so the resulting index of + /// the match is not from the start of the text, but instead the start of the slice. This method adds the slice's beginning offset back to the index + /// to account for the original text's beginning. + /// + /// The original text's beginning offset. + internal void AddBeginningToIndex(int beginning) + { + Index += beginning; + } + /// Returns the length of the captured substring. public int Length { get; private protected set; } /// The original string - internal string Text { get; set; } + internal string? Text { get; set; } /// Gets the captured substring from the input string. 
/// The substring that is captured by the match. - public string Value => Text.Substring(Index, Length); + public string Value => Text is string text ? text.Substring(Index, Length) : string.Empty; /// Gets the captured span from the input string. /// The span that is captured by the match. - public ReadOnlySpan ValueSpan => Text.AsSpan(Index, Length); + public ReadOnlySpan ValueSpan => Text is string text ? text.AsSpan(Index, Length) : ReadOnlySpan.Empty; /// Returns the substring that was matched. public override string ToString() => Value; /// The substring to the left of the capture - internal ReadOnlyMemory GetLeftSubstring() => Text.AsMemory(0, Index); + internal ReadOnlyMemory GetLeftSubstring() => Text is string text ? text.AsMemory(0, Index) : ReadOnlyMemory.Empty; /// The substring to the right of the capture - internal ReadOnlyMemory GetRightSubstring() => Text.AsMemory(Index + Length, Text.Length - Index - Length); + internal ReadOnlyMemory GetRightSubstring() => Text is string text ? 
text.AsMemory(Index + Length, Text.Length - Index - Length) : ReadOnlyMemory.Empty; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs index c9e6419fed87cf..4de55562816575 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs @@ -5,20 +5,17 @@ namespace System.Text.RegularExpressions { internal sealed class CompiledRegexRunner : RegexRunner { - private readonly Action _goMethod; - private readonly Func _findFirstCharMethod; + private readonly ScanDelegate _scanMethod; - public CompiledRegexRunner(Action go, Func findFirstChar, int trackCount) + internal delegate void ScanDelegate(RegexRunner runner, ReadOnlySpan text); + + public CompiledRegexRunner(ScanDelegate scan, int trackCount) { - _goMethod = go; - _findFirstCharMethod = findFirstChar; + _scanMethod = scan; runtrackcount = trackCount; } - protected override void Go() => _goMethod(this); - - protected override bool FindFirstChar() => _findFirstCharMethod(this); - - protected override void InitTrackCount() { } + protected internal override void Scan(ReadOnlySpan text) + => _scanMethod(this, text); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs index 958d5cf3dc16f1..6575a3856e97d9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs @@ -7,25 +7,21 @@ namespace System.Text.RegularExpressions { internal 
sealed class CompiledRegexRunnerFactory : RegexRunnerFactory { - private readonly DynamicMethod _goMethod; - private readonly DynamicMethod _findFirstCharMethod; + private readonly DynamicMethod _scanMethod; private readonly int _trackcount; // Delegates are lazily created to avoid forcing JIT'ing until the regex is actually executed. - private Action? _go; - private Func? _findFirstChar; + private CompiledRegexRunner.ScanDelegate? _scan; - public CompiledRegexRunnerFactory(DynamicMethod goMethod, DynamicMethod findFirstCharMethod, int trackcount) + public CompiledRegexRunnerFactory(DynamicMethod scanMethod, int trackcount) { - _goMethod = goMethod; - _findFirstCharMethod = findFirstCharMethod; + _scanMethod = scanMethod; _trackcount = trackcount; } protected internal override RegexRunner CreateInstance() => new CompiledRegexRunner( - _go ??= _goMethod.CreateDelegate>(), - _findFirstChar ??= _findFirstCharMethod.CreateDelegate>(), + _scan ??= _scanMethod.CreateDelegate(), _trackcount); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs index f4b2a7fb2e9804..2c34694f1ecaf5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Group.cs @@ -16,7 +16,7 @@ public class Group : Capture internal int _capcount; internal CaptureCollection? _capcoll; - internal Group(string text, int[] caps, int capcount, string name) + internal Group(string? text, int[] caps, int capcount, string name) : base(text, capcount == 0 ? 0 : caps[(capcount - 1) * 2], capcount == 0 ? 
0 : caps[(capcount * 2) - 1]) { _caps = caps; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs index 3c67526b40e184..19859fd2f0b2d1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs @@ -50,7 +50,7 @@ public class Match : Group internal bool _balancing; // whether we've done any balancing with this match. If we // have done balancing, we'll need to do extra work in Tidy(). - internal Match(Regex? regex, int capcount, string text, int begpos, int len, int startpos) : + internal Match(Regex? regex, int capcount, string? text, int begpos, int len, int startpos) : base(text, new int[2], 0, "0") { _regex = regex; @@ -66,7 +66,7 @@ internal Match(Regex? regex, int capcount, string text, int begpos, int len, int /// Returns an empty Match object. public static Match Empty { get; } = new Match(null, 1, string.Empty, 0, 0, 0); - internal void Reset(Regex regex, string text, int textbeg, int textend, int textstart) + internal void Reset(Regex regex, string? text, int textbeg, int textend, int textstart) { _regex = regex; Text = text; @@ -94,6 +94,7 @@ internal void Reset(Regex regex, string text, int textbeg, int textend, int text public Match NextMatch() { Regex? r = _regex; + Debug.Assert(Text != null); return r != null ? r.Run(false, Length, Text, _textbeg, _textend - _textbeg, _textpos)! : this; @@ -338,7 +339,7 @@ internal sealed class MatchSparse : Match { private new readonly Hashtable _caps; - internal MatchSparse(Regex regex, Hashtable caps, int capcount, string text, int begpos, int len, int startpos) : + internal MatchSparse(Regex regex, Hashtable caps, int capcount, string? 
text, int begpos, int len, int startpos) : base(regex, capcount, text, begpos, len, startpos) { _caps = caps; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs index cd0494c3dc0092..f2a058a4d70924 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs @@ -13,6 +13,15 @@ public partial class Regex public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern) => RegexCache.GetOrAdd(pattern).IsMatch(input); + /// + /// Searches the input span for one or more occurrences of the text supplied in the given pattern. + /// + /// The input span to be searched on. + /// The Regex pattern to be used for matching. + /// if the input matches the pattern, otherwise. + public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern) => + RegexCache.GetOrAdd(pattern).IsMatch(input); + /// /// Searches the input string for one or more occurrences of the text /// supplied in the pattern parameter with matching options supplied in the options @@ -21,9 +30,30 @@ public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Reg public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options) => RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout).IsMatch(input); + /// + /// Searches the input span for one or more occurrences of the text supplied in the given pattern. It uses the passed in options. + /// + /// The input span to be searched on. + /// The Regex pattern to be used for matching. + /// The options to be used for matching + /// if the input matches the pattern, otherwise. 
+ public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options) => + RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout).IsMatch(input); + public static bool IsMatch(string input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options, TimeSpan matchTimeout) => RegexCache.GetOrAdd(pattern, options, matchTimeout).IsMatch(input); + /// + /// Searches the input span for one or more occurrences of the text supplied in the given pattern under the specified timeout. It uses the passed in options. + /// + /// The input span to be searched on. + /// The Regex pattern to be used for matching. + /// The options to be used for matching + /// Max time to be used for matching before returning. + /// if the input matches the pattern, otherwise. Also returns for time out. + public static bool IsMatch(ReadOnlySpan input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options, TimeSpan matchTimeout) => + RegexCache.GetOrAdd(pattern, options, matchTimeout).IsMatch(input); + /// /// Searches the input string for one or more matches using the previous pattern, /// options, and starting position. @@ -38,6 +68,16 @@ public bool IsMatch(string input) return Run(quick: true, -1, input, 0, input.Length, UseOptionR() ? input.Length : 0) is null; } + /// + /// Searches the input span for one or more matches using the previous pattern, + /// options, and starting position. + /// + /// if the input matches the pattern, otherwise. + public bool IsMatch(ReadOnlySpan input) + { + return Run(quick: true, -1, input, 0, input.Length, UseOptionR() ? input.Length : 0) is null; + } + /// /// Searches the input string for one or more matches using the previous pattern and options, /// with a new starting position. 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index d07f3440de4b8a..97556b667028da 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -383,7 +383,96 @@ protected void InitializeReferences() RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { - return runner.Scan(this, input, beginning, beginning + length, startat, prevlen, quick, internalMatchTimeout); + bool skipScan = false; + runner.InitializeTimeout(internalMatchTimeout); + ReadOnlySpan span = input.AsSpan(beginning, length); + runner.InitializeForScan(this, span, 0, span.Length, startat - beginning, quick); + runner.InitializeForGo(); + + int stoppos = RightToLeft ? 0 : span.Length; + + // If previous match was empty or failed, advance by one before matching. + if (prevlen == 0) + { + if (runner.runtextstart == stoppos) + { + skipScan = true; + runner.runmatch = System.Text.RegularExpressions.Match.Empty; + } + + runner.runtextpos += RightToLeft ? -1 : 1; + } + + if (!skipScan) + { + runner.Scan(span); + + // if we got a match, set runmatch to null if quick is true + if (runner.runmatch!._matchcount[0] > 0) + { + if (quick) + { + runner.runmatch = null; + } + } + else + { + if (runner.runtextpos == stoppos) + { + runner.runmatch = System.Text.RegularExpressions.Match.Empty; + } + } + } + + Match? m = runner.runmatch; + runner.runmatch = null; // Reset runmatch + if (m is not null) + { + if (!quick) + { + if (m._matchcount[0] > 0) + m.Tidy(runner.runtextpos); + if (m.Text != input) + m.Text = input; + } + + // If there was a match and the original text was sliced, then add beginning to the index to get the real + // Index of the match. 
+ if (m.Success && beginning != 0) + { + m.AddBeginningToIndex(beginning); + } + } + return m; + } + finally + { + _runner = runner; + } + } + + internal Match? Run(bool quick, int prevlen, ReadOnlySpan input, int beginning, int length, int startat) + { + if ((uint)startat > (uint)input.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.startat, ExceptionResource.BeginIndexNotNegative); + } + if ((uint)length > (uint)input.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.length, ExceptionResource.LengthNotNegative); + } + + RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); + try + { + runner.InitializeTimeout(internalMatchTimeout); + ReadOnlySpan span = input.Slice(beginning, length); + runner.InitializeForScan(this, span, 0, span.Length, startat - beginning, quick); + runner.InitializeForGo(); + runner.Scan(span); + Match? m = runner.runmatch; + runner.runmatch = null; // Reset runmatch + return m; } finally { @@ -398,7 +487,93 @@ internal void Run(string input, int startat, ref TState state, MatchCall RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner(); try { - runner.ScanInternal(this, input, startat, ref state, callback, reuseMatchObject, internalMatchTimeout); + runner.InitializeTimeout(internalMatchTimeout); + int runtextpos = startat; + while (true) + { + runner.InitializeForScan(this, input, 0, input.Length, startat, false); + runner.runtextpos = runtextpos; + runner.InitializeForGo(); + + int stoppos = RightToLeft ? 0 : input.Length; + runner.Scan(input); + Match? m = runner.runmatch; + + // if we got a match, set runmatch to null if quick is true + if (m is not null && m._matchcount[0] > 0) + { + if (m.Text != input) + { + m.Text = input; + } + + if (!reuseMatchObject) + { + // We're not reusing match objects, so null out our field reference to the instance. + // It'll be recreated the next time one is needed. 
+ runner.runmatch = null; + } + m.Tidy(runner.runtextpos); + if (!callback(ref state, m)) + { + // If the callback returns false, we're done. + // Drop reference to text to avoid keeping it alive in a cache. + runner.runtext = null!; + if (reuseMatchObject) + { + // We're reusing the single match instance, so clear out its text as well. + // We don't do this if we're not reusing instances, as in that case we're + // dropping the whole reference to the match, and we no longer own the instance + // having handed it out to the callback. + m.Text = null!; + } + return; + } + + // Now that we've matched successfully, update the starting position to reflect + // the current position, just as Match.NextMatch() would pass in _textpos as textstart. + runtextpos = startat = runner.runtextpos; + + + // Reset state for another iteration. + runner.runtrackpos = runner.runtrack!.Length; + runner.runstackpos = runner.runstack!.Length; + runner.runcrawlpos = runner.runcrawl!.Length; + + if (m.Length == 0) + { + if (runner.runtextpos == stoppos) + { + // Drop reference to text to avoid keeping it alive in a cache. + runner.runtext = null!; + if (reuseMatchObject) + { + // See above comment. + m.Text = null!; + } + return; + } + + runtextpos += RightToLeft ? -1 : 1; + } + + // Loop around to perform next match from where we left off. + continue; + } + else + { + // We failed to match at this position. If we're at the stopping point, we're done. 
+ if (runner.runtextpos == stoppos) + { + runner.runtext = null; // drop reference to text to avoid keeping it alive in a cache + if (runner.runmatch != null) + { + runner.runmatch.Text = null!; + } + return; + } + } + } } finally { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 96ca0512634625..4e83fcddda574a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.Reflection; using System.Reflection.Emit; @@ -20,8 +21,16 @@ internal abstract class RegexCompiler private static readonly FieldInfo s_runtextendField = RegexRunnerField("runtextend"); private static readonly FieldInfo s_runtextstartField = RegexRunnerField("runtextstart"); private static readonly FieldInfo s_runtextposField = RegexRunnerField("runtextpos"); - private static readonly FieldInfo s_runtextField = RegexRunnerField("runtext"); private static readonly FieldInfo s_runstackField = RegexRunnerField("runstack"); + private static readonly FieldInfo s_runmatchField = RegexRunnerField("runmatch"); + private static readonly FieldInfo s_runtrackField = RegexRunnerField("runtrack"); + private static readonly FieldInfo s_runtrackposField = RegexRunnerField("runtrackpos"); + private static readonly FieldInfo s_runstackposField = RegexRunnerField("runstackpos"); + private static readonly FieldInfo s_runcrawlField = RegexRunnerField("runcrawl"); + private static readonly FieldInfo s_runcrawlposField = RegexRunnerField("runcrawlpos"); + private static readonly FieldInfo s_runregexField = RegexRunnerField("runregex"); + + private static 
readonly FieldInfo s_matchMatchCountField = typeof(Match).GetField("_matchcount", BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance | BindingFlags.Static)!; private static readonly MethodInfo s_captureMethod = RegexRunnerMethod("Capture"); private static readonly MethodInfo s_transferCaptureMethod = RegexRunnerMethod("TransferCapture"); @@ -29,9 +38,9 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_isMatchedMethod = RegexRunnerMethod("IsMatched"); private static readonly MethodInfo s_matchLengthMethod = RegexRunnerMethod("MatchLength"); private static readonly MethodInfo s_matchIndexMethod = RegexRunnerMethod("MatchIndex"); - private static readonly MethodInfo s_isBoundaryMethod = RegexRunnerMethod("IsBoundary"); + private static readonly MethodInfo s_isBoundaryMethod = typeof(RegexRunner).GetMethod("IsBoundary", BindingFlags.NonPublic | BindingFlags.Instance, new[] { typeof(ReadOnlySpan), typeof(int), typeof(int), typeof(int) })!; private static readonly MethodInfo s_isWordCharMethod = RegexRunnerMethod("IsWordChar"); - private static readonly MethodInfo s_isECMABoundaryMethod = RegexRunnerMethod("IsECMABoundary"); + private static readonly MethodInfo s_isECMABoundaryMethod = typeof(RegexRunner).GetMethod("IsECMABoundary", BindingFlags.NonPublic | BindingFlags.Instance, new[] { typeof(ReadOnlySpan), typeof(int), typeof(int), typeof(int) })!; private static readonly MethodInfo s_crawlposMethod = RegexRunnerMethod("Crawlpos"); private static readonly MethodInfo s_charInClassMethod = RegexRunnerMethod("CharInClass"); private static readonly MethodInfo s_checkTimeoutMethod = RegexRunnerMethod("CheckTimeout"); @@ -42,6 +51,7 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!; private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = 
typeof(CultureInfo).GetMethod("get_CurrentCulture")!; private static readonly MethodInfo s_cultureInfoGetTextInfoMethod = typeof(CultureInfo).GetMethod("get_TextInfo")!; + private static readonly MethodInfo s_regexGetRightToLeft = typeof(Regex).GetMethod("get_RightToLeft")!; private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan).GetMethod("get_Item", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan).GetMethod("get_Length")!; private static readonly MethodInfo s_memoryMarshalGetReference = typeof(MemoryMarshal).GetMethod("GetReference", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); @@ -180,6 +190,9 @@ internal abstract class RegexCompiler /// A macro for _ilg.Emit(OpCodes.Ldarg_0). protected void Ldthis() => _ilg!.Emit(OpCodes.Ldarg_0); + /// A macro for _ilg.Emit(OpCodes.Ldarg_1). + private void Ldarg_1() => _ilg!.Emit(OpCodes.Ldarg_1); + /// A macro for Ldthis(); Ldfld(); protected void Ldthisfld(FieldInfo ft) { @@ -271,6 +284,9 @@ private void Mvfldloc(FieldInfo ft, LocalBuilder lt) private void Switch(Label[] table) => _ilg!.Emit(OpCodes.Switch, table); + /// Declares a local bool. + private LocalBuilder DeclareBool() => _ilg!.DeclareLocal(typeof(bool)); + /// Declares a local int. private LocalBuilder DeclareInt32() => _ilg!.DeclareLocal(typeof(int)); @@ -388,11 +404,10 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or // Load necessary locals // int pos = base.runtextpos; // int end = base.runtextend; - // ReadOnlySpan inputSpan = base.runtext.AsSpan(); + // ReadOnlySpan inputSpan = input; Mvfldloc(s_runtextposField, pos); Mvfldloc(s_runtextendField, end); - Ldthisfld(s_runtextField); - Call(s_stringAsSpanMethod); + Ldarg_1(); Stloc(inputSpan); // Generate length check. If the input isn't long enough to possibly match, fail quickly. 
@@ -1098,10 +1113,9 @@ protected void EmitGo() // CultureInfo culture = CultureInfo.CurrentCulture; // only if the whole expression or any subportion is ignoring case, and we're not using invariant InitializeCultureForGoIfNecessary(); - // ReadOnlySpan inputSpan = base.runtext.AsSpan(); + // ReadOnlySpan inputSpan = input; // int end = base.runtextend; - Ldthisfld(s_runtextField); - Call(s_stringAsSpanMethod); + Ldarg_1(); Stloc(inputSpan); Mvfldloc(s_runtextendField, end); @@ -2298,8 +2312,9 @@ void EmitBoundary(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.Boundary or RegexNodeKind.NonBoundary or RegexNodeKind.ECMABoundary or RegexNodeKind.NonECMABoundary, $"Unexpected type: {node.Kind}"); - // if (!IsBoundary(pos + sliceStaticPos, base.runtextbeg, end)) goto doneLabel; + // if (!IsBoundary(inputSpan, pos + sliceStaticPos, base.runtextbeg, end)) goto doneLabel; Ldthis(); + Ldloc(inputSpan); Ldloc(pos); if (sliceStaticPos > 0) { @@ -3925,6 +3940,121 @@ void EmitStackPop() } } + protected void EmitScan(DynamicMethod findFirstCharMethod, DynamicMethod goMethod) + { + LocalBuilder bump = DeclareInt32(); + LocalBuilder stoppos = DeclareInt32(); + Label returnLabel = DefineLabel(); + + Label notRightToLeft = DefineLabel(); + // int bump = 1 + Ldc(1); + Stloc(bump); + + // int stoppos = text.Length + _ilg!.Emit(OpCodes.Ldarga_S, 1); + Call(s_spanGetLengthMethod); + Stloc(stoppos); + + // if (runregex.RightToLeft) + // { + Ldthisfld(s_runregexField); + Callvirt(s_regexGetRightToLeft); + BrfalseFar(notRightToLeft); + + // bump = -1; + // stoppos = 0; + // } + Ldc(-1); + Stloc(bump); + Ldc(0); + Stloc(stoppos); + MarkLabel(notRightToLeft); + + // while (true) + Label whileLoopEnd = DefineLabel(); + Label whileLoopBody = DefineLabel(); + MarkLabel(whileLoopBody); + + // if (FindFirstChar(text)) + Label afterFindFirstCharLabel = DefineLabel(); + Ldthis(); + Ldarg_1(); + Call(findFirstCharMethod); + BrfalseFar(afterFindFirstCharLabel); + + // CheckTimeout(); + 
Ldthis(); + Call(s_checkTimeoutMethod); + + // Go(text); + Ldthis(); + Ldarg_1(); + Call(goMethod); + + // if (runmatch!._matchcount[0] > 0) + // return; + Label afterSuccessMatchLabel = DefineLabel(); + Ldthisfld(s_runmatchField); + _ilg!.Emit(OpCodes.Ldfld, s_matchMatchCountField); + Ldc(0); + LdelemI4(); + Ldc(0); + _ilg!.Emit(OpCodes.Cgt); + BrfalseFar(afterSuccessMatchLabel); + BrFar(returnLabel); + + // runtrackpos = runtrack!.Length; + MarkLabel(afterSuccessMatchLabel); + Ldthis(); + Ldthisfld(s_runtrackField); + Ldlen(); + _ilg!.Emit(OpCodes.Conv_I4); + Stfld(s_runtrackposField); + + // runstackpos = runstack!.Length; + Ldthis(); + Ldthisfld(s_runstackField); + Ldlen(); + _ilg!.Emit(OpCodes.Conv_I4); + Stfld(s_runstackposField); + + // runcrawlpos = runcrawl!.Length; + Ldthis(); + Ldthisfld(s_runcrawlField); + Ldlen(); + _ilg!.Emit(OpCodes.Conv_I4); + Stfld(s_runcrawlposField); + + // if (runtextpos == stoppos) + Label incrementRuntextPosLabel = DefineLabel(); + MarkLabel(afterFindFirstCharLabel); + Ldthisfld(s_runtextposField); + Ldloc(stoppos); + Ceq(); + BrfalseFar(incrementRuntextPosLabel); + + // return; + BrFar(returnLabel); + + // runtextpos += bump + MarkLabel(incrementRuntextPosLabel); + Ldthis(); + Ldthisfld(s_runtextposField); + Ldloc(bump); + Add(); + Stfld(s_runtextposField); + + // End loop body.
+ BrFar(whileLoopBody); + MarkLabel(whileLoopEnd); + + // return; + MarkLabel(returnLabel); + _ilg!.Emit(OpCodes.Nop); + Ret(); + } + private void InitializeCultureForGoIfNecessary() { _textInfo = null; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 9f615754a808fd..280c94736d46c4 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -324,15 +324,57 @@ private bool MatchRef(int index, int length, ReadOnlySpan inputSpan) private void Backwardnext() => runtextpos += _rightToLeft ? 1 : -1; - protected override bool FindFirstChar() => - _code.FindOptimizations.TryFindNextStartingPosition(runtext!, ref runtextpos, runtextbeg, runtextstart, runtextend); + protected internal override void Scan(ReadOnlySpan text) + { + // Configure the additional value to "bump" the position along each time we loop around + // to call FindFirstChar again, as well as the stopping position for the loop. We generally + // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump + // by -1 and stop at textbeg. + int bump = 1, stoppos = text.Length; + Debug.Assert(runregex != null); + if (runregex.RightToLeft) + { + bump = -1; + stoppos = 0; + } + + while (true) + { + if (FindFirstChar(text)) + { + CheckTimeout(); + + Go(text); + + // If we got a match, we're done. + if (runmatch!._matchcount[0] > 0) + { + return; + } + + // Reset state for another iteration. 
+ runtrackpos = runtrack!.Length; + runstackpos = runstack!.Length; + runcrawlpos = runcrawl!.Length; + } + + if (runtextpos == stoppos) + { + return; + } + + runtextpos += bump; + } + } + + private bool FindFirstChar(ReadOnlySpan inputSpan) => + _code.FindOptimizations.TryFindNextStartingPosition(inputSpan, ref runtextpos, runtextbeg, runtextstart, runtextend); - protected override void Go() + private void Go(ReadOnlySpan inputSpan) { SetOperator((RegexOpcode)_code.Codes[0]); _codepos = 0; int advance = -1; - ReadOnlySpan inputSpan = runtext; while (true) { @@ -711,7 +753,7 @@ protected override void Go() continue; case RegexOpcode.Boundary: - if (!IsBoundary(runtextpos, runtextbeg, runtextend)) + if (!IsBoundary(inputSpan, runtextpos, runtextbeg, runtextend)) { break; } @@ -719,7 +761,7 @@ protected override void Go() continue; case RegexOpcode.NonBoundary: - if (IsBoundary(runtextpos, runtextbeg, runtextend)) + if (IsBoundary(inputSpan, runtextpos, runtextbeg, runtextend)) { break; } @@ -727,7 +769,7 @@ protected override void Go() continue; case RegexOpcode.ECMABoundary: - if (!IsECMABoundary(runtextpos, runtextbeg, runtextend)) + if (!IsECMABoundary(inputSpan, runtextpos, runtextbeg, runtextend)) { break; } @@ -735,7 +777,7 @@ protected override void Go() continue; case RegexOpcode.NonECMABoundary: - if (IsECMABoundary(runtextpos, runtextbeg, runtextend)) + if (IsECMABoundary(inputSpan, runtextpos, runtextbeg, runtextend)) { break; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index 34b7f1b1130592..22befb48e48505 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -24,7 +24,7 @@ internal sealed class RegexLWCGCompiler : 
RegexCompiler private static readonly bool s_includePatternInName = Environment.GetEnvironmentVariable(IncludePatternInNamesEnvVar) == "1"; /// Parameter types for the generated Go and FindFirstChar methods. - private static readonly Type[] s_paramTypes = new Type[] { typeof(RegexRunner) }; + private static readonly Type[] s_paramTypes = new Type[] { typeof(RegexRunner), typeof(ReadOnlySpan) }; /// Id number to use for the next compiled regex. private static int s_regexCount; @@ -52,17 +52,20 @@ internal sealed class RegexLWCGCompiler : RegexCompiler description = string.Concat("_", pattern.Length > DescriptionLimit ? pattern.AsSpan(0, DescriptionLimit) : pattern); } - DynamicMethod findFirstCharMethod = DefineDynamicMethod($"Regex{regexNum}_FindFirstChar{description}", typeof(bool), typeof(CompiledRegexRunner)); + DynamicMethod findFirstCharMethod = DefineDynamicMethod($"Regex{regexNum}_FindFirstChar{description}", typeof(bool), typeof(CompiledRegexRunner), s_paramTypes); EmitFindFirstChar(); - DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", null, typeof(CompiledRegexRunner)); + DynamicMethod goMethod = DefineDynamicMethod($"Regex{regexNum}_Go{description}", null, typeof(CompiledRegexRunner), s_paramTypes); EmitGo(); - return new CompiledRegexRunnerFactory(goMethod, findFirstCharMethod, code.TrackCount); + DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(ReadOnlySpan) }); + EmitScan(findFirstCharMethod, goMethod); + + return new CompiledRegexRunnerFactory(scanMethod, code.TrackCount); } /// Begins the definition of a new method (no args) with a specified return value. - private DynamicMethod DefineDynamicMethod(string methname, Type? returntype, Type hostType) + private DynamicMethod DefineDynamicMethod(string methname, Type? 
returntype, Type hostType, Type[] paramTypes) { // We're claiming that these are static methods, but really they are instance methods. // By giving them a parameter which represents "this", we're tricking them into @@ -71,7 +74,7 @@ private DynamicMethod DefineDynamicMethod(string methname, Type? returntype, Typ const MethodAttributes Attribs = MethodAttributes.Public | MethodAttributes.Static; const CallingConventions Conventions = CallingConventions.Standard; - var dm = new DynamicMethod(methname, Attribs, Conventions, returntype, s_paramTypes, hostType, skipVisibility: false); + var dm = new DynamicMethod(methname, Attribs, Conventions, returntype, paramTypes, hostType, skipVisibility: false); _ilg = dm.GetILGenerator(); return dm; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index 13a9fbf155bb12..ac5fdd16e7a07d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -88,25 +88,22 @@ protected RegexRunner() { } protected Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) => Scan(regex, text, textbeg, textend, textstart, prevlen, quick, regex.MatchTimeout); - protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) + protected internal virtual void Scan(ReadOnlySpan text) { - this.quick = quick; - - // Handle timeout argument - _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds - bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout; - if (!ignoreTimeout) + string? s = runtext; + if (text != s) { - // We are using Environment.TickCount and not Stopwatch for performance reasons. 
- // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt - // overflow it will still stay ahead of Environment.TickCount for comparisons made - // in DoCheckTimeout(). - Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected - _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; - _timeoutOccursAt = Environment.TickCount + _timeout; - _timeoutChecksToSkip = TimeoutCheckFrequency; + s = text.ToString(); + runtext = s; } + Debug.Assert(runregex != null); + Match? match = Scan(runregex, s, 0, text.Length, runtextstart, -1, quick, runregex.internalMatchTimeout); + runmatch = match; + } + + protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) + { // Configure the additional value to "bump" the position along each time we loop around // to call FindFirstChar again, as well as the stopping position for the loop. We generally // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump @@ -118,31 +115,6 @@ protected RegexRunner() { } stoppos = textbeg; } - // Store runtextpos into field, as we may bump it in next check. The remaining arguments - // are stored below once we're past the potential return in the next check. - runtextpos = textstart; - - // If previous match was empty or failed, advance by one before matching. - if (prevlen == 0) - { - if (textstart == stoppos) - { - return Match.Empty; - } - - runtextpos += bump; - } - - // Store remaining arguments into fields now that we're going to start the scan. - // These are referenced by the derived runner. - runregex = regex; - runtext = text; - runtextstart = textstart; - runtextbeg = textbeg; - runtextend = textend; - - // Main loop: FindFirstChar/Go + bump until the ending position. - bool initialized = false; while (true) { // Find the next potential location for a match in the input. 
@@ -151,18 +123,7 @@ protected RegexRunner() { } #endif if (FindFirstChar()) { - if (!ignoreTimeout) - { - DoCheckTimeout(); - } - - // Ensure that the runner is initialized. This includes initializing all of the state in the runner - // that Go might use, such as the backtracking stack, as well as a Match object for it to populate. - if (!initialized) - { - InitializeForGo(); - initialized = true; - } + CheckTimeout(); // See if there's a match at this position. #if DEBUG @@ -207,6 +168,36 @@ protected RegexRunner() { } } } + internal void InitializeForScan(Regex regex, ReadOnlySpan text, int textbeg, int textend, int textstart, bool quick) + { + this.quick = quick; + // Store remaining arguments into fields now that we're going to start the scan. + // These are referenced by the derived runner. + runregex = regex; + runtextstart = textstart; + runtextbeg = textbeg; + runtextend = textend; + runtextpos = textstart; + } + + internal void InitializeTimeout(TimeSpan timeout) + { + // Handle timeout argument + _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds + bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout; + if (!ignoreTimeout) + { + // We are using Environment.TickCount and not Stopwatch for performance reasons. + // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt + // overflow it will still stay ahead of Environment.TickCount for comparisons made + // in DoCheckTimeout(). + Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected + _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round; + _timeoutOccursAt = Environment.TickCount + _timeout; + _timeoutChecksToSkip = TimeoutCheckFrequency; + } + } + /// Enumerates all of the matches with the specified regex, invoking the callback for each. /// /// This optionally repeatedly hands out the same Match instance, updated with new information. 
@@ -250,7 +241,6 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref runtextbeg = 0; // Main loop: FindFirstChar/Go + bump until the ending position. - bool initialized = false; while (true) { // Find the next potential location for a match in the input. @@ -264,14 +254,6 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref DoCheckTimeout(); } - // Ensure that the runner is initialized. This includes initializing all of the state in the runner - // that Go might use, such as the backtracking stack, as well as a Match object for it to populate. - if (!initialized) - { - InitializeForGo(); - initialized = true; - } - #if DEBUG Debug.WriteLineIf(Regex.EnableDebugTracing, $"Calling Go at {nameof(runtextpos)}={runtextpos}"); #endif @@ -291,7 +273,6 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref runmatch = null; } match.Tidy(runtextpos); - initialized = false; if (!callback(ref state, match)) { // If the callback returns false, we're done. @@ -359,7 +340,7 @@ internal void ScanInternal(Regex regex, string text, int textstart, ref } } - protected void CheckTimeout() + protected internal void CheckTimeout() { if (_ignoreTimeout) return; @@ -385,7 +366,9 @@ private void DoCheckTimeout() if (0 > _timeoutOccursAt && 0 < currentMillis) return; - throw new RegexMatchTimeoutException(runtext!, runregex!.pattern!, TimeSpan.FromMilliseconds(_timeout)); + string input = runtext ?? string.Empty; + + throw new RegexMatchTimeoutException(input, runregex!.pattern!, TimeSpan.FromMilliseconds(_timeout)); } /// @@ -394,37 +377,37 @@ private void DoCheckTimeout() /// then to leave runtextpos at the ending position. It should leave /// runtextpos where it started if there was no match. 
/// - protected abstract void Go(); + protected virtual void Go() => throw new NotImplementedException(); /// /// The responsibility of FindFirstChar() is to advance runtextpos /// until it is at the next position which is a candidate for the /// beginning of a successful match. /// - protected abstract bool FindFirstChar(); + protected virtual bool FindFirstChar() => throw new NotImplementedException(); /// /// InitTrackCount must initialize the runtrackcount field; this is /// used to know how large the initial runtrack and runstack arrays /// must be. /// - protected abstract void InitTrackCount(); + protected virtual void InitTrackCount() => runtrackcount = 0; /// /// Initializes all the data members that are used by Go() /// - private void InitializeForGo() + internal void InitializeForGo() { if (runmatch is null) { // Use a hashtabled Match object if the capture numbers are sparse runmatch = runregex!.caps is null ? - new Match(runregex, runregex.capsize, runtext!, runtextbeg, runtextend - runtextbeg, runtextstart) : - new MatchSparse(runregex, runregex.caps, runregex.capsize, runtext!, runtextbeg, runtextend - runtextbeg, runtextstart); + new Match(runregex, runregex.capsize, runtext ?? 
string.Empty, runtextbeg, runtextend - runtextbeg, runtextstart) : + new MatchSparse(runregex, runregex.caps, runregex.capsize, runtext, runtextbeg, runtextend - runtextbeg, runtextstart); } else { - runmatch.Reset(runregex!, runtext!, runtextbeg, runtextend, runtextstart); + runmatch.Reset(runregex!, runtext, runtextbeg, runtextend, runtextstart); } // Note we test runcrawl, because it is the last one to be allocated @@ -491,6 +474,12 @@ protected bool IsBoundary(int index, int startpos, int endpos) (index < endpos && RegexCharClass.IsBoundaryWordChar(runtext![index])); } + protected bool IsBoundary(ReadOnlySpan inputSpan, int index, int startpos, int endpos) + { + return (index > startpos && RegexCharClass.IsBoundaryWordChar(inputSpan[index - 1])) != + (index < endpos && RegexCharClass.IsBoundaryWordChar(inputSpan[index])); + } + /// Called to determine a char's inclusion in the \w set. internal static bool IsWordChar(char ch) => RegexCharClass.IsWordChar(ch); @@ -500,6 +489,12 @@ protected bool IsECMABoundary(int index, int startpos, int endpos) (index < endpos && RegexCharClass.IsECMAWordChar(runtext![index])); } + protected bool IsECMABoundary(ReadOnlySpan inputSpan, int index, int startpos, int endpos) + { + return (index > startpos && RegexCharClass.IsECMAWordChar(inputSpan[index - 1])) != + (index < endpos && RegexCharClass.IsECMAWordChar(inputSpan[index])); + } + protected static bool CharInSet(char ch, string set, string category) { string charClass = RegexCharClass.ConvertOldStringsToClass(set, category); @@ -699,7 +694,10 @@ string DescribeTextPosition() if (runtextpos > runtextbeg) { - sb.Append(RegexCharClass.DescribeChar(runtext![runtextpos - 1])); + if (runtext != null) + { + sb.Append(RegexCharClass.DescribeChar(runtext[runtextpos - 1])); + } } else { @@ -710,7 +708,10 @@ string DescribeTextPosition() for (int i = runtextpos; i < runtextend; i++) { - sb.Append(RegexCharClass.DescribeChar(runtext![i])); + if (runtext != null) + { + 
sb.Append(RegexCharClass.DescribeChar(runtext[i])); + } } if (sb.Length >= 64) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index 097a67ccd00326..4146fbcfbeb889 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Diagnostics; using System.Globalization; using System.Text.RegularExpressions.Symbolic.Unicode; @@ -83,14 +84,13 @@ private sealed class Runner : RegexRunner where TSetType : notnull internal Runner(SymbolicRegexMatcher matcher) => _matcher = matcher; - protected override void InitTrackCount() { } // nop, no backtracking - - protected override bool FindFirstChar() => true; // The logic is all in Go. - - protected override void Go() + protected internal override void Scan(ReadOnlySpan text) { - ReadOnlySpan inputSpan = runtext; + Go(text); + } + private void Go(ReadOnlySpan inputSpan) + { // Perform the match. SymbolicMatch pos = _matcher.FindMatch(quick, inputSpan, runtextpos, runtextend); if (pos.Success)