From 202edb53115504840a3f8b163e5258541510f0c2 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 17 Sep 2020 16:43:47 -0400 Subject: [PATCH 1/2] Remove implicit anchoring optimization from Regex In .NET 5 we added a bunch of optimizations to Regex. One of them was a transform that optimized for the case where the pattern begins with `.*`. If it does, then we insert an implicit anchor at the beginning in order to avoid unnecessary backtracking. Imagine the pattern `.*a` and the pattern `bcdefghijklmnopqrstuvwxyz`. This is going to start matching at `b`, find the next newline, and then backtrack from there looking for the `a`; it won't find it and will backtrack all the way, failing the match at that position. At that point it'll bump to the next position, starting at `c`, and do it all over. It'll fail, backtrack all the way, and bump again, starting at `d`, and doing it all over. Etc. The optimization recognizes that since `.` will match anything other than newline, after it fails to match at the first position, we can just skip all subsequent positions until the next newline, as they're all going to fail. However, the optimization failed to take into account that someone can explicitly start a match in the middle of the provided text. In that case, the implicitly added anchor will fail the match in the actual "Go" matching logic. There are safe ways to do this optimization, e.g. introducing a variant of these anchors that let FindFirstChar skip ahead but that aren't considered for Go's matching purposes, but we can look at employing those for .NET 6. For now for .NET 5, this commit just deletes the faulty optimization and adds a few tests that were failing it. --- .../Text/RegularExpressions/RegexNode.cs | 52 ------------------- .../tests/Regex.Match.Tests.cs | 45 ++++++++++++++-- .../tests/Regex.MultipleMatches.Tests.cs | 10 ++++ 3 files changed, 50 insertions(+), 57 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 8286589654cce7..bd55727af62a97 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -349,58 +349,6 @@ internal RegexNode FinalOptimize() break; } } - - // Optimization: implicit anchoring. - // If the expression begins with a .* loop, add an anchor to the beginning: - // - If Singleline is set such that '.' eats anything, the .* will zip to the end of the string and then backtrack through - // the whole thing looking for a match; since it will have examined everything, there's no benefit to examining it all - // again, and we can anchor to beginning. - // - If Singleline is not set, then '.' eats anything up until a '\n' and backtracks from there, so we can similarly avoid - // re-examining that content and anchor to the beginning of lines. - // We are currently very conservative here, only examining concat nodes. This could be loosened in the future, e.g. to - // explore captures (but think through any implications of there being a back ref to that capture), to explore loops and - // lazy loops a positive minimum (but the anchor shouldn't be part of the loop), to explore alternations and support adding - // an anchor if all of them begin with appropriate star loops (though this could also be accomplished by factoring out the - // loops to be before the alternation), etc. - { - RegexNode node = rootNode.Child(0); // skip implicit root capture node - while (true) - { - bool singleline = (node.Options & RegexOptions.Singleline) != 0; - switch (node.Type) - { - case Concatenate: - node = node.Child(0); - continue; - - case Setloop when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass: - case Setloopatomic when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass: - case Notoneloop when !singleline && node.N == int.MaxValue && node.Ch == '\n': - case Notoneloopatomic when !singleline && node.N == int.MaxValue && node.Ch == '\n': - RegexNode? parent = node.Next; - var anchor = new RegexNode(singleline ? Beginning : Bol, node.Options); - Debug.Assert(parent != null); - if (parent.Type == Concatenate) - { - Debug.Assert(parent.ChildCount() >= 2); - Debug.Assert(parent.Children is List); - anchor.Next = parent; - ((List)parent.Children).Insert(0, anchor); - } - else - { - Debug.Assert(parent.Type == Capture && parent.Next is null, "Only valid capture is the implicit root capture"); - var concat = new RegexNode(Concatenate, parent.Options); - concat.AddChild(anchor); - concat.AddChild(node); - parent.ReplaceChild(0, concat); - } - break; - } - - break; - } - } } // Optimization: Unnecessary root atomic. diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 70dfee3c47d9f3..95afc20c339653 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -127,6 +127,13 @@ public static IEnumerable Match_Basic_TestData() yield return new object[] { @"(?>\w+)(? Matches_TestData() } }; + yield return new object[] + { + ".*", "abc", RegexOptions.None, + new[] + { + new CaptureData("abc", 0, 3), + new CaptureData("", 3, 0) + } + }; + if (!PlatformDetection.IsNetFramework) { // .NET Framework missing fix in https://github.com/dotnet/runtime/pull/1075 From e24d454ba0ce4711f447880e81f58bb2e1ffe3a5 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Fri, 18 Sep 2020 08:22:03 -0400 Subject: [PATCH 2/2] Address PR feedback --- .../tests/Regex.Match.Tests.cs | 43 +++++++++++-------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 95afc20c339653..a7e73699f36e8e 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -860,7 +860,7 @@ public void Match_Advanced(string pattern, string input, RegexOptions options, i } // Note: this block will fail if any inputs attempt to look for anchors or lookbehinds at the initial position, - // as there is a difference between Match(input, beginning) and Match(input, beginning, input.Lenght - beginning) + // as there is a difference between Match(input, beginning) and Match(input, beginning, input.Length - beginning) // in that the former doesn't modify from 0 what the engine sees as the beginning of the input whereas the latter // is equivalent to taking a substring and then matching on that. However, as we currently don't have any such inputs, // it's currently a viable way to test the additional overload. Same goes for the similar case below with options. @@ -900,31 +900,36 @@ public void Match_Advanced(string pattern, string input, RegexOptions options, i } } - [Theory] - // Anchors - [InlineData(@"^.*", "abc", 0, true, true)] - [InlineData(@"^.*", "abc", 1, false, true)] - // Positive Lookbehinds - [InlineData(@"(?<=abc)def", "abcdef", 3, true, false)] - // Negative Lookbehinds - [InlineData(@"(? Match_StartatDiffersFromBeginning_MemberData() { - foreach (RegexOptions line in new[] { RegexOptions.None, RegexOptions.Singleline, RegexOptions.Multiline }) + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline, RegexOptions.Multiline }) { - foreach (RegexOptions options in new[] { line, line | RegexOptions.Compiled }) - { - var r = new Regex(pattern, options); + // Anchors + yield return new object[] { @"^.*", "abc", options, 0, true, true }; + yield return new object[] { @"^.*", "abc", options, 1, false, true }; - Assert.Equal(expectedSuccessStartAt, r.IsMatch(input, startat)); - Assert.Equal(expectedSuccessStartAt, r.Match(input, startat).Success); + // Positive Lookbehinds + yield return new object[] { @"(?<=abc)def", "abcdef", options, 3, true, false }; - Assert.Equal(expectedSuccessBeginning, r.Match(input.Substring(startat)).Success); - Assert.Equal(expectedSuccessBeginning, r.Match(input, startat, input.Length - startat).Success); - } + // Negative Lookbehinds + yield return new object[] { @"(?