diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index 1291dff977e9fc..8b3611f5f7678a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -1,7 +1,9 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +#if SYSTEM_TEXT_REGULAREXPRESSIONS using System.Buffers; +#endif using System.Collections.Generic; using System.Diagnostics; @@ -10,28 +12,55 @@ namespace System.Text.RegularExpressions /// Contains state and provides operations related to finding the next location a match could possibly begin. internal sealed class RegexFindOptimizations { - /// True if the input should be processed right-to-left rather than left-to-right. - private readonly bool _rightToLeft; /// Lookup table used for optimizing ASCII when doing set queries. private readonly uint[]?[]? _asciiLookups; - public RegexFindOptimizations(RegexNode root, RegexOptions options) + public static RegexFindOptimizations Create(RegexNode root, RegexOptions options) + { + RegexFindOptimizations opts = new(root, options, isLeadingPartial: false); + + if ((options & RegexOptions.RightToLeft) == 0 && + !opts.IsUseful && + RegexPrefixAnalyzer.FindLeadingPositiveLookahead(root) is RegexNode positiveLookahead) + { + RegexFindOptimizations positiveLookaheadOpts = new(positiveLookahead.Child(0), options, isLeadingPartial: true); + + // Fixups to incorporate relevant information from the original optimizations. + // - If the original has a larger minimum length than the lookahead, use it. 
Lookaheads don't currently factor into + // the computation of the minimum as it complicates the logic due to them possibly overlapping with other portions. + // - Use whatever max came from the original, if any. We shouldn't have computed a max for the lookahead because + // it's partial. + positiveLookaheadOpts.MinRequiredLength = Math.Max(opts.MinRequiredLength, positiveLookaheadOpts.MinRequiredLength); + positiveLookaheadOpts.MaxPossibleLength = opts.MaxPossibleLength; + + opts = positiveLookaheadOpts; + } + + return opts; + } + + /// <summary>Creates optimization information for searching with the pattern represented by <paramref name="root"/>.</summary> + /// <param name="root">The root of the pattern node tree.</param> + /// <param name="options">Options used when creating the regex.</param> + /// <param name="isLeadingPartial">true if <paramref name="root"/> may not represent the whole pattern, only a leading node in it.</param> + private RegexFindOptimizations(RegexNode root, RegexOptions options, bool isLeadingPartial) { - _rightToLeft = (options & RegexOptions.RightToLeft) != 0; + bool rightToLeft = (options & RegexOptions.RightToLeft) != 0; + Debug.Assert(!isLeadingPartial || !rightToLeft, "RightToLeft unexpected when isLeadingPartial"); MinRequiredLength = root.ComputeMinLength(); // Compute any anchor starting the expression. If there is one, we won't need to search for anything, // as we can just match at that single location. LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(root); - if (_rightToLeft && LeadingAnchor == RegexNodeKind.Bol) + if (rightToLeft && LeadingAnchor == RegexNodeKind.Bol) { // Filter out Bol for RightToLeft, as we don't currently optimize for it. 
LeadingAnchor = RegexNodeKind.Unknown; } if (LeadingAnchor is RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.EndZ or RegexNodeKind.End) { - FindMode = (LeadingAnchor, _rightToLeft) switch + FindMode = (LeadingAnchor, rightToLeft) switch { (RegexNodeKind.Beginning, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning, (RegexNodeKind.Beginning, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning, @@ -47,7 +76,8 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) // Compute any anchor trailing the expression. If there is one, and we can also compute a fixed length // for the whole expression, we can use that to quickly jump to the right location in the input. - if (!_rightToLeft) // haven't added FindNextStartingPositionMode trailing anchor support for RTL + if (!rightToLeft && // haven't added FindNextStartingPositionMode trailing anchor support for RTL + !isLeadingPartial) // trailing anchors in a partial root aren't relevant { TrailingAnchor = RegexPrefixAnalyzer.FindTrailingAnchor(root); if (TrailingAnchor is RegexNodeKind.End or RegexNodeKind.EndZ && @@ -70,7 +100,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) if (prefix.Length > 1) { LeadingPrefix = prefix; - FindMode = _rightToLeft ? + FindMode = rightToLeft ? FindNextStartingPositionMode.LeadingString_RightToLeft : FindNextStartingPositionMode.LeadingString_LeftToRight; return; @@ -89,7 +119,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) // more expensive; someone who wants to pay to do more work can specify Compiled. So for the interpreter // we focus only on creating a set for the first character. Same for right-to-left, which is used very // rarely and thus we don't need to invest in special-casing it. - if (_rightToLeft) + if (rightToLeft) { // Determine a set for anything that can possibly start the expression. 
if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass) @@ -253,21 +283,21 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch; /// Gets the leading anchor (e.g. RegexNodeKind.Bol) if one exists and was computed. - public RegexNodeKind LeadingAnchor { get; } + public RegexNodeKind LeadingAnchor { get; private set; } /// Gets the trailing anchor (e.g. RegexNodeKind.Bol) if one exists and was computed. public RegexNodeKind TrailingAnchor { get; } /// Gets the minimum required length an input need be to match the pattern. /// 0 is a valid minimum length. This value may also be the max (and hence fixed) length of the expression. - public int MinRequiredLength { get; } + public int MinRequiredLength { get; private set; } /// The maximum possible length an input could be to match the pattern. /// /// This is currently only set when is found to be an end anchor. /// That can be expanded in the future as needed. /// - public int? MaxPossibleLength { get; } + public int? MaxPossibleLength { get; private set; } /// Gets the leading prefix. May be an empty string. 
public string LeadingPrefix { get; } = string.Empty; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 8326cf4ef3eaca..706171f0ac599c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -940,7 +940,7 @@ private RegexNode ReduceAlternation() node = ExtractCommonPrefixText(node); if (node.Kind == RegexNodeKind.Alternate) { - node = ExtractCommonPrefixOneNotoneSet(node); + node = ExtractCommonPrefixNode(node); if (node.Kind == RegexNodeKind.Alternate) { node = RemoveRedundantEmptiesAndNothings(node); @@ -1072,7 +1072,7 @@ void ReduceSingleLetterAndNestedAlternations() // This function optimizes out prefix nodes from alternation branches that are // the same across multiple contiguous branches. // e.g. \w12|\d34|\d56|\w78|\w90 => \w12|\d(?:34|56)|\w(?:78|90) - static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation) + static RegexNode ExtractCommonPrefixNode(RegexNode alternation) { Debug.Assert(alternation.Kind == RegexNodeKind.Alternate); Debug.Assert(alternation.Children is List { Count: >= 2 }); @@ -1097,7 +1097,7 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation) { Debug.Assert(children[startingIndex].Children is List { Count: >= 2 }); - // Only handle the case where each branch begins with the same One, Notone, or Set (individual or loop). + // Only handle the case where each branch begins with the same One, Notone, Set (individual or loop), or Anchor. 
// Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing // it for non-atomic variable length loops could change behavior as each branch could otherwise have a // different number of characters consumed by the loop based on what's after it. @@ -1107,6 +1107,10 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation) case RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set: case RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloopatomic: case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when required.M == required.N: + case RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.Bol + or RegexNodeKind.End or RegexNodeKind.EndZ or RegexNodeKind.Eol + or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary + or RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary: break; default: diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 556a187203f035..6be7fdda3fc948 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -1228,6 +1228,80 @@ public static (RegexNode LoopNode, (char Char, string? String, StringComparison return null; } + /// Finds a positive lookahead node that can be considered to start the pattern. + public static RegexNode? FindLeadingPositiveLookahead(RegexNode node) + { + RegexNode? positiveLookahead = null; + FindLeadingPositiveLookahead(node, ref positiveLookahead); + return positiveLookahead; + + // Returns whether to keep examining subsequent nodes in a concatenation. 
+ static bool FindLeadingPositiveLookahead(RegexNode node, ref RegexNode? positiveLookahead) + { + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + return false; + } + + while (true) + { + if ((node.Options & RegexOptions.RightToLeft) != 0) + { + return false; + } + + switch (node.Kind) + { + case RegexNodeKind.PositiveLookaround: + // Got one. + positiveLookahead = node; + return false; + + case RegexNodeKind.Bol: + case RegexNodeKind.Eol: + case RegexNodeKind.Beginning: + case RegexNodeKind.Start: + case RegexNodeKind.EndZ: + case RegexNodeKind.End: + case RegexNodeKind.Boundary: + case RegexNodeKind.ECMABoundary: + case RegexNodeKind.NegativeLookaround: + case RegexNodeKind.Empty: + // Skip past zero-width nodes. + return true; + + case RegexNodeKind.Atomic: + case RegexNodeKind.Capture: + // Process the child instead of the group, which adds no semantics for this purpose. + node = node.Child(0); + continue; + + case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.M >= 1: + // Process the child and then stop. We don't know how many times the + // loop will actually repeat, only that it must execute at least once. + FindLeadingPositiveLookahead(node.Child(0), ref positiveLookahead); + return false; + + case RegexNodeKind.Concatenate: + // Check each child, stopping the search if processing it says we can't process further. + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (!FindLeadingPositiveLookahead(node.Child(i), ref positiveLookahead)) + { + return false; + } + } + + return true; + + default: + return false; + } + } + } + } + /// Computes the leading anchor of a node. public static RegexNodeKind FindLeadingAnchor(RegexNode node) => FindLeadingOrTrailingAnchor(node, leading: true); @@ -1263,7 +1337,14 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le case RegexNodeKind.Atomic: case RegexNodeKind.Capture: - // For groups, continue exploring the sole child. 
+ case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when leading && node.M >= 1: + case RegexNodeKind.PositiveLookaround when leading && (node.Options & RegexOptions.RightToLeft) == 0: + // For atomic and capture groups, for the purposes of finding anchors they add no semantics around the child. + // Loops are like atomic and captures for the purposes of finding leading anchors, as long as the loop has + // at least one guaranteed iteration (if its min is 0, any anchors inside might not apply). + // Positive lookaheads are also relevant as long as we're looking for leading anchors, as an anchor + // at the beginning of a starting positive lookahead has the same semantics as the same anchor at the + // beginning of the pattern. node = node.Child(0); continue; @@ -1274,15 +1355,35 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le { int childCount = node.ChildCount(); RegexNode? child = null; + RegexNodeKind bestAnchorFound = RegexNodeKind.Unknown; if (leading) { for (int i = 0; i < childCount; i++) { - if (node.Child(i).Kind is not (RegexNodeKind.Empty or RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround)) + RegexNode tmpChild = node.Child(i); + switch (tmpChild.Kind) { - child = node.Child(i); - break; + case RegexNodeKind.Empty or RegexNodeKind.NegativeLookaround: + case RegexNodeKind.PositiveLookaround when ((node.Options | tmpChild.Options) & RegexOptions.RightToLeft) != 0: + // Skip over zero-width assertions. + continue; + + case RegexNodeKind.PositiveLookaround: + // Except for positive lookaheads at the beginning of the pattern, as any anchor it has at + // its beginning can also be used. 
+ bestAnchorFound = ChooseBetterAnchor(bestAnchorFound, FindLeadingOrTrailingAnchor(tmpChild, leading)); + if (IsBestAnchor(bestAnchorFound)) + { + return bestAnchorFound; + } + + // Now that we know what anchor might be in the lookahead, skip the zero-width assertion + // and continue examining subsequent nodes of the concatenation. + continue; } + + child = tmpChild; + break; } } else @@ -1297,13 +1398,57 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le } } - if (child is not null) + if (bestAnchorFound is not RegexNodeKind.Unknown) + { + // We found a leading anchor in a positive lookahead. If we don't have a child node, then + // just return the anchor we got. If we do have a child node, recur into it to find any anchor + // it has, and then choose the best between that and the lookahead. + Debug.Assert(leading); + return child is not null ? + ChooseBetterAnchor(bestAnchorFound, FindLeadingAnchor(child)) : + bestAnchorFound; + } + else if (child is not null) { + // Loop around to process the child node. node = child; continue; } goto default; + + // Decide which of two anchors we'd rather search for, based on which yields the fastest + // search, e.g. Beginning means we only have one place to search, whereas worse Bol could be at the + // start of any line, and even worse Boundary could be at the start or end of any word. + static RegexNodeKind ChooseBetterAnchor(RegexNodeKind anchor1, RegexNodeKind anchor2) + { + return + anchor1 == RegexNodeKind.Unknown ? anchor2 : + anchor2 == RegexNodeKind.Unknown ? anchor1 : + RankAnchorQuality(anchor1) >= RankAnchorQuality(anchor2) ? 
anchor1 : + anchor2; + + static int RankAnchorQuality(RegexNodeKind node) => + node switch + { + RegexNodeKind.Beginning => 3, + RegexNodeKind.Start => 3, + RegexNodeKind.End => 3, + RegexNodeKind.EndZ => 3, + + RegexNodeKind.Bol => 2, + RegexNodeKind.Eol => 2, + + RegexNodeKind.Boundary => 1, + RegexNodeKind.ECMABoundary => 1, + + _ => 0 + }; + } + + static bool IsBestAnchor(RegexNodeKind anchor) => + // Keep in sync with ChooseBetterAnchor + anchor is RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.End or RegexNodeKind.EndZ; } case RegexNodeKind.Alternate: diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs index 4ee944e47f91a5..8a827001cc2f98 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs @@ -77,7 +77,7 @@ internal RegexTree(RegexNode root, int captureCount, string[]? 
captureNames, Has CaptureNameToNumberMapping = captureNameToNumberMapping; CaptureNames = captureNames; Options = options; - FindOptimizations = new RegexFindOptimizations(root, options); + FindOptimizations = RegexFindOptimizations.Create(root, options); } } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs index 4c0b02b266142e..c1030f86b11f1a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs @@ -30,6 +30,23 @@ public class RegexFindOptimizationsTests [InlineData(@"\zhello$", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End)] [InlineData(@"\zhi|\zhello", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End)] + [InlineData(@"(?=^abc)", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)] + [InlineData(@"(?=^)abc", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)] + [InlineData(@"(?=.*$)abc", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight)] + [InlineData(@"(?=^)abc", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft)] + [InlineData(@"abc(?=^)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft)] + [InlineData(@"(?[a-c]|def|[gh])")] [InlineData("this|that|there|then|those", "th(?>is|at|ere|en|ose)")] + [InlineData("^this|^that|^there|^then|^those", "^th(?>is|at|ere|en|ose)")] + [InlineData(@"\bthis|\bthat|\bthere|\bthen|\bthose", @"\bth(?>is|at|ere|en|ose)")] /* verbatim strings: in a regular literal "\b" is a backspace (U+0008), not the regex word-boundary anchor this case is meant to cover */ [InlineData("it's (?>this|that|there|then|those)", "it's (?>th(?>is|at|e(?>re|n)|ose))")] [InlineData("it's (?>this|that|there|then|those)!", "it's (?>th(?>is|at|e(?>re|n)|ose))!")] [InlineData("abcd|abce", "abc[de]")] 
[InlineData("abcd|abef", "ab(?>cd|ef)")] [InlineData("abcd|aefg", "a(?>bcd|efg)")] [InlineData("abcd|abc|ab|a", "a(?>bcd|bc|b|)")] + [InlineData("^abcd|^abce", "^(?:abc[de])")] // [InlineData("abcde|abcdef", "abcde(?>|f)")] // TODO https://github.com/dotnet/runtime/issues/66031: Need to reorganize optimizations to avoid an extra Empty being left at the end of the tree [InlineData("abcdef|abcde", "abcde(?>f|)")] [InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")] @@ -495,6 +498,8 @@ public void PatternsReduceIdentically(string actual, string expected) [InlineData("a*(?(xyz)acd|efg)", "(?>a*)(?(xyz)acd|efg)")] [InlineData("a*(?(xyz)bcd|afg)", "(?>a*)(?(xyz)bcd|afg)")] [InlineData("a*(?(xyz)bcd)", "(?>a*)(?(xyz)bcd)")] + // Different prefixes on alternation branches + [InlineData("^abcd|$abce", "^abcd|^abce")] public void PatternsReduceDifferently(string actual, string expected) { // NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project.