diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
index 1291dff977e9fc..8b3611f5f7678a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
@@ -1,7 +1,9 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
+#if SYSTEM_TEXT_REGULAREXPRESSIONS
using System.Buffers;
+#endif
using System.Collections.Generic;
using System.Diagnostics;
@@ -10,28 +12,55 @@ namespace System.Text.RegularExpressions
/// Contains state and provides operations related to finding the next location a match could possibly begin.
internal sealed class RegexFindOptimizations
{
- /// True if the input should be processed right-to-left rather than left-to-right.
- private readonly bool _rightToLeft;
/// Lookup table used for optimizing ASCII when doing set queries.
private readonly uint[]?[]? _asciiLookups;
- public RegexFindOptimizations(RegexNode root, RegexOptions options)
+ /// <summary>
+ /// Builds the find optimizations for the pattern rooted at <paramref name="root"/>. When the optimizations
+ /// computed for the whole pattern aren't useful and the pattern is not RightToLeft, optimizations computed
+ /// from a leading positive lookahead (if one is found) are returned instead.
+ /// </summary>
+ /// <param name="root">The root of the pattern node tree.</param>
+ /// <param name="options">Options used when creating the regex.</param>
+ /// <returns>The optimizations to use when searching for the next possible match location.</returns>
+ public static RegexFindOptimizations Create(RegexNode root, RegexOptions options)
+ {
+     RegexFindOptimizations wholePattern = new(root, options, isLeadingPartial: false);
+
+     // The lookahead fallback is only considered for left-to-right patterns whose own optimizations aren't useful.
+     if ((options & RegexOptions.RightToLeft) != 0 || wholePattern.IsUseful)
+     {
+         return wholePattern;
+     }
+
+     if (RegexPrefixAnalyzer.FindLeadingPositiveLookahead(root) is not RegexNode lookahead)
+     {
+         return wholePattern;
+     }
+
+     // Analyze the lookahead's child as a leading, partial view of the pattern.
+     RegexFindOptimizations fromLookahead = new(lookahead.Child(0), options, isLeadingPartial: true);
+
+     // Carry over the relevant information from the whole-pattern analysis:
+     // - Lookaheads don't currently factor into the whole pattern's minimum length (they may overlap with other
+     //   portions of the pattern, which complicates the computation), so keep whichever minimum is larger.
+     // - Any maximum length comes only from the whole pattern; one shouldn't have been computed for the
+     //   lookahead because it's partial.
+     fromLookahead.MinRequiredLength = Math.Max(wholePattern.MinRequiredLength, fromLookahead.MinRequiredLength);
+     fromLookahead.MaxPossibleLength = wholePattern.MaxPossibleLength;
+
+     return fromLookahead;
+ }
+
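+ /// <summary>Creates optimization information for searching with the pattern represented by <paramref name="root"/>.</summary>
+ /// <param name="root">The root of the pattern node tree.</param>
+ /// <param name="options">Options used when creating the regex.</param>
+ /// <param name="isLeadingPartial">true if <paramref name="root"/> may not represent the whole pattern, only a leading node in it.</param>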
+ private RegexFindOptimizations(RegexNode root, RegexOptions options, bool isLeadingPartial)
{
- _rightToLeft = (options & RegexOptions.RightToLeft) != 0;
+ bool rightToLeft = (options & RegexOptions.RightToLeft) != 0;
+ Debug.Assert(!isLeadingPartial || !rightToLeft, "RightToLeft unexpected when isLeadingPartial");
MinRequiredLength = root.ComputeMinLength();
// Compute any anchor starting the expression. If there is one, we won't need to search for anything,
// as we can just match at that single location.
LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(root);
- if (_rightToLeft && LeadingAnchor == RegexNodeKind.Bol)
+ if (rightToLeft && LeadingAnchor == RegexNodeKind.Bol)
{
// Filter out Bol for RightToLeft, as we don't currently optimize for it.
LeadingAnchor = RegexNodeKind.Unknown;
}
if (LeadingAnchor is RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.EndZ or RegexNodeKind.End)
{
- FindMode = (LeadingAnchor, _rightToLeft) switch
+ FindMode = (LeadingAnchor, rightToLeft) switch
{
(RegexNodeKind.Beginning, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning,
(RegexNodeKind.Beginning, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning,
@@ -47,7 +76,8 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
// Compute any anchor trailing the expression. If there is one, and we can also compute a fixed length
// for the whole expression, we can use that to quickly jump to the right location in the input.
- if (!_rightToLeft) // haven't added FindNextStartingPositionMode trailing anchor support for RTL
+ if (!rightToLeft && // haven't added FindNextStartingPositionMode trailing anchor support for RTL
+ !isLeadingPartial) // trailing anchors in a partial root aren't relevant
{
TrailingAnchor = RegexPrefixAnalyzer.FindTrailingAnchor(root);
if (TrailingAnchor is RegexNodeKind.End or RegexNodeKind.EndZ &&
@@ -70,7 +100,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
if (prefix.Length > 1)
{
LeadingPrefix = prefix;
- FindMode = _rightToLeft ?
+ FindMode = rightToLeft ?
FindNextStartingPositionMode.LeadingString_RightToLeft :
FindNextStartingPositionMode.LeadingString_LeftToRight;
return;
@@ -89,7 +119,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
// more expensive; someone who wants to pay to do more work can specify Compiled. So for the interpreter
// we focus only on creating a set for the first character. Same for right-to-left, which is used very
// rarely and thus we don't need to invest in special-casing it.
- if (_rightToLeft)
+ if (rightToLeft)
{
// Determine a set for anything that can possibly start the expression.
if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass)
@@ -253,21 +283,21 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch;
/// Gets the leading anchor (e.g. RegexNodeKind.Bol) if one exists and was computed.
- public RegexNodeKind LeadingAnchor { get; }
+ public RegexNodeKind LeadingAnchor { get; private set; }
/// Gets the trailing anchor (e.g. RegexNodeKind.Bol) if one exists and was computed.
public RegexNodeKind TrailingAnchor { get; }
/// Gets the minimum required length an input need be to match the pattern.
/// 0 is a valid minimum length. This value may also be the max (and hence fixed) length of the expression.
- public int MinRequiredLength { get; }
+ public int MinRequiredLength { get; private set; }
/// The maximum possible length an input could be to match the pattern.
///
/// This is currently only set when <see cref="TrailingAnchor"/> is found to be an end anchor.
/// That can be expanded in the future as needed.
///
- public int? MaxPossibleLength { get; }
+ public int? MaxPossibleLength { get; private set; }
/// Gets the leading prefix. May be an empty string.
public string LeadingPrefix { get; } = string.Empty;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index 8326cf4ef3eaca..706171f0ac599c 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -940,7 +940,7 @@ private RegexNode ReduceAlternation()
node = ExtractCommonPrefixText(node);
if (node.Kind == RegexNodeKind.Alternate)
{
- node = ExtractCommonPrefixOneNotoneSet(node);
+ node = ExtractCommonPrefixNode(node);
if (node.Kind == RegexNodeKind.Alternate)
{
node = RemoveRedundantEmptiesAndNothings(node);
@@ -1072,7 +1072,7 @@ void ReduceSingleLetterAndNestedAlternations()
// This function optimizes out prefix nodes from alternation branches that are
// the same across multiple contiguous branches.
// e.g. \w12|\d34|\d56|\w78|\w90 => \w12|\d(?:34|56)|\w(?:78|90)
- static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
+ static RegexNode ExtractCommonPrefixNode(RegexNode alternation)
{
Debug.Assert(alternation.Kind == RegexNodeKind.Alternate);
Debug.Assert(alternation.Children is List { Count: >= 2 });
@@ -1097,7 +1097,7 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
{
Debug.Assert(children[startingIndex].Children is List { Count: >= 2 });
- // Only handle the case where each branch begins with the same One, Notone, or Set (individual or loop).
+ // Only handle the case where each branch begins with the same One, Notone, Set (individual or loop), or Anchor.
// Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing
// it for non-atomic variable length loops could change behavior as each branch could otherwise have a
// different number of characters consumed by the loop based on what's after it.
@@ -1107,6 +1107,10 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
case RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set:
case RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloopatomic:
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when required.M == required.N:
+ case RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.Bol
+ or RegexNodeKind.End or RegexNodeKind.EndZ or RegexNodeKind.Eol
+ or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary
+ or RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary:
break;
default:
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
index 556a187203f035..6be7fdda3fc948 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@@ -1228,6 +1228,80 @@ public static (RegexNode LoopNode, (char Char, string? String, StringComparison
return null;
}
+ /// <summary>Finds a positive lookahead node that can be considered to start the pattern.</summary>
+ /// <param name="node">The node at which to begin the search.</param>
+ /// <returns>The leading positive lookahead node, or null if none was found.</returns>
+ public static RegexNode? FindLeadingPositiveLookahead(RegexNode node)
+ {
+     RegexNode? found = null;
+     Search(node, ref found);
+     return found;
+
+     // Examines the leading edge of `current`, storing any positive lookahead it encounters into `found`.
+     // The return value tells an enclosing concatenation whether it may go on to examine its next child.
+     static bool Search(RegexNode current, ref RegexNode? found)
+     {
+         if (!StackHelper.TryEnsureSufficientExecutionStack())
+         {
+             return false;
+         }
+
+         // Atomic and capture groups add no semantics for this purpose, so unwrap down to the node they contain.
+         // Anything marked RightToLeft (at any level of the unwrapping) ends the search.
+         while (true)
+         {
+             if ((current.Options & RegexOptions.RightToLeft) != 0)
+             {
+                 return false;
+             }
+
+             if (current.Kind is not (RegexNodeKind.Atomic or RegexNodeKind.Capture))
+             {
+                 break;
+             }
+
+             current = current.Child(0);
+         }
+
+         switch (current.Kind)
+         {
+             case RegexNodeKind.PositiveLookaround:
+                 // Found it; there's nothing further to examine.
+                 found = current;
+                 return false;
+
+             case RegexNodeKind.Bol or RegexNodeKind.Eol
+                 or RegexNodeKind.Beginning or RegexNodeKind.Start
+                 or RegexNodeKind.EndZ or RegexNodeKind.End
+                 or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary
+                 or RegexNodeKind.NegativeLookaround
+                 or RegexNodeKind.Empty:
+                 // Zero-width nodes are skipped; whatever follows them may still be examined.
+                 return true;
+
+             case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when current.M >= 1:
+                 // The loop body must execute at least once, so it's examined, but the number of iterations
+                 // isn't known, so nothing after the loop can be.
+                 Search(current.Child(0), ref found);
+                 return false;
+
+             case RegexNodeKind.Concatenate:
+                 // Examine the children in order until one of them says the search can't proceed past it.
+                 for (int i = 0, count = current.ChildCount(); i < count; i++)
+                 {
+                     if (!Search(current.Child(i), ref found))
+                     {
+                         return false;
+                     }
+                 }
+
+                 return true;
+
+             default:
+                 return false;
+         }
+     }
+ }
+
/// Computes the leading anchor of a node.
public static RegexNodeKind FindLeadingAnchor(RegexNode node) =>
FindLeadingOrTrailingAnchor(node, leading: true);
@@ -1263,7 +1337,14 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le
case RegexNodeKind.Atomic:
case RegexNodeKind.Capture:
- // For groups, continue exploring the sole child.
+ case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when leading && node.M >= 1:
+ case RegexNodeKind.PositiveLookaround when leading && (node.Options & RegexOptions.RightToLeft) == 0:
+ // For atomic and capture groups, for the purposes of finding anchors they add no semantics around the child.
+ // Loops are like atomic and captures for the purposes of finding leading anchors, as long as the loop has
+ // at least one guaranteed iteration (if its min is 0, any anchors inside might not apply).
+ // Positive lookaheads are also relevant as long as we're looking for leading anchors, as an anchor
+ // at the beginning of a starting positive lookahead has the same semantics as the same anchor at the
+ // beginning of the pattern.
node = node.Child(0);
continue;
@@ -1274,15 +1355,35 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le
{
int childCount = node.ChildCount();
RegexNode? child = null;
+ RegexNodeKind bestAnchorFound = RegexNodeKind.Unknown;
if (leading)
{
for (int i = 0; i < childCount; i++)
{
- if (node.Child(i).Kind is not (RegexNodeKind.Empty or RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround))
+ RegexNode tmpChild = node.Child(i);
+ switch (tmpChild.Kind)
{
- child = node.Child(i);
- break;
+ case RegexNodeKind.Empty or RegexNodeKind.NegativeLookaround:
+ case RegexNodeKind.PositiveLookaround when ((node.Options | tmpChild.Options) & RegexOptions.RightToLeft) != 0:
+ // Skip over zero-width assertions.
+ continue;
+
+ case RegexNodeKind.PositiveLookaround:
+ // Except for positive lookaheads at the beginning of the pattern, as any anchor it has at
+ // its beginning can also be used.
+ bestAnchorFound = ChooseBetterAnchor(bestAnchorFound, FindLeadingOrTrailingAnchor(tmpChild, leading));
+ if (IsBestAnchor(bestAnchorFound))
+ {
+ return bestAnchorFound;
+ }
+
+ // Now that we know what anchor might be in the lookahead, skip the zero-width assertion
+ // and continue examining subsequent nodes of the concatenation.
+ continue;
}
+
+ child = tmpChild;
+ break;
}
}
else
@@ -1297,13 +1398,57 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le
}
}
- if (child is not null)
+ if (bestAnchorFound is not RegexNodeKind.Unknown)
+ {
+ // We found a leading anchor in a positive lookahead. If we don't have a child node, then
+ // just return the anchor we got. If we do have a child node, recur into it to find any anchor
+ // it has, and then choose the best between that and the lookahead.
+ Debug.Assert(leading);
+ return child is not null ?
+ ChooseBetterAnchor(bestAnchorFound, FindLeadingAnchor(child)) :
+ bestAnchorFound;
+ }
+ else if (child is not null)
{
+ // Loop around to process the child node.
node = child;
continue;
}
goto default;
+
+ // Picks whichever of two anchors is preferable to search for, i.e. the one yielding the fastest search.
+ // For example, Beginning leaves only a single place to look, Bol is worse as it could be at the start of
+ // any line, and Boundary is worse still as it could be at the start or end of any word.
+ // Unknown always loses to the other anchor; when both rank equally, anchor1 is kept.
+ static RegexNodeKind ChooseBetterAnchor(RegexNodeKind anchor1, RegexNodeKind anchor2)
+ {
+     if (anchor1 == RegexNodeKind.Unknown)
+     {
+         return anchor2;
+     }
+
+     if (anchor2 == RegexNodeKind.Unknown)
+     {
+         return anchor1;
+     }
+
+     return RankAnchorQuality(anchor2) > RankAnchorQuality(anchor1) ? anchor2 : anchor1;
+
+     // Higher ranks are better; anything not listed ranks lowest.
+     static int RankAnchorQuality(RegexNodeKind node) =>
+         node switch
+         {
+             RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.End or RegexNodeKind.EndZ => 3,
+             RegexNodeKind.Bol or RegexNodeKind.Eol => 2,
+             RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary => 1,
+             _ => 0,
+         };
+ }
+
+ static bool IsBestAnchor(RegexNodeKind anchor) =>
+     // Keep in sync with ChooseBetterAnchor: these are the anchors it ranks highest.
+     anchor switch
+     {
+         RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.End or RegexNodeKind.EndZ => true,
+         _ => false,
+     };
}
case RegexNodeKind.Alternate:
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs
index 4ee944e47f91a5..8a827001cc2f98 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs
@@ -77,7 +77,7 @@ internal RegexTree(RegexNode root, int captureCount, string[]? captureNames, Has
CaptureNameToNumberMapping = captureNameToNumberMapping;
CaptureNames = captureNames;
Options = options;
- FindOptimizations = new RegexFindOptimizations(root, options);
+ FindOptimizations = RegexFindOptimizations.Create(root, options);
}
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs
index 4c0b02b266142e..c1030f86b11f1a 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs
@@ -30,6 +30,23 @@ public class RegexFindOptimizationsTests
[InlineData(@"\zhello$", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End)]
[InlineData(@"\zhi|\zhello", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End)]
+ [InlineData(@"(?=^abc)", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)]
+ [InlineData(@"(?=^)abc", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)]
+ [InlineData(@"(?=.*$)abc", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight)]
+ [InlineData(@"(?=^)abc", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft)]
+ [InlineData(@"abc(?=^)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft)]
+ [InlineData(@"(?[a-c]|def|[gh])")]
[InlineData("this|that|there|then|those", "th(?>is|at|ere|en|ose)")]
+ [InlineData("^this|^that|^there|^then|^those", "^th(?>is|at|ere|en|ose)")]
+ // Verbatim strings so that \b reaches the regex parser as a word-boundary anchor rather than as the C# backspace escape (U+0008).
+ [InlineData(@"\bthis|\bthat|\bthere|\bthen|\bthose", @"\bth(?>is|at|ere|en|ose)")]
[InlineData("it's (?>this|that|there|then|those)", "it's (?>th(?>is|at|e(?>re|n)|ose))")]
[InlineData("it's (?>this|that|there|then|those)!", "it's (?>th(?>is|at|e(?>re|n)|ose))!")]
[InlineData("abcd|abce", "abc[de]")]
[InlineData("abcd|abef", "ab(?>cd|ef)")]
[InlineData("abcd|aefg", "a(?>bcd|efg)")]
[InlineData("abcd|abc|ab|a", "a(?>bcd|bc|b|)")]
+ [InlineData("^abcd|^abce", "^(?:abc[de])")]
// [InlineData("abcde|abcdef", "abcde(?>|f)")] // TODO https://github.com/dotnet/runtime/issues/66031: Need to reorganize optimizations to avoid an extra Empty being left at the end of the tree
[InlineData("abcdef|abcde", "abcde(?>f|)")]
[InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")]
@@ -495,6 +498,8 @@ public void PatternsReduceIdentically(string actual, string expected)
[InlineData("a*(?(xyz)acd|efg)", "(?>a*)(?(xyz)acd|efg)")]
[InlineData("a*(?(xyz)bcd|afg)", "(?>a*)(?(xyz)bcd|afg)")]
[InlineData("a*(?(xyz)bcd)", "(?>a*)(?(xyz)bcd)")]
+ // Different prefixes on alternation branches
+ [InlineData("^abcd|$abce", "^abcd|^abce")]
public void PatternsReduceDifferently(string actual, string expected)
{
// NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project.