Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2088,12 +2088,18 @@ static void ProcessNode(RegexNode node, RegexNode subsequent)
loopChild = loopChild.Child(loopChild.ChildCount() - 1);
}

// MakeLoopAtomic wraps the loop in an Atomic group, which discards all backtracking
// state from within the body. CanBeMadeAtomic only proves that giving back iterations
// is unnecessary, but the Atomic wrapper also prevents within-body backtracking from
// being triggered by subsequent failures. That's only safe when the body has no
// backtracking of its own and the last descendant is a type that won't be adversely
// affected by seeing an Atomic ancestor.
if (loopChild.Kind is
RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or
RegexNodeKind.Multi or
RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set)
RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or
RegexNodeKind.Multi or
RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set &&
!MayContainBacktracking(node.Child(0)))
{
// For types on the allow list, we can make the loop itself atomic.
node.MakeLoopAtomic();
}
else if (node.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop)
Expand Down Expand Up @@ -2586,6 +2592,52 @@ bool MayOverlapStartingOrEndingSet(string set) =>
}
}

/// <summary>Gets whether this node is itself a backtracking construct.</summary>
/// <remarks>
/// This checks the node in isolation (not its children). A node is a backtracking construct
/// if it's a variable-width loop or an alternation.
/// </remarks>
public bool IsBacktrackingConstruct => Kind switch
{
RegexNodeKind.Alternate => true,
RegexNodeKind.Loop or RegexNodeKind.Lazyloop when M != N => true,
RegexNodeKind.Oneloop or RegexNodeKind.Onelazy or
RegexNodeKind.Notoneloop or RegexNodeKind.Notonelazy or
RegexNodeKind.Setloop or RegexNodeKind.Setlazy when M != N => true,
_ => false,
};

/// <summary>
/// Checks whether a node tree may contain backtracking constructs (variable-width loops or alternations).
/// </summary>
private static bool MayContainBacktracking(RegexNode node)
{
// If we can't recur, just assume the worst and say that it may contain backtracking constructs.
if (!StackHelper.TryEnsureSufficientExecutionStack())
{
return true;
}

// If this node is a backtracking construct, then obviously it may contain backtracking constructs.
if (node.IsBacktrackingConstruct)
{
return true;
}

// Otherwise, we need to check the children to see if any of them may contain backtracking constructs.
int childCount = node.ChildCount();
for (int i = 0; i < childCount; i++)
{
if (MayContainBacktracking(node.Child(i)))
{
return true;
}
}

// No backtracking is possible.
return false;
}

/// <summary>Gets whether this node is known to be immediately preceded by a word character.</summary>
public bool IsKnownPrecededByWordChar() => IsKnownPrecededOrSucceededByWordChar(false);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,9 @@ static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByA
// Certain kinds of nodes incur backtracking logic themselves: add them to the backtracking collection.
// We may later find that a node contains another that has backtracking; we'll add nodes based on that
// after examining the children.
switch (node.Kind)
if (node.IsBacktrackingConstruct)
{
case RegexNodeKind.Alternate:
case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.M != node.N:
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when node.M != node.N:
(results._mayBacktrack ??= new HashSet<RegexNode>()).Add(node);
break;
(results._mayBacktrack ??= new HashSet<RegexNode>()).Add(node);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,42 @@ public static IEnumerable<object[]> Match_MemberData()
yield return (@"(abcd*?)+e", "abcde", RegexOptions.None, 0, 5, true, "abcde");
yield return (@"(abcd*)+?e", "abcde", RegexOptions.None, 0, 5, true, "abcde");
yield return (@"(abcd*?)+?e", "abcde", RegexOptions.None, 0, 5, true, "abcde");

// Lazy loop inside loop body where the Atomic wrapper must not suppress within-body backtracking.
// Each variation tests a different kind of backtracking construct inside the loop body.
// Notonelazy (.*?): lazy . expands past first ) to find the correct closing delimiter.
yield return (@"\w+(\(.*?\))?,", @"Foo(one()?,two,three),", RegexOptions.None, 0, 22, true, @"Foo(one()?,two,three),");
yield return (@"\w+(<.*?>)?!", "foo<a>b>!", RegexOptions.None, 0, 9, true, "foo<a>b>!");
yield return (@"(\(.*?\))+;", "(a)b);", RegexOptions.None, 0, 6, true, "(a)b);");
yield return (@"(a.*?b)+c", "axbbc", RegexOptions.None, 0, 5, true, "axbbc");
// Notoneloop (.+): greedy . overshoots past the right ), must back up to find an earlier one.
yield return (@"\w+(\(.+\))?,", "Foo(a),(b)x,", RegexOptions.None, 0, 12, true, "Foo(a),");
// Setlazy ([^,]+?): lazy char class expands past first ) to find the right one.
yield return (@"\w+(\([^,]+?\))?,", "Foo(one()three),", RegexOptions.None, 0, 16, true, "Foo(one()three),");
// Setloop ([^;]+): greedy char class overshoots past the right ), must back up to earlier one.
yield return (@"\w+(\([^;]+\))?,", "Foo(a),b)x,", RegexOptions.None, 0, 11, true, "Foo(a),");
// Onelazy ()+?): lazy single-char loop must expand to consume more )'s for , to follow.
yield return (@"\w+(\(\)+?\))?,", "Foo())),", RegexOptions.None, 0, 8, true, "Foo())),");
// Oneloop ()+): greedy single-char loop of the delimiter char itself.
yield return (@"\w+(\(\)+\))?,", "Foo())),", RegexOptions.None, 0, 8, true, "Foo())),");
// Alternate: short branch matches + \) succeeds, but , fails. Longer branch includes \) in it.
yield return (@"\w+(\((?:a|a\)b)\))?,", "Foo(a)b),", RegexOptions.None, 0, 9, true, "Foo(a)b),");
// Loop (general greedy multi-char body): 2 iters of \), overshoots, backing up to 1 iter works.
yield return (@"\w+(\((?:\),){1,3}\))?,", "Foo(),),)),", RegexOptions.None, 0, 11, true, "Foo(),),");
// Lazyloop (general lazy multi-char body): 1 iter of \)a, outer \) OK but , fails. Expand to 2.
yield return (@"\w+(\((?:\)a){1,3}?\))?,", "Foo()a)a),", RegexOptions.None, 0, 10, true, "Foo()a)a),");
// Same patterns with lazy outer loops (??, +?, *?) to ensure lazy expansion is not broken.
yield return (@"\w+(\(.*?\))??,", @"Foo(one()?,two,three),", RegexOptions.None, 0, 22, true, @"Foo(one()?,two,three),");
yield return (@"(\(.*?\))+?;", "(a)b);", RegexOptions.None, 0, 6, true, "(a)b);");
yield return (@"(\(.*?\))*;", "(a)b);", RegexOptions.None, 0, 6, true, "(a)b);");
yield return (@"(\(.*?\))*?;", "(a)b);", RegexOptions.None, 0, 6, true, "(a)b);");
// Non-backtracking body: optimization correctly applies. These verify bodies without backtracking
// inside the outer loop are still correctly matched after the outer loop is made atomic.
yield return (@"\w+(\(abc\))?,", "Foo(abc),", RegexOptions.None, 0, 9, true, "Foo(abc),");
yield return (@"\w+(\(abc\))??,", "Foo(abc),", RegexOptions.None, 0, 9, true, "Foo(abc),");
yield return (@"\w+(\(\w{3}\))?,", "Foo(abc),", RegexOptions.None, 0, 9, true, "Foo(abc),");
yield return (@"\w+(\(\w+!\))?,", "Foo(abc!),", RegexOptions.None, 0, 10, true, "Foo(abc!),");

yield return (@"(?:m(?:((e)?)??)|a)\b", "you m you", RegexOptions.None, 0, 9, true, "m");
yield return (@"(?:m(?:((e)?)??)|a)\b", "you me you", RegexOptions.None, 0, 10, true, "me");
yield return (@"(?:m(?:((e)?)??)|a)\b", "you a you", RegexOptions.None, 0, 9, true, "a");
Expand Down
Loading