From b2ac7f2af7f59dcf97d4a99a6a1c7402e701fc62 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 10 Feb 2026 15:55:59 -0500 Subject: [PATCH] Fix incorrect atomic loop optimization when body contains backtracking MakeLoopAtomic wraps a loop in an Atomic group, which discards all backtracking state from the body. However, CanBeMadeAtomic only proves that giving back iterations is unnecessary... it does not account for within-body backtracking. When the body itself contains backtracking constructs, the atomic wrapper incorrectly suppresses that internal backtracking, producing wrong match results. The fix adds a MayContainBacktracking check that walks the loop body tree before allowing MakeLoopAtomic. A loop is only made atomic when the body has no backtracking constructs of its own. The backtracking-construct detection logic is extracted into a reusable IsBacktrackingConstruct property on RegexNode, which RegexTreeAnalyzer now also uses, removing the duplicated switch. Tests are added covering lazy/greedy single-char loops, char-class loops, alternations, and general multi-char loops inside outer loop bodies, as well as cases where the optimization should still apply (non-backtracking bodies). --- .../Text/RegularExpressions/RegexNode.cs | 60 +++++++++++++++++-- .../RegularExpressions/RegexTreeAnalyzer.cs | 8 +-- .../FunctionalTests/Regex.Match.Tests.cs | 36 +++++++++++ 3 files changed, 94 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index e8ecee73f7a1df..65f470ce2d31c8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2018,12 +2018,18 @@ static void ProcessNode(RegexNode node, RegexNode subsequent) loopChild = loopChild.Child(loopChild.ChildCount() - 1); } + // MakeLoopAtomic wraps the loop in an Atomic group, which discards all backtracking + // state from within the body. CanBeMadeAtomic only proves that giving back iterations + // is unnecessary, but the Atomic wrapper also prevents within-body backtracking from + // being triggered by subsequent failures. That's only safe when the body has no + // backtracking of its own and the last descendant is a type that won't be adversely + // affected by seeing an Atomic ancestor. if (loopChild.Kind is - RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or - RegexNodeKind.Multi or - RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set) + RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or + RegexNodeKind.Multi or + RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set && + !MayContainBacktracking(node.Child(0))) { - // For types on the allow list, we can make the loop itself atomic. node.MakeLoopAtomic(); } else if (node.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop) @@ -2516,6 +2522,52 @@ bool MayOverlapStartingOrEndingSet(string set) => } } + /// Gets whether this node is itself a backtracking construct. + /// + /// This checks the node in isolation (not its children). A node is a backtracking construct + /// if it's a variable-width loop or an alternation. + /// + public bool IsBacktrackingConstruct => Kind switch + { + RegexNodeKind.Alternate => true, + RegexNodeKind.Loop or RegexNodeKind.Lazyloop when M != N => true, + RegexNodeKind.Oneloop or RegexNodeKind.Onelazy or + RegexNodeKind.Notoneloop or RegexNodeKind.Notonelazy or + RegexNodeKind.Setloop or RegexNodeKind.Setlazy when M != N => true, + _ => false, + }; + + /// + /// Checks whether a node tree may contain backtracking constructs (variable-width loops or alternations). + /// + private static bool MayContainBacktracking(RegexNode node) + { + // If we can't recur, just assume the worst and say that it may contain backtracking constructs. + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + return true; + } + + // If this node is a backtracking construct, then obviously it may contain backtracking constructs. + if (node.IsBacktrackingConstruct) + { + return true; + } + + // Otherwise, we need to check the children to see if any of them may contain backtracking constructs. + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (MayContainBacktracking(node.Child(i))) + { + return true; + } + } + + // No backtracking is possible. + return false; + } + /// Gets whether this node is known to be immediately preceded by a word character. public bool IsKnownPrecededByWordChar() => IsKnownPrecededOrSucceededByWordChar(false); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs index 5d02fe6bd8c863..20f658ba753603 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs @@ -44,13 +44,9 @@ static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByA // Certain kinds of nodes incur backtracking logic themselves: add them to the backtracking collection. // We may later find that a node contains another that has backtracking; we'll add nodes based on that // after examining the children. - switch (node.Kind) + if (node.IsBacktrackingConstruct) { - case RegexNodeKind.Alternate: - case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.M != node.N: - case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when node.M != node.N: - (results._mayBacktrack ??= new HashSet()).Add(node); - break; + (results._mayBacktrack ??= new HashSet()).Add(node); } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 2cc54b7540dae8..1aae6a238685d9 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -334,6 +334,42 @@ public static IEnumerable Match_MemberData() yield return (@"(abcd*?)+e", "abcde", RegexOptions.None, 0, 5, true, "abcde"); yield return (@"(abcd*)+?e", "abcde", RegexOptions.None, 0, 5, true, "abcde"); yield return (@"(abcd*?)+?e", "abcde", RegexOptions.None, 0, 5, true, "abcde"); + + // Lazy loop inside loop body where the Atomic wrapper must not suppress within-body backtracking. + // Each variation tests a different kind of backtracking construct inside the loop body. + // Notonelazy (.*?): lazy . expands past first ) to find the correct closing delimiter. + yield return (@"\w+(\(.*?\))?,", @"Foo(one()?,two,three),", RegexOptions.None, 0, 22, true, @"Foo(one()?,two,three),"); + yield return (@"\w+(<.*?>)?!", "foob>!", RegexOptions.None, 0, 9, true, "foob>!"); + yield return (@"(\(.*?\))+;", "(a)b);", RegexOptions.None, 0, 6, true, "(a)b);"); + yield return (@"(a.*?b)+c", "axbbc", RegexOptions.None, 0, 5, true, "axbbc"); + // Notoneloop (.+): greedy . overshoots past the right ), must back up to find an earlier one. + yield return (@"\w+(\(.+\))?,", "Foo(a),(b)x,", RegexOptions.None, 0, 12, true, "Foo(a),"); + // Setlazy ([^,]+?): lazy char class expands past first ) to find the right one. + yield return (@"\w+(\([^,]+?\))?,", "Foo(one()three),", RegexOptions.None, 0, 16, true, "Foo(one()three),"); + // Setloop ([^;]+): greedy char class overshoots past the right ), must back up to earlier one. + yield return (@"\w+(\([^;]+\))?,", "Foo(a),b)x,", RegexOptions.None, 0, 11, true, "Foo(a),"); + // Onelazy ()+?): lazy single-char loop must expand to consume more )'s for , to follow. + yield return (@"\w+(\(\)+?\))?,", "Foo())),", RegexOptions.None, 0, 8, true, "Foo())),"); + // Oneloop ()+): greedy single-char loop of the delimiter char itself. + yield return (@"\w+(\(\)+\))?,", "Foo())),", RegexOptions.None, 0, 8, true, "Foo())),"); + // Alternate: short branch matches + \) succeeds, but , fails. Longer branch includes \) in it. + yield return (@"\w+(\((?:a|a\)b)\))?,", "Foo(a)b),", RegexOptions.None, 0, 9, true, "Foo(a)b),"); + // Loop (general greedy multi-char body): 2 iters of \), overshoots, backing up to 1 iter works. + yield return (@"\w+(\((?:\),){1,3}\))?,", "Foo(),),)),", RegexOptions.None, 0, 11, true, "Foo(),),"); + // Lazyloop (general lazy multi-char body): 1 iter of \)a, outer \) OK but , fails. Expand to 2. + yield return (@"\w+(\((?:\)a){1,3}?\))?,", "Foo()a)a),", RegexOptions.None, 0, 10, true, "Foo()a)a),"); + // Same patterns with lazy outer loops (??, +?, *?) to ensure lazy expansion is not broken. + yield return (@"\w+(\(.*?\))??,", @"Foo(one()?,two,three),", RegexOptions.None, 0, 22, true, @"Foo(one()?,two,three),"); + yield return (@"(\(.*?\))+?;", "(a)b);", RegexOptions.None, 0, 6, true, "(a)b);"); + yield return (@"(\(.*?\))*;", "(a)b);", RegexOptions.None, 0, 6, true, "(a)b);"); + yield return (@"(\(.*?\))*?;", "(a)b);", RegexOptions.None, 0, 6, true, "(a)b);"); + // Non-backtracking body: optimization correctly applies. These verify bodies without backtracking + // inside the outer loop are still correctly matched after the outer loop is made atomic. + yield return (@"\w+(\(abc\))?,", "Foo(abc),", RegexOptions.None, 0, 9, true, "Foo(abc),"); + yield return (@"\w+(\(abc\))??,", "Foo(abc),", RegexOptions.None, 0, 9, true, "Foo(abc),"); + yield return (@"\w+(\(\w{3}\))?,", "Foo(abc),", RegexOptions.None, 0, 9, true, "Foo(abc),"); + yield return (@"\w+(\(\w+!\))?,", "Foo(abc!),", RegexOptions.None, 0, 10, true, "Foo(abc!),"); + yield return (@"(?:m(?:((e)?)??)|a)\b", "you m you", RegexOptions.None, 0, 9, true, "m"); yield return (@"(?:m(?:((e)?)??)|a)\b", "you me you", RegexOptions.None, 0, 10, true, "me"); yield return (@"(?:m(?:((e)?)??)|a)\b", "you a you", RegexOptions.None, 0, 9, true, "a");