From 0de9177ff7f509ea6cbca8fc4c77a66db9ea793a Mon Sep 17 00:00:00 2001 From: Dan Moseley Date: Fri, 6 Mar 2026 13:04:53 -0700 Subject: [PATCH 1/5] Fix NonBacktracking exponential blowup on nested loops The NonBacktracking regex engine suffered exponential time complexity when processing deeply nested loop patterns like ((a)*)*. Even depth 20 could take >60 seconds, and depth 2000 with a single-character input took ~11s. Two changes fix this: 1. Loop flattening in CreateLoop: simplify (R*)* to R* (and variants like (R+)*, (R*)+) when both loops have unbounded upper, compatible lower bounds, same laziness, and no capture effects. This eliminates the exponential state space in derivative computation. 2. StripEffects caching: cache results of StripEffects to avoid repeatedly traversing shared sub-trees in the derivative DAG, which otherwise causes exponential work even after loop flattening for patterns with capture groups. The loop flattening condition is carefully restricted to preserve correctness: - body._lower must be 0 or 1 (with outer lower 0): (R{2,})* cannot match a single R, so it must not be simplified to R*. - Both loops must have the same laziness: (?:R*)+? (greedy inner, lazy outer) has different match semantics than R*?. - No capture effects: collapsing loops would change capture group bindings. Re-enables StressTestDeepNestingOfLoops for NonBacktracking (depth 120). Fixes #84188 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Symbolic/SymbolicRegexBuilder.cs | 8 +++ .../Symbolic/SymbolicRegexNode.cs | 62 ++++++++++++++++--- .../FunctionalTests/Regex.Match.Tests.cs | 32 ++++++++-- 3 files changed, 86 insertions(+), 16 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs index c71441a24a00a3..a6f166d106570a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs @@ -102,6 +102,14 @@ public bool Equals(SymbolicRegexBuilder.NodeCacheKey other) => /// internal readonly Dictionary<(SymbolicRegexNode, uint), SymbolicRegexNode> _pruneLowerPriorityThanNullabilityCache = new(); + /// + /// Cache for keyed by the node + /// to strip. The value is the stripped node. This cache is essential for avoiding exponential blowup when + /// stripping effects from derivatives of deeply nested loop patterns (e.g. ((a)*)*), where the derivative + /// tree is a DAG with shared sub-trees that would otherwise be traversed repeatedly. + /// + internal readonly Dictionary, SymbolicRegexNode> _stripEffectsCache = new(); + /// /// Cache for keyed by: /// -The node R potentially subsuming S diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 0c3c6edf7c168f..27e8c6d9f12336 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -416,6 +416,26 @@ internal static SymbolicRegexNode CreateLoop(SymbolicRegexBuilder bu Debug.Assert(body._left is not null); return CreateLoop(builder, body._left, 0, 1, isLazy || body.IsLazy); } + // Simplify (R*)* → R*, (R+)* → R*, (R*)+ → R* + // More generally: (R{a,∞}){b,∞} → R{0,∞} when the combined loop can match any + // number of R's including zero. This holds when either: + // - a == 0 (inner loop already matches ε, so each outer iteration can be empty), or + // - a == 1 && b == 0 (outer can take 0 iterations for ε, or n iterations of R+ for any n ≥ 1). + // Counterexample: (R{2,∞})* cannot match a single R, so it's NOT equivalent to R*. + // This is critical for performance: without it, deeply nested patterns like ((a)*)* + // cause exponential blowup in derivative computation (https://github.com/dotnet/runtime/issues/84188). + // Additional requirements: + // - No effects (captures), since collapsing loops would change capture group bindings. + // - Same laziness for both loops, since mixing greedy/lazy changes match priorities + // (e.g. (?:0*)+? is not equivalent to 0*? — the former prefers fewer outer iterations + // each greedily consuming, while the latter prefers less overall). + if (upper == int.MaxValue && body._kind == SymbolicRegexNodeKind.Loop && body._upper == int.MaxValue + && (body._lower == 0 || (body._lower == 1 && lower == 0)) + && !body._info.ContainsEffect && isLazy == body.IsLazy) + { + Debug.Assert(body._left is not null); + return CreateLoop(builder, body._left, 0, int.MaxValue, isLazy); + } return Create(builder, SymbolicRegexNodeKind.Loop, body, null, lower, upper, default, SymbolicRegexInfo.Loop(body._info, lower, isLazy)); } @@ -1372,40 +1392,62 @@ internal SymbolicRegexNode StripEffects(SymbolicRegexBuilder builder if (!_info.ContainsEffect) return this; + // Check the cache to avoid redundant work. The derivative of deeply nested loop patterns + // like ((a)*)* produces a DAG with shared sub-trees that differ only in their Effect + // wrappers. Without caching, each shared sub-tree is traversed once per reference rather + // than once per unique node, leading to exponential time complexity in the nesting depth. + if (builder._stripEffectsCache.TryGetValue(this, out SymbolicRegexNode? cached)) + return cached; + + SymbolicRegexNode result; + // Recurse over the structure of the node to strip effects switch (_kind) { case SymbolicRegexNodeKind.Effect: Debug.Assert(_left is not null && _right is not null); // This is the place where the effect (the right child) is getting ignored - return _left.StripEffects(builder); + result = _left.StripEffects(builder); + break; case SymbolicRegexNodeKind.Concat: Debug.Assert(_left is not null && _right is not null); Debug.Assert(_left._info.ContainsEffect && !_right._info.ContainsEffect); - return builder.CreateConcat(_left.StripEffects(builder), _right); + result = builder.CreateConcat(_left.StripEffects(builder), _right); + break; case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); - // This iterative handling of nested alternations is important to avoid quadratic work in deduplicating - // the elements. We don't want to omit deduplication here, since he stripping may make nodes equal. - List> elems = ToList(listKind: SymbolicRegexNodeKind.Alternate); - for (int i = 0; i < elems.Count; i++) - elems[i] = elems[i].StripEffects(builder); - return builder.Alternate(elems); + // Strip effects from each child first, leveraging the cache for shared sub-trees, + // then flatten and deduplicate the results. Stripping before flattening is important: + // the pre-stripped tree may have exponentially many paths that collapse after effects + // are removed, but the cache ensures each unique sub-tree is only processed once. + { + SymbolicRegexNode strippedLeft = _left.StripEffects(builder); + SymbolicRegexNode strippedRight = _right.StripEffects(builder); + List> elems = strippedLeft.ToList(listKind: SymbolicRegexNodeKind.Alternate); + strippedRight.ToList(elems, listKind: SymbolicRegexNodeKind.Alternate); + result = builder.Alternate(elems); + } + break; case SymbolicRegexNodeKind.DisableBacktrackingSimulation: Debug.Assert(_left is not null); - return builder.CreateDisableBacktrackingSimulation(_left.StripEffects(builder)); + result = builder.CreateDisableBacktrackingSimulation(_left.StripEffects(builder)); + break; case SymbolicRegexNodeKind.Loop: Debug.Assert(_left is not null); - return builder.CreateLoop(_left.StripEffects(builder), IsLazy, _lower, _upper); + result = builder.CreateLoop(_left.StripEffects(builder), IsLazy, _lower, _upper); + break; default: Debug.Fail($"{nameof(StripEffects)}:{_kind}"); return null; } + + builder._stripEffectsCache[this] = result; + return result; } /// diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index c168ed04c54396..7f42d33aa628c5 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2505,6 +2505,25 @@ public static IEnumerable Match_Count_TestData() yield return new object[] { engine, $@"{b2}\w+{b2}", "one two three", 1 }; yield return new object[] { engine, $@"{b2}\w+{b2}", "one two", 0 }; } + + // Nested loop simplification correctness. The NonBacktracking engine simplifies + // certain nested loop patterns (e.g. (R*)* → R*) during construction. These tests + // verify the simplification produces results consistent across all engines. + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + // Patterns that ARE simplified: (R*)*, (R+)*, (R*)+ all collapse to R* + yield return new object[] { engine, @"^(?:(?:a)*)*$", "aaa", 1 }; + yield return new object[] { engine, @"^(?:(?:a)+)*$", "aaa", 1 }; + yield return new object[] { engine, @"^(?:(?:a)*)+$", "aaa", 1 }; + + // (R{2,})* must NOT be simplified to R* — a single R is not in the language + yield return new object[] { engine, @"^(?:(?:a){2,})*$", "a", 0 }; + yield return new object[] { engine, @"^(?:(?:a){2,})*$", "aa", 1 }; + yield return new object[] { engine, @"^(?:(?:a){2,})*$", "", 1 }; + + // Mixed laziness: greedy inner with lazy outer must not be conflated + yield return new object[] { engine, @"(?:(?:a*)+?)", "aaa", 2 }; + } } [Theory] @@ -2579,13 +2598,14 @@ public static IEnumerable StressTestDeepNestingOfLoops_TestData() { foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { - if (engine != RegexEngine.NonBacktracking) // Hangs, or effectively hangs. https://github.com/dotnet/runtime/issues/84188 - { - yield return new object[] { engine, "(", "a", ")*", "a", 2000, 1000 }; - } + // NonBacktracking uses a lower nesting depth to avoid stack overflow in the recursive + // derivative processing. 120 levels is still well beyond what would previously hang + // (previously, even 20 levels would cause >60s execution time). + int depth = RegexHelpers.IsNonBacktracking(engine) ? 120 : 2000; - yield return new object[] { engine, "(", "[aA]", ")+", "aA", 2000, 3000 }; - yield return new object[] { engine, "(", "ab", "){0,1}", "ab", 2000, 1000 }; + yield return new object[] { engine, "(", "a", ")*", "a", depth, 1000 }; + yield return new object[] { engine, "(", "[aA]", ")+", "aA", depth, 3000 }; + yield return new object[] { engine, "(", "ab", "){0,1}", "ab", depth, 1000 }; } } From 39623a5e460947aba036b84e0dce86cb45b4173d Mon Sep 17 00:00:00 2001 From: Dan Moseley Date: Fri, 6 Mar 2026 13:33:42 -0700 Subject: [PATCH 2/5] Make nested loop simplification iterative Change the nested loop flattening in CreateLoop from a recursive call (if + return CreateLoop) to a while loop that mutates lower/body in place. This avoids consuming stack proportional to the nesting depth, which could cause a stack overflow for deeply nested patterns. Also note in comments that the exponential blowup occurs during lazy DFA construction within matching, not just pattern compilation, so it violates the engine's linear-time-in-input-length guarantee. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../RegularExpressions/Symbolic/SymbolicRegexNode.cs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 27e8c6d9f12336..aaf21d20006457 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -424,17 +424,22 @@ internal static SymbolicRegexNode CreateLoop(SymbolicRegexBuilder bu // Counterexample: (R{2,∞})* cannot match a single R, so it's NOT equivalent to R*. // This is critical for performance: without it, deeply nested patterns like ((a)*)* // cause exponential blowup in derivative computation (https://github.com/dotnet/runtime/issues/84188). + // The blowup occurs during lazy DFA construction within matching, not just pattern compilation, + // so it violates the engine's linear-time-in-input-length guarantee. // Additional requirements: // - No effects (captures), since collapsing loops would change capture group bindings. // - Same laziness for both loops, since mixing greedy/lazy changes match priorities // (e.g. (?:0*)+? is not equivalent to 0*? — the former prefers fewer outer iterations // each greedily consuming, while the latter prefers less overall). - if (upper == int.MaxValue && body._kind == SymbolicRegexNodeKind.Loop && body._upper == int.MaxValue + // This is a while loop (not if + recursive CreateLoop call) so that arbitrary nesting + // depth is handled without consuming stack proportional to the depth. + while (upper == int.MaxValue && body._kind == SymbolicRegexNodeKind.Loop && body._upper == int.MaxValue && (body._lower == 0 || (body._lower == 1 && lower == 0)) && !body._info.ContainsEffect && isLazy == body.IsLazy) { Debug.Assert(body._left is not null); - return CreateLoop(builder, body._left, 0, int.MaxValue, isLazy); + lower = 0; + body = body._left; } return Create(builder, SymbolicRegexNodeKind.Loop, body, null, lower, upper, default, SymbolicRegexInfo.Loop(body._info, lower, isLazy)); } From 882dc6e5d0ce844dea1e397cf0e3c63cff3e51ef Mon Sep 17 00:00:00 2001 From: Dan Moseley Date: Tue, 10 Mar 2026 20:11:56 -0600 Subject: [PATCH 3/5] Fix comment: nested loop blowup doesn't violate linear-time guarantee The engine remains O(input_length) for any given pattern. The blowup is in the per-character cost (pattern-dependent constant), not in input-length scaling. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index aaf21d20006457..0a77d8978fb21a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -424,8 +424,9 @@ internal static SymbolicRegexNode CreateLoop(SymbolicRegexBuilder bu // Counterexample: (R{2,∞})* cannot match a single R, so it's NOT equivalent to R*. // This is critical for performance: without it, deeply nested patterns like ((a)*)* // cause exponential blowup in derivative computation (https://github.com/dotnet/runtime/issues/84188). - // The blowup occurs during lazy DFA construction within matching, not just pattern compilation, - // so it violates the engine's linear-time-in-input-length guarantee. + // The blowup is in the per-character cost (pattern-dependent constant), not in input-length + // scaling — the engine remains O(input_length) for any given pattern — but the constant + // grows exponentially with nesting depth, making matching impractically slow. // Additional requirements: // - No effects (captures), since collapsing loops would change capture group bindings. // - Same laziness for both loops, since mixing greedy/lazy changes match priorities From 1c017caa0d223167692dcd1c1cb28964fa24cc6c Mon Sep 17 00:00:00 2001 From: Dan Moseley Date: Tue, 10 Mar 2026 20:13:38 -0600 Subject: [PATCH 4/5] Remove issue URL from code comment Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 0a77d8978fb21a..9c24d9f6ceefbe 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -423,7 +423,7 @@ internal static SymbolicRegexNode CreateLoop(SymbolicRegexBuilder bu // - a == 1 && b == 0 (outer can take 0 iterations for ε, or n iterations of R+ for any n ≥ 1). // Counterexample: (R{2,∞})* cannot match a single R, so it's NOT equivalent to R*. // This is critical for performance: without it, deeply nested patterns like ((a)*)* - // cause exponential blowup in derivative computation (https://github.com/dotnet/runtime/issues/84188). + // cause exponential blowup in derivative computation. // The blowup is in the per-character cost (pattern-dependent constant), not in input-length // scaling — the engine remains O(input_length) for any given pattern — but the constant // grows exponentially with nesting depth, making matching impractically slow. From 6e341c52757dfeb3f7bc2b994abebebf1d90ae1e Mon Sep 17 00:00:00 2001 From: Dan Moseley Date: Wed, 11 Mar 2026 19:15:32 -0600 Subject: [PATCH 5/5] Account for nested unbounded loops in NFA size estimate Improve CountSingletons to multiply the estimate by 2 when an unbounded loop's body is itself an unbounded loop. This causes the NFA size estimate to grow exponentially with nesting depth, so patterns with alternating lazy/greedy nesting at 15+ levels are rejected upfront. Add tests: - Alternating-laziness depth-20 pattern exceeds unsafe threshold - Same-laziness depth-100 stays safe (loop flattening collapses it) - (a{2})* correctness: odd-length strings must not match Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Symbolic/SymbolicRegexNode.cs | 19 ++++++- .../FunctionalTests/Regex.Match.Tests.cs | 7 +++ .../tests/UnitTests/SymbolicRegexTests.cs | 50 +++++++++++++++++++ 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 9c24d9f6ceefbe..ba119848183047 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -2343,8 +2343,23 @@ internal int CountSingletons() { if (_lower is 0 or int.MaxValue) { - // infinite loop has the same size as a *-loop - return _left.CountSingletons(); + int bodyCount = _left.CountSingletons(); + + // When an unbounded loop's body is itself an unbounded loop, the derivative + // computation creates a 2-way Alternate at each nesting level (because the + // Concat derivative branches when its left side is nullable). This means + // the per-character derivative cost doubles with each such nesting level. + // Account for this so that the NFA size estimate catches deeply nested + // patterns that would cause exponential blowup. + // The loop flattening in CreateLoop already collapses same-laziness nesting + // without captures, so this multiplier primarily affects patterns that survive + // flattening (e.g., alternating lazy/greedy nesting at every level). + if (_left._kind == SymbolicRegexNodeKind.Loop && _left._upper == int.MaxValue) + { + return Times(2, bodyCount); + } + + return bodyCount; } // the upper bound is not being used, so the lower must be non-zero diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 7f42d33aa628c5..330fb94f03ffa8 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2521,6 +2521,13 @@ public static IEnumerable Match_Count_TestData() yield return new object[] { engine, @"^(?:(?:a){2,})*$", "aa", 1 }; yield return new object[] { engine, @"^(?:(?:a){2,})*$", "", 1 }; + // (R{2})* must NOT be simplified to R* — odd-length strings must not match + yield return new object[] { engine, @"^(?:(?:a){2})*$", "", 1 }; + yield return new object[] { engine, @"^(?:(?:a){2})*$", "a", 0 }; + yield return new object[] { engine, @"^(?:(?:a){2})*$", "aa", 1 }; + yield return new object[] { engine, @"^(?:(?:a){2})*$", "aaa", 0 }; + yield return new object[] { engine, @"^(?:(?:a){2})*$", "aaaa", 1 }; + // Mixed laziness: greedy inner with lazy outer must not be conflated yield return new object[] { engine, @"(?:(?:a*)+?)", "aaa", 2 }; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs index c14e5e366e53b0..b053ab126104c5 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Globalization; +using System.Text; using Xunit; using System.Text.RegularExpressions.Symbolic; using System.Collections.Generic; @@ -228,6 +229,25 @@ public static IEnumerable UnsafeThresholdTests_MemberData() SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); yield return new object[] { rootNode }; } + + // Deeply nested unbounded loops with alternating laziness bypass loop flattening + // (which requires matching laziness) and cause exponential derivative blowup. + // CountSingletons accounts for this by multiplying by 2 per nesting level, so + // at depth 15+ the estimate exceeds the default 10,000 threshold. + // Build: (?:(?:(?:a)*?)*)*? ... with alternating greedy/lazy at each level + { + const int depth = 20; + var sb = new StringBuilder(); + for (int i = 0; i < depth; i++) + sb.Append("(?:"); + sb.Append('a'); + for (int i = depth - 1; i >= 0; i--) + sb.Append(i % 2 == 0 ? ")*" : ")*?"); + + RegexNode tree = RegexParser.Parse(sb.ToString(), options, CultureInfo.CurrentCulture).Root; + SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); + yield return new object[] { rootNode }; + } } [Theory] @@ -238,6 +258,36 @@ public void UnsafeThresholdTests(object node) Assert.True(size > SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold()); } + /// + /// Verifies that deeply nested same-laziness unbounded loops do NOT exceed the threshold, + /// because loop flattening in CreateLoop collapses them (e.g. ((?:a)*)* at depth 100 → a*). + /// + [Fact] + public void SameLazinessDeepNestingIsSafeAfterFlattening() + { + var charSetSolver = new CharSetSolver(); + var bddBuilder = new SymbolicRegexBuilder(charSetSolver, charSetSolver); + var converter = new RegexNodeConverter(bddBuilder, null); + RegexOptions options = RegexOptions.NonBacktracking | RegexOptions.ExplicitCapture; + + // Build (?:(?:(?:a)*)*...)* at depth 100 — all same laziness (greedy) + const int depth = 100; + var sb = new StringBuilder(); + for (int i = 0; i < depth; i++) + sb.Append("(?:"); + sb.Append('a'); + for (int i = 0; i < depth; i++) + sb.Append(")*"); + + RegexNode tree = RegexParser.Parse(sb.ToString(), options, CultureInfo.CurrentCulture).Root; + SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); + + // After loop flattening, this collapses to a*, so the estimate should be tiny + int size = rootNode.EstimateNfaSize(); + Assert.True(size <= SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold(), + $"Same-laziness nested loops should be safe after flattening, but EstimateNfaSize returned {size}"); + } + [Theory] [InlineData(200, 200)] [InlineData(10_000, 10_000)]