diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs
index c71441a24a00a3..a6f166d106570a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs
@@ -102,6 +102,14 @@ public bool Equals(SymbolicRegexBuilder<TSet>.NodeCacheKey other) =>
         /// </summary>
         internal readonly Dictionary<(SymbolicRegexNode<TSet>, uint), SymbolicRegexNode<TSet>> _pruneLowerPriorityThanNullabilityCache = new();
 
+        /// <summary>
+        /// Cache for <see cref="SymbolicRegexNode{TSet}.StripEffects(SymbolicRegexBuilder{TSet})"/> keyed by the node
+        /// to strip. The value is the stripped node. This cache is essential for avoiding exponential blowup when
+        /// stripping effects from derivatives of deeply nested loop patterns (e.g. ((a)*)*), where the derivative
+        /// tree is a DAG with shared sub-trees that would otherwise be traversed repeatedly.
+        /// </summary>
+        internal readonly Dictionary<SymbolicRegexNode<TSet>, SymbolicRegexNode<TSet>> _stripEffectsCache = new();
+
         /// <summary>
         /// Cache for <see cref="SymbolicRegexNode{TSet}.Subsumes(SymbolicRegexBuilder{TSet}, SymbolicRegexNode{TSet}, int)"/> keyed by:
         /// -The node R potentially subsuming S
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
index 0c3c6edf7c168f..ba119848183047 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
@@ -416,6 +416,32 @@ internal static SymbolicRegexNode<TSet> CreateLoop(SymbolicRegexBuilder<TSet> bu
                 Debug.Assert(body._left is not null);
                 return CreateLoop(builder, body._left, 0, 1, isLazy || body.IsLazy);
             }
+            // Simplify (R*)* → R*, (R+)* → R*, (R*)+ → R*
+            // More generally: (R{a,∞}){b,∞} → R{0,∞} when the combined loop can match any
+            // number of R's including zero. This holds when either:
+            // - a == 0 (inner loop already matches ε, so each outer iteration can be empty), or
+            // - a == 1 && b == 0 (outer can take 0 iterations for ε, or n iterations of R+ for any n ≥ 1).
+            // Counterexample: (R{2,∞})* cannot match a single R, so it's NOT equivalent to R*.
+            // This is critical for performance: without it, deeply nested patterns like ((a)*)*
+            // cause exponential blowup in derivative computation.
+            // The blowup is in the per-character cost (pattern-dependent constant), not in input-length
+            // scaling — the engine remains O(input_length) for any given pattern — but the constant
+            // grows exponentially with nesting depth, making matching impractically slow.
+            // Additional requirements:
+            // - No effects (captures), since collapsing loops would change capture group bindings.
+            // - Same laziness for both loops, since mixing greedy/lazy changes match priorities
+            //   (e.g. (?:0*)+? is not equivalent to 0*? — the former prefers fewer outer iterations
+            //   each greedily consuming, while the latter prefers less overall).
+            // This is a while loop (not if + recursive CreateLoop call) so that arbitrary nesting
+            // depth is handled without consuming stack proportional to the depth.
+            while (upper == int.MaxValue && body._kind == SymbolicRegexNodeKind.Loop && body._upper == int.MaxValue
+                && (body._lower == 0 || (body._lower == 1 && lower == 0))
+                && !body._info.ContainsEffect && isLazy == body.IsLazy)
+            {
+                Debug.Assert(body._left is not null);
+                lower = 0;
+                body = body._left;
+            }
             return Create(builder, SymbolicRegexNodeKind.Loop, body, null, lower, upper, default, SymbolicRegexInfo.Loop(body._info, lower, isLazy));
         }
 
@@ -1372,40 +1398,62 @@ internal SymbolicRegexNode<TSet> StripEffects(SymbolicRegexBuilder<TSet> builder
             if (!_info.ContainsEffect)
                 return this;
 
+            // Check the cache to avoid redundant work. The derivative of deeply nested loop patterns
+            // like ((a)*)* produces a DAG with shared sub-trees that differ only in their Effect
+            // wrappers. Without caching, each shared sub-tree is traversed once per reference rather
+            // than once per unique node, leading to exponential time complexity in the nesting depth.
+            if (builder._stripEffectsCache.TryGetValue(this, out SymbolicRegexNode<TSet>? cached))
+                return cached;
+
+            SymbolicRegexNode<TSet> result;
+
             // Recurse over the structure of the node to strip effects
             switch (_kind)
             {
                 case SymbolicRegexNodeKind.Effect:
                     Debug.Assert(_left is not null && _right is not null);
                     // This is the place where the effect (the right child) is getting ignored
-                    return _left.StripEffects(builder);
+                    result = _left.StripEffects(builder);
+                    break;
 
                 case SymbolicRegexNodeKind.Concat:
                     Debug.Assert(_left is not null && _right is not null);
                     Debug.Assert(_left._info.ContainsEffect && !_right._info.ContainsEffect);
-                    return builder.CreateConcat(_left.StripEffects(builder), _right);
+                    result = builder.CreateConcat(_left.StripEffects(builder), _right);
+                    break;
 
                 case SymbolicRegexNodeKind.Alternate:
                     Debug.Assert(_left is not null && _right is not null);
-                    // This iterative handling of nested alternations is important to avoid quadratic work in deduplicating
-                    // the elements. We don't want to omit deduplication here, since he stripping may make nodes equal.
-                    List<SymbolicRegexNode<TSet>> elems = ToList(listKind: SymbolicRegexNodeKind.Alternate);
-                    for (int i = 0; i < elems.Count; i++)
-                        elems[i] = elems[i].StripEffects(builder);
-                    return builder.Alternate(elems);
+                    // Strip effects from each child first, leveraging the cache for shared sub-trees,
+                    // then flatten and deduplicate the results. Stripping before flattening is important:
+                    // the pre-stripped tree may have exponentially many paths that collapse after effects
+                    // are removed, but the cache ensures each unique sub-tree is only processed once.
+                    {
+                        SymbolicRegexNode<TSet> strippedLeft = _left.StripEffects(builder);
+                        SymbolicRegexNode<TSet> strippedRight = _right.StripEffects(builder);
+                        List<SymbolicRegexNode<TSet>> elems = strippedLeft.ToList(listKind: SymbolicRegexNodeKind.Alternate);
+                        strippedRight.ToList(elems, listKind: SymbolicRegexNodeKind.Alternate);
+                        result = builder.Alternate(elems);
+                    }
+                    break;
 
                 case SymbolicRegexNodeKind.DisableBacktrackingSimulation:
                     Debug.Assert(_left is not null);
-                    return builder.CreateDisableBacktrackingSimulation(_left.StripEffects(builder));
+                    result = builder.CreateDisableBacktrackingSimulation(_left.StripEffects(builder));
+                    break;
 
                 case SymbolicRegexNodeKind.Loop:
                     Debug.Assert(_left is not null);
-                    return builder.CreateLoop(_left.StripEffects(builder), IsLazy, _lower, _upper);
+                    result = builder.CreateLoop(_left.StripEffects(builder), IsLazy, _lower, _upper);
+                    break;
 
                 default:
                     Debug.Fail($"{nameof(StripEffects)}:{_kind}");
                     return null;
             }
+
+            // Cache the result so shared sub-trees are only stripped once.
+            builder._stripEffectsCache[this] = result;
+            return result;
         }
 
@@ -2295,8 +2343,23 @@ internal int CountSingletons()
             {
                 if (_lower is 0 or int.MaxValue)
                 {
-                    // infinite loop has the same size as a *-loop
-                    return _left.CountSingletons();
+                    int bodyCount = _left.CountSingletons();
+
+                    // When an unbounded loop's body is itself an unbounded loop, the derivative
+                    // computation creates a 2-way Alternate at each nesting level (because the
+                    // Concat derivative branches when its left side is nullable). This means
+                    // the per-character derivative cost doubles with each such nesting level.
+                    // Account for this so that the NFA size estimate catches deeply nested
+                    // patterns that would cause exponential blowup.
+                    // The loop flattening in CreateLoop already collapses same-laziness nesting
+                    // without captures, so this multiplier primarily affects patterns that survive
+                    // flattening (e.g., alternating lazy/greedy nesting at every level).
+                    if (_left._kind == SymbolicRegexNodeKind.Loop && _left._upper == int.MaxValue)
+                    {
+                        return Times(2, bodyCount);
+                    }
+
+                    return bodyCount;
                 }
 
                 // the upper bound is not being used, so the lower must be non-zero
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index cd5eb03827ffbd..da3dd0bbd72274 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -2581,6 +2581,32 @@ public static IEnumerable<object[]> Match_Count_TestData()
                 yield return new object[] { engine, $@"{b2}\w+{b2}", "one two three", 1 };
                 yield return new object[] { engine, $@"{b2}\w+{b2}", "one two", 0 };
             }
+
+            // Nested loop simplification correctness. The NonBacktracking engine simplifies
+            // certain nested loop patterns (e.g. (R*)* → R*) during construction. These tests
+            // verify the simplification produces results consistent across all engines.
+            foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
+            {
+                // Patterns that ARE simplified: (R*)*, (R+)*, (R*)+ all collapse to R*
+                yield return new object[] { engine, @"^(?:(?:a)*)*$", "aaa", 1 };
+                yield return new object[] { engine, @"^(?:(?:a)+)*$", "aaa", 1 };
+                yield return new object[] { engine, @"^(?:(?:a)*)+$", "aaa", 1 };
+
+                // (R{2,})* must NOT be simplified to R* — a single R is not in the language
+                yield return new object[] { engine, @"^(?:(?:a){2,})*$", "a", 0 };
+                yield return new object[] { engine, @"^(?:(?:a){2,})*$", "aa", 1 };
+                yield return new object[] { engine, @"^(?:(?:a){2,})*$", "", 1 };
+
+                // (R{2})* must NOT be simplified to R* — odd-length strings must not match
+                yield return new object[] { engine, @"^(?:(?:a){2})*$", "", 1 };
+                yield return new object[] { engine, @"^(?:(?:a){2})*$", "a", 0 };
+                yield return new object[] { engine, @"^(?:(?:a){2})*$", "aa", 1 };
+                yield return new object[] { engine, @"^(?:(?:a){2})*$", "aaa", 0 };
+                yield return new object[] { engine, @"^(?:(?:a){2})*$", "aaaa", 1 };
+
+                // Mixed laziness: greedy inner with lazy outer must not be conflated
+                yield return new object[] { engine, @"(?:(?:a*)+?)", "aaa", 2 };
+            }
         }
 
         [Theory]
@@ -2655,13 +2681,14 @@ public static IEnumerable<object[]> StressTestDeepNestingOfLoops_TestData()
        {
            foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
            {
-                if (engine != RegexEngine.NonBacktracking) // Hangs, or effectively hangs. https://github.com/dotnet/runtime/issues/84188
-                {
-                    yield return new object[] { engine, "(", "a", ")*", "a", 2000, 1000 };
-                }
-
-                yield return new object[] { engine, "(", "[aA]", ")+", "aA", 2000, 3000 };
-                yield return new object[] { engine, "(", "ab", "){0,1}", "ab", 2000, 1000 };
+                // NonBacktracking uses a lower nesting depth to avoid stack overflow in the recursive
+                // derivative processing. 120 levels is still well beyond what would previously hang
+                // (previously, even 20 levels would cause >60s execution time).
+                int depth = RegexHelpers.IsNonBacktracking(engine) ? 120 : 2000;
+
+                yield return new object[] { engine, "(", "a", ")*", "a", depth, 1000 };
+                yield return new object[] { engine, "(", "[aA]", ")+", "aA", depth, 3000 };
+                yield return new object[] { engine, "(", "ab", "){0,1}", "ab", depth, 1000 };
            }
        }
 
diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
index c14e5e366e53b0..b053ab126104c5 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System.Globalization;
+using System.Text;
 using Xunit;
 using System.Text.RegularExpressions.Symbolic;
 using System.Collections.Generic;
@@ -228,6 +229,25 @@ public static IEnumerable<object[]> UnsafeThresholdTests_MemberData()
                 SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
                 yield return new object[] { rootNode };
             }
+
+            // Deeply nested unbounded loops with alternating laziness bypass loop flattening
+            // (which requires matching laziness) and cause exponential derivative blowup.
+            // CountSingletons accounts for this by multiplying by 2 per nesting level, so
+            // at depth 15+ the estimate exceeds the default 10,000 threshold.
+            // Build: (?:(?:(?:a)*?)*)*? ... with alternating greedy/lazy at each level
+            {
+                const int depth = 20;
+                var sb = new StringBuilder();
+                for (int i = 0; i < depth; i++)
+                    sb.Append("(?:");
+                sb.Append('a');
+                for (int i = depth - 1; i >= 0; i--)
+                    sb.Append(i % 2 == 0 ? ")*" : ")*?");
+
+                RegexNode tree = RegexParser.Parse(sb.ToString(), options, CultureInfo.CurrentCulture).Root;
+                SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
+                yield return new object[] { rootNode };
+            }
         }
 
         [Theory]
@@ -238,6 +258,36 @@ public void UnsafeThresholdTests(object node)
             Assert.True(size > SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold());
         }
 
+        /// <summary>
+        /// Verifies that deeply nested same-laziness unbounded loops do NOT exceed the threshold,
+        /// because loop flattening in CreateLoop collapses them (e.g. ((?:a)*)* at depth 100 → a*).
+        /// </summary>
+        [Fact]
+        public void SameLazinessDeepNestingIsSafeAfterFlattening()
+        {
+            var charSetSolver = new CharSetSolver();
+            var bddBuilder = new SymbolicRegexBuilder<BDD>(charSetSolver, charSetSolver);
+            var converter = new RegexNodeConverter(bddBuilder, null);
+            RegexOptions options = RegexOptions.NonBacktracking | RegexOptions.ExplicitCapture;
+
+            // Build (?:(?:(?:a)*)*...)* at depth 100 — all same laziness (greedy)
+            const int depth = 100;
+            var sb = new StringBuilder();
+            for (int i = 0; i < depth; i++)
+                sb.Append("(?:");
+            sb.Append('a');
+            for (int i = 0; i < depth; i++)
+                sb.Append(")*");
+
+            RegexNode tree = RegexParser.Parse(sb.ToString(), options, CultureInfo.CurrentCulture).Root;
+            SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
+
+            // After loop flattening, this collapses to a*, so the estimate should be tiny
+            int size = rootNode.EstimateNfaSize();
+            Assert.True(size <= SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold(),
+                $"Same-laziness nested loops should be safe after flattening, but EstimateNfaSize returned {size}");
+        }
+
         [Theory]
         [InlineData(200, 200)]
         [InlineData(10_000, 10_000)]