Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,14 @@ public bool Equals(SymbolicRegexBuilder<TSet>.NodeCacheKey other) =>
/// </summary>
internal readonly Dictionary<(SymbolicRegexNode<TSet>, uint), SymbolicRegexNode<TSet>> _pruneLowerPriorityThanNullabilityCache = new();

/// <summary>
/// Cache for <see cref="SymbolicRegexNode{TSet}.StripEffects(SymbolicRegexBuilder{TSet})"/> keyed by the node
/// to strip. The value is the stripped node. This cache is essential for avoiding exponential blowup when
/// stripping effects from derivatives of deeply nested loop patterns (e.g. ((a)*)*), where the derivative
/// tree is a DAG with shared sub-trees that would otherwise be traversed repeatedly.
/// </summary>
internal readonly Dictionary<SymbolicRegexNode<TSet>, SymbolicRegexNode<TSet>> _stripEffectsCache = new();

/// <summary>
/// Cache for <see cref="SymbolicRegexNode{TSet}.Subsumes(SymbolicRegexBuilder{TSet}, SymbolicRegexNode{TSet}, int)"/> keyed by:
/// -The node R potentially subsuming S
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,32 @@ internal static SymbolicRegexNode<TSet> CreateLoop(SymbolicRegexBuilder<TSet> bu
Debug.Assert(body._left is not null);
return CreateLoop(builder, body._left, 0, 1, isLazy || body.IsLazy);
}
// Simplify (R*)* → R*, (R+)* → R*, (R*)+ → R*
// More generally: (R{a,∞}){b,∞} → R{0,∞} when the combined loop can match any
// number of R's including zero. This holds when either:
// - a == 0 (inner loop already matches ε, so each outer iteration can be empty), or
// - a == 1 && b == 0 (outer can take 0 iterations for ε, or n iterations of R+ for any n ≥ 1).
// Counterexample: (R{2,∞})* cannot match a single R, so it's NOT equivalent to R*.
// This is critical for performance: without it, deeply nested patterns like ((a)*)*
// cause exponential blowup in derivative computation.
// The blowup is in the per-character cost (pattern-dependent constant), not in input-length
// scaling — the engine remains O(input_length) for any given pattern — but the constant
// grows exponentially with nesting depth, making matching impractically slow.
// Additional requirements:
// - No effects (captures), since collapsing loops would change capture group bindings.
// - Same laziness for both loops, since mixing greedy/lazy changes match priorities
// (e.g. (?:0*)+? is not equivalent to 0*? — the former prefers fewer outer iterations
// each greedily consuming, while the latter prefers to consume as little as possible overall).
// This is a while loop (not if + recursive CreateLoop call) so that arbitrary nesting
// depth is handled without consuming stack proportional to the depth.
while (upper == int.MaxValue && body._kind == SymbolicRegexNodeKind.Loop && body._upper == int.MaxValue
&& (body._lower == 0 || (body._lower == 1 && lower == 0))
&& !body._info.ContainsEffect && isLazy == body.IsLazy)
{
Debug.Assert(body._left is not null);
lower = 0;
body = body._left;
}
return Create(builder, SymbolicRegexNodeKind.Loop, body, null, lower, upper, default, SymbolicRegexInfo.Loop(body._info, lower, isLazy));
}

Expand Down Expand Up @@ -1372,40 +1398,62 @@ internal SymbolicRegexNode<TSet> StripEffects(SymbolicRegexBuilder<TSet> builder
if (!_info.ContainsEffect)
return this;

// Check the cache to avoid redundant work. The derivative of deeply nested loop patterns
// like ((a)*)* produces a DAG with shared sub-trees that differ only in their Effect
// wrappers. Without caching, each shared sub-tree is traversed once per reference rather
// than once per unique node, leading to exponential time complexity in the nesting depth.
if (builder._stripEffectsCache.TryGetValue(this, out SymbolicRegexNode<TSet>? cached))
return cached;

SymbolicRegexNode<TSet> result;

// Recurse over the structure of the node to strip effects
switch (_kind)
{
case SymbolicRegexNodeKind.Effect:
Debug.Assert(_left is not null && _right is not null);
// This is the place where the effect (the right child) is getting ignored
return _left.StripEffects(builder);
result = _left.StripEffects(builder);
break;

case SymbolicRegexNodeKind.Concat:
Debug.Assert(_left is not null && _right is not null);
Debug.Assert(_left._info.ContainsEffect && !_right._info.ContainsEffect);
return builder.CreateConcat(_left.StripEffects(builder), _right);
result = builder.CreateConcat(_left.StripEffects(builder), _right);
break;

case SymbolicRegexNodeKind.Alternate:
Debug.Assert(_left is not null && _right is not null);
// This iterative handling of nested alternations is important to avoid quadratic work in deduplicating
// the elements. We don't want to omit deduplication here, since the stripping may make nodes equal.
List<SymbolicRegexNode<TSet>> elems = ToList(listKind: SymbolicRegexNodeKind.Alternate);
for (int i = 0; i < elems.Count; i++)
elems[i] = elems[i].StripEffects(builder);
return builder.Alternate(elems);
// Strip effects from each child first, leveraging the cache for shared sub-trees,
// then flatten and deduplicate the results. Stripping before flattening is important:
// the pre-stripped tree may have exponentially many paths that collapse after effects
// are removed, but the cache ensures each unique sub-tree is only processed once.
{
SymbolicRegexNode<TSet> strippedLeft = _left.StripEffects(builder);
SymbolicRegexNode<TSet> strippedRight = _right.StripEffects(builder);
List<SymbolicRegexNode<TSet>> elems = strippedLeft.ToList(listKind: SymbolicRegexNodeKind.Alternate);
strippedRight.ToList(elems, listKind: SymbolicRegexNodeKind.Alternate);
result = builder.Alternate(elems);
}
break;

case SymbolicRegexNodeKind.DisableBacktrackingSimulation:
Debug.Assert(_left is not null);
return builder.CreateDisableBacktrackingSimulation(_left.StripEffects(builder));
result = builder.CreateDisableBacktrackingSimulation(_left.StripEffects(builder));
break;

case SymbolicRegexNodeKind.Loop:
Debug.Assert(_left is not null);
return builder.CreateLoop(_left.StripEffects(builder), IsLazy, _lower, _upper);
result = builder.CreateLoop(_left.StripEffects(builder), IsLazy, _lower, _upper);
break;

default:
Debug.Fail($"{nameof(StripEffects)}:{_kind}");
return null;
}

builder._stripEffectsCache[this] = result;
return result;
}

/// <summary>
Expand Down Expand Up @@ -2295,8 +2343,23 @@ internal int CountSingletons()
{
if (_lower is 0 or int.MaxValue)
{
// infinite loop has the same size as a *-loop
return _left.CountSingletons();
int bodyCount = _left.CountSingletons();

// When an unbounded loop's body is itself an unbounded loop, the derivative
// computation creates a 2-way Alternate at each nesting level (because the
// Concat derivative branches when its left side is nullable). This means
// the per-character derivative cost doubles with each such nesting level.
// Account for this so that the NFA size estimate catches deeply nested
// patterns that would cause exponential blowup.
// The loop flattening in CreateLoop already collapses same-laziness nesting
// without captures, so this multiplier primarily affects patterns that survive
// flattening (e.g., alternating lazy/greedy nesting at every level).
if (_left._kind == SymbolicRegexNodeKind.Loop && _left._upper == int.MaxValue)
{
return Times(2, bodyCount);
}

return bodyCount;
}

// the upper bound is not being used, so the lower must be non-zero
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2581,6 +2581,32 @@ public static IEnumerable<object[]> Match_Count_TestData()
yield return new object[] { engine, $@"{b2}\w+{b2}", "one two three", 1 };
yield return new object[] { engine, $@"{b2}\w+{b2}", "one two", 0 };
}

// Nested loop simplification correctness. The NonBacktracking engine simplifies
// certain nested loop patterns (e.g. (R*)* → R*) during construction. These tests
// verify the simplification produces results consistent across all engines.
foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
{
// Patterns that ARE simplified: (R*)*, (R+)*, (R*)+ all collapse to R*
yield return new object[] { engine, @"^(?:(?:a)*)*$", "aaa", 1 };
yield return new object[] { engine, @"^(?:(?:a)+)*$", "aaa", 1 };
yield return new object[] { engine, @"^(?:(?:a)*)+$", "aaa", 1 };

// (R{2,})* must NOT be simplified to R* — a single R is not in the language
yield return new object[] { engine, @"^(?:(?:a){2,})*$", "a", 0 };
yield return new object[] { engine, @"^(?:(?:a){2,})*$", "aa", 1 };
yield return new object[] { engine, @"^(?:(?:a){2,})*$", "", 1 };

// (R{2})* must NOT be simplified to R* — odd-length strings must not match
yield return new object[] { engine, @"^(?:(?:a){2})*$", "", 1 };
yield return new object[] { engine, @"^(?:(?:a){2})*$", "a", 0 };
yield return new object[] { engine, @"^(?:(?:a){2})*$", "aa", 1 };
yield return new object[] { engine, @"^(?:(?:a){2})*$", "aaa", 0 };
yield return new object[] { engine, @"^(?:(?:a){2})*$", "aaaa", 1 };

// Mixed laziness: greedy inner with lazy outer must not be conflated
yield return new object[] { engine, @"(?:(?:a*)+?)", "aaa", 2 };
}
}

[Theory]
Expand Down Expand Up @@ -2655,13 +2681,14 @@ public static IEnumerable<object[]> StressTestDeepNestingOfLoops_TestData()
{
foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
{
if (engine != RegexEngine.NonBacktracking) // Hangs, or effectively hangs. https://github.com/dotnet/runtime/issues/84188
{
yield return new object[] { engine, "(", "a", ")*", "a", 2000, 1000 };
}

yield return new object[] { engine, "(", "[aA]", ")+", "aA", 2000, 3000 };
yield return new object[] { engine, "(", "ab", "){0,1}", "ab", 2000, 1000 };
// NonBacktracking uses a lower nesting depth to avoid stack overflow in the recursive
// derivative processing. 120 levels is still well beyond what would previously hang
// (previously, even 20 levels would cause >60s execution time).
int depth = RegexHelpers.IsNonBacktracking(engine) ? 120 : 2000;

yield return new object[] { engine, "(", "a", ")*", "a", depth, 1000 };
yield return new object[] { engine, "(", "[aA]", ")+", "aA", depth, 3000 };
yield return new object[] { engine, "(", "ab", "){0,1}", "ab", depth, 1000 };
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System.Globalization;
using System.Text;
using Xunit;
using System.Text.RegularExpressions.Symbolic;
using System.Collections.Generic;
Expand Down Expand Up @@ -228,6 +229,25 @@ public static IEnumerable<object[]> UnsafeThresholdTests_MemberData()
SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
yield return new object[] { rootNode };
}

// Deeply nested unbounded loops with alternating laziness bypass loop flattening
// (which requires matching laziness) and cause exponential derivative blowup.
// CountSingletons accounts for this by multiplying by 2 per nesting level, so
// at depth 15+ the estimate exceeds the default 10,000 threshold.
// Build: (?:(?:(?:a)*?)*)*? ... with alternating greedy/lazy at each level
Copy link

Copilot AI Mar 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment showing the example pattern doesn't match the actual pattern constructed by the loops: the closing loop runs from depth-1 down to 0, making the outermost quantifier greedy ("") rather than lazy ("?") as implied by ...)*? .... Please update the comment to reflect the real generated pattern (or describe the alternation without suggesting a specific outermost suffix).

Suggested change
// Build: (?:(?:(?:a)*?)*)*? ... with alternating greedy/lazy at each level
// Build: (?:(?:(?:a)*)*?)* ... with alternating greedy/lazy quantifiers at each level

Copilot uses AI. Check for mistakes.
{
const int depth = 20;
var sb = new StringBuilder();
for (int i = 0; i < depth; i++)
sb.Append("(?:");
sb.Append('a');
for (int i = depth - 1; i >= 0; i--)
sb.Append(i % 2 == 0 ? ")*" : ")*?");

RegexNode tree = RegexParser.Parse(sb.ToString(), options, CultureInfo.CurrentCulture).Root;
SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
yield return new object[] { rootNode };
}
}

[Theory]
Expand All @@ -238,6 +258,36 @@ public void UnsafeThresholdTests(object node)
Assert.True(size > SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold());
}

/// <summary>
/// Verifies that deeply nested same-laziness unbounded loops do NOT exceed the threshold,
/// because loop flattening in CreateLoop collapses them (e.g. ((?:a)*)* at depth 100 → a*).
/// </summary>
[Fact]
public void SameLazinessDeepNestingIsSafeAfterFlattening()
{
    var solver = new CharSetSolver();
    var builder = new SymbolicRegexBuilder<BDD>(solver, solver);
    var nodeConverter = new RegexNodeConverter(builder, null);
    RegexOptions regexOptions = RegexOptions.NonBacktracking | RegexOptions.ExplicitCapture;

    // Build (?:(?:(?:a)*)*...)* at depth 100 — every level greedy, i.e. all same laziness.
    // Each iteration wraps the pattern so far in one more "(?:" ... ")*" layer.
    const int NestingDepth = 100;
    var patternBuilder = new StringBuilder("a");
    for (int level = 0; level < NestingDepth; level++)
    {
        patternBuilder.Insert(0, "(?:").Append(")*");
    }

    RegexNode parsedRoot = RegexParser.Parse(patternBuilder.ToString(), regexOptions, CultureInfo.CurrentCulture).Root;
    SymbolicRegexNode<BDD> symbolicRoot = nodeConverter.ConvertToSymbolicRegexNode(parsedRoot);

    // Loop flattening should have collapsed the nesting to a*, so the estimate must stay small.
    int estimatedSize = symbolicRoot.EstimateNfaSize();
    Assert.True(estimatedSize <= SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold(),
        $"Same-laziness nested loops should be safe after flattening, but EstimateNfaSize returned {estimatedSize}");
}

[Theory]
[InlineData(200, 200)]
[InlineData(10_000, 10_000)]
Expand Down
Loading