diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index c21987a7107ea7..53ca0dc28076de 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -1237,7 +1237,7 @@ public static bool CharInClass(char ch, string set, ref uint[]? asciiLazyCache)
// For ASCII, lazily initialize. For non-ASCII, just compute the value.
return ch < 128 ?
InitializeValue(ch, set, ref asciiLazyCache) :
- CharInClassRecursive(ch, set, 0);
+ CharInClassIterative(ch, set, 0);
static bool InitializeValue(char ch, string set, ref uint[]? asciiLazyCache)
{
@@ -1269,27 +1269,31 @@ static bool InitializeValue(char ch, string set, ref uint[]? asciiLazyCache)
/// Determines a character's membership in a character class (via the string representation of the class).
///
public static bool CharInClass(char ch, string set) =>
- CharInClassRecursive(ch, set, 0);
+ CharInClassIterative(ch, set, 0);
- private static bool CharInClassRecursive(char ch, string set, int start)
+ private static bool CharInClassIterative(char ch, string set, int start)
{
- int setLength = set[start + SetLengthIndex];
- int categoryLength = set[start + CategoryLengthIndex];
- int endPosition = start + SetStartIndex + setLength + categoryLength;
+ bool inClass = false;
- bool inClass = CharInClassInternal(ch, set, start, setLength, categoryLength);
-
- // Note that we apply the negation *before* performing the subtraction. This is because
- // the negation only applies to the first char class, not the entire subtraction.
- if (IsNegated(set, start))
+ while (true)
{
+ int setLength = set[start + SetLengthIndex];
+ int categoryLength = set[start + CategoryLengthIndex];
+ int endPosition = start + SetStartIndex + setLength + categoryLength;
+
+ if (CharInClassInternal(ch, set, start, setLength, categoryLength) == IsNegated(set, start))
+ {
+ break;
+ }
+
inClass = !inClass;
- }
- // Subtract if necessary
- if (inClass && set.Length > endPosition)
- {
- inClass = !CharInClassRecursive(ch, set, endPosition);
+ if (set.Length <= endPosition)
+ {
+ break;
+ }
+
+ start = endPosition;
}
return inClass;
@@ -1427,32 +1431,48 @@ private static bool CharInCategoryGroup(UnicodeCategory chcategory, ReadOnlySpan
return result;
}
- public static RegexCharClass Parse(string charClass) => ParseRecursive(charClass, 0);
-
- private static RegexCharClass ParseRecursive(string charClass, int start)
+ public static RegexCharClass Parse(string charClass)
{
- int setLength = charClass[start + SetLengthIndex];
- int categoryLength = charClass[start + CategoryLengthIndex];
- int endPosition = start + SetStartIndex + setLength + categoryLength;
+ RegexCharClass? outermost = null;
+ RegexCharClass? current = null;
- int i = start + SetStartIndex;
- int end = i + setLength;
+ int pos = 0;
+ while (true)
+ {
+ int setLength = charClass[pos + SetLengthIndex];
+ int categoryLength = charClass[pos + CategoryLengthIndex];
+ int endPosition = pos + SetStartIndex + setLength + categoryLength;
- List<(char First, char Last)>? ranges = ComputeRanges(charClass.AsSpan(start));
+ List<(char First, char Last)>? ranges = ComputeRanges(charClass.AsSpan(pos));
- RegexCharClass? sub = null;
- if (charClass.Length > endPosition)
- {
- sub = ParseRecursive(charClass, endPosition);
- }
+ StringBuilder? categoriesBuilder = null;
+ if (categoryLength > 0)
+ {
+ int end = pos + SetStartIndex + setLength;
+ categoriesBuilder = new StringBuilder().Append(charClass.AsSpan(end, categoryLength));
+ }
- StringBuilder? categoriesBuilder = null;
- if (categoryLength > 0)
- {
- categoriesBuilder = new StringBuilder().Append(charClass.AsSpan(end, categoryLength));
+ var level = new RegexCharClass(IsNegated(charClass, pos), ranges, categoriesBuilder, subtraction: null);
+
+ if (outermost is null)
+ {
+ outermost = level;
+ }
+ else
+ {
+ current!.AddSubtraction(level);
+ }
+ current = level;
+
+ if (charClass.Length <= endPosition)
+ {
+ break;
+ }
+
+ pos = endPosition;
}
- return new RegexCharClass(IsNegated(charClass, start), ranges, categoriesBuilder, sub);
+ return outermost!;
}
/// Computes a list of all of the character ranges in the set string.
@@ -1591,51 +1611,52 @@ internal static string CharsToStringClass(ReadOnlySpan chars)
public string ToStringClass()
{
var vsb = new ValueStringBuilder(stackalloc char[256]);
- ToStringClass(ref vsb);
- return vsb.ToString();
- }
- private void ToStringClass(ref ValueStringBuilder vsb)
- {
- Canonicalize();
+ RegexCharClass? current = this;
+ do
+ {
+ current.Canonicalize();
- int initialLength = vsb.Length;
- int categoriesLength = _categories?.Length ?? 0;
- Span headerSpan = vsb.AppendSpan(SetStartIndex);
- headerSpan[FlagsIndex] = (char)(_negate ? 1 : 0);
- headerSpan[SetLengthIndex] = '\0'; // (will be replaced once we know how long a range we've added)
- headerSpan[CategoryLengthIndex] = (char)categoriesLength;
+ int initialLength = vsb.Length;
+ int categoriesLength = current._categories?.Length ?? 0;
+ Span headerSpan = vsb.AppendSpan(SetStartIndex);
+ headerSpan[FlagsIndex] = (char)(current._negate ? 1 : 0);
+ headerSpan[SetLengthIndex] = '\0'; // (will be replaced once we know how long a range we've added)
+ headerSpan[CategoryLengthIndex] = (char)categoriesLength;
- // Append ranges
- List<(char First, char Last)>? rangelist = _rangelist;
- if (rangelist != null)
- {
- for (int i = 0; i < rangelist.Count; i++)
+ // Append ranges
+ List<(char First, char Last)>? rangelist = current._rangelist;
+ if (rangelist != null)
{
- (char First, char Last) currentRange = rangelist[i];
- vsb.Append(currentRange.First);
- if (currentRange.Last != LastChar)
+ for (int i = 0; i < rangelist.Count; i++)
{
- vsb.Append((char)(currentRange.Last + 1));
+ (char First, char Last) currentRange = rangelist[i];
+ vsb.Append(currentRange.First);
+ if (currentRange.Last != LastChar)
+ {
+ vsb.Append((char)(currentRange.Last + 1));
+ }
}
}
- }
- // Update the range length. The ValueStringBuilder may have already had some
- // contents (if this is a subtactor), so we need to offset by the initial length.
- vsb[initialLength + SetLengthIndex] = (char)(vsb.Length - initialLength - SetStartIndex);
+ // Update the range length. The ValueStringBuilder may have already had some
+ // contents (if this is a subtactor), so we need to offset by the initial length.
+ vsb[initialLength + SetLengthIndex] = (char)(vsb.Length - initialLength - SetStartIndex);
- // Append categories
- if (categoriesLength != 0)
- {
- foreach (ReadOnlyMemory chunk in _categories!.GetChunks())
+ // Append categories
+ if (categoriesLength != 0)
{
- vsb.Append(chunk.Span);
+ foreach (ReadOnlyMemory chunk in current._categories!.GetChunks())
+ {
+ vsb.Append(chunk.Span);
+ }
}
+
+ current = current._subtractor;
}
+ while (current is not null);
- // Append a subtractor if there is one.
- _subtractor?.ToStringClass(ref vsb);
+ return vsb.ToString();
}
///
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
index 69511af80983bd..4be6424aa3c590 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
@@ -543,7 +543,11 @@ private RegexNode ScanReplacement()
return _concatenation;
}
- /// Scans contents of [] (not including []'s), and converts to a RegexCharClass
+ /// Scans contents of [] (not including []'s), and converts to a RegexCharClass.
+ ///
+ /// Character class subtractions (e.g. [a-z-[aeiou]]) are handled iteratively using an
+ /// explicit parent stack to avoid stack overflow with deeply nested subtractions.
+ ///
private RegexCharClass? ScanCharClass(bool caseInsensitive, bool scanOnly)
{
char ch;
@@ -551,6 +555,9 @@ private RegexNode ScanReplacement()
bool inRange = false;
bool firstChar = true;
bool closed = false;
+ bool startingNewLevel = false;
+
+ List? parents = null;
RegexCharClass? charClass = scanOnly ? null : new RegexCharClass();
@@ -569,12 +576,60 @@ private RegexNode ScanReplacement()
for (; _pos < _pattern.Length; firstChar = false)
{
+ // When entering a new subtraction level, reset state for the nested character class.
+ if (startingNewLevel)
+ {
+ startingNewLevel = false;
+ firstChar = true;
+ if (_pos < _pattern.Length && _pattern[_pos] == '^')
+ {
+ _pos++;
+ if (!scanOnly)
+ {
+ charClass!.Negate = true;
+ }
+ if ((_options & RegexOptions.ECMAScript) != 0 && _pos < _pattern.Length && _pattern[_pos] == ']')
+ {
+ firstChar = false;
+ }
+ }
+ if (_pos >= _pattern.Length)
+ {
+ break;
+ }
+ }
+
bool translatedChar = false;
ch = _pattern[_pos++];
if (ch == ']')
{
if (!firstChar)
{
+ // Finalize this character class level.
+ if (!scanOnly && caseInsensitive)
+ {
+ charClass!.AddCaseEquivalences(_culture);
+ }
+
+ // If there are parent levels, pop back to the parent and set the
+ // current class as its subtraction.
+ if (parents is { Count: > 0 })
+ {
+ RegexCharClass? parent = parents[parents.Count - 1];
+ parents.RemoveAt(parents.Count - 1);
+ if (!scanOnly)
+ {
+ parent!.AddSubtraction(charClass!);
+
+ if (_pos < _pattern.Length && _pattern[_pos] != ']')
+ {
+ throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast);
+ }
+ }
+ charClass = parent;
+ continue;
+ }
+
closed = true;
break;
}
@@ -675,14 +730,13 @@ private RegexNode ScanReplacement()
{
// We thought we were in a range, but we're actually starting a subtraction.
// In that case, we'll add chPrev to our char class, skip the opening [, and
- // scan the new character class recursively.
+ // scan the new character class iteratively.
charClass!.AddChar(chPrev);
- charClass.AddSubtraction(ScanCharClass(caseInsensitive, scanOnly)!);
-
- if (_pos < _pattern.Length && _pattern[_pos] != ']')
- {
- throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast);
- }
+ (parents ??= new List()).Add(charClass);
+ charClass = new RegexCharClass();
+ chPrev = '\0';
+ startingNewLevel = true;
+ continue;
}
else
{
@@ -707,16 +761,14 @@ private RegexNode ScanReplacement()
// we aren't in a range, and now there is a subtraction. Usually this happens
// only when a subtraction follows a range, like [a-z-[b]]
_pos++;
- RegexCharClass? rcc = ScanCharClass(caseInsensitive, scanOnly);
+ (parents ??= new List()).Add(scanOnly ? null : charClass);
if (!scanOnly)
{
- charClass!.AddSubtraction(rcc!);
-
- if (_pos < _pattern.Length && _pattern[_pos] != ']')
- {
- throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast);
- }
+ charClass = new RegexCharClass();
}
+ chPrev = '\0';
+ startingNewLevel = true;
+ continue;
}
else
{
@@ -732,11 +784,6 @@ private RegexNode ScanReplacement()
throw MakeException(RegexParseError.UnterminatedBracket, SR.UnterminatedBracket);
}
- if (!scanOnly && caseInsensitive)
- {
- charClass!.AddCaseEquivalences(_culture);
- }
-
return charClass;
}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index c03f22a893ac1a..0a1343384aa519 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -2632,6 +2632,105 @@ public async Task StressTestDeepNestingOfLoops(RegexEngine engine, string begin,
}
}
+ [Theory]
+ [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework, "Fix is not available on .NET Framework")]
+ [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
+ public async Task CharClassSubtraction_DeepNesting_DoesNotStackOverflow(RegexEngine engine)
+ {
+ // Build a pattern with deeply nested character class subtractions: [a-[a-[a-[...[a]...]]]]
+ // This previously caused a StackOverflowException due to unbounded recursion in the parser.
+ // Use a reduced depth for SourceGenerated to avoid overwhelming Roslyn compilation.
+ int depth = engine == RegexEngine.SourceGenerated ? 1_000 : 10_000;
+ var sb = new System.Text.StringBuilder();
+ sb.Append('[');
+ for (int i = 0; i < depth; i++)
+ {
+ sb.Append("a-[");
+ }
+ sb.Append('a');
+ sb.Append(']', depth + 1);
+
+ Regex r = await RegexHelpers.GetRegexAsync(engine, sb.ToString());
+ Assert.True(r.IsMatch("a"));
+ Assert.False(r.IsMatch("b"));
+ }
+
+ [Theory]
+ [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
+ public async Task CharClassSubtraction_Correctness(RegexEngine engine)
+ {
+ // [a-z-[d-w]] should match a-c and x-z
+ Regex r1 = await RegexHelpers.GetRegexAsync(engine, "[a-z-[d-w]]");
+ Assert.True(r1.IsMatch("a"));
+ Assert.True(r1.IsMatch("c"));
+ Assert.True(r1.IsMatch("x"));
+ Assert.True(r1.IsMatch("z"));
+ Assert.False(r1.IsMatch("d"));
+ Assert.False(r1.IsMatch("m"));
+ Assert.False(r1.IsMatch("w"));
+
+ // [a-z-[d-w-[m]]] should match a-c, m, and x-z
+ Regex r2 = await RegexHelpers.GetRegexAsync(engine, "[a-z-[d-w-[m]]]");
+ Assert.True(r2.IsMatch("a"));
+ Assert.True(r2.IsMatch("m"));
+ Assert.True(r2.IsMatch("z"));
+ Assert.False(r2.IsMatch("d"));
+ Assert.False(r2.IsMatch("l"));
+ Assert.False(r2.IsMatch("n"));
+ Assert.False(r2.IsMatch("w"));
+ }
+
+ [Theory]
+ [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
+ public async Task CharClassSubtraction_CaseInsensitive(RegexEngine engine)
+ {
+ // [a-z-[D-W]] with IgnoreCase should behave like [a-z-[d-w]], matching a-c and x-z.
+ Regex r = await RegexHelpers.GetRegexAsync(engine, "[a-z-[D-W]]", RegexOptions.IgnoreCase);
+ Assert.True(r.IsMatch("a"));
+ Assert.True(r.IsMatch("c"));
+ Assert.True(r.IsMatch("x"));
+ Assert.True(r.IsMatch("z"));
+ Assert.False(r.IsMatch("d"));
+ Assert.False(r.IsMatch("D"));
+ Assert.False(r.IsMatch("m"));
+ Assert.False(r.IsMatch("w"));
+ }
+
+ [Theory]
+ [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
+ public async Task CharClassSubtraction_NegatedOuter(RegexEngine engine)
+ {
+ // [^a-z-[m-p]] = (NOT a-z) minus m-p. Since m-p is a subset of a-z,
+ // the subtraction has no effect: matches anything outside a-z.
+ Regex r = await RegexHelpers.GetRegexAsync(engine, "[^a-z-[m-p]]");
+ Assert.True(r.IsMatch("A"));
+ Assert.True(r.IsMatch("5"));
+ Assert.False(r.IsMatch("a"));
+ Assert.False(r.IsMatch("m"));
+ Assert.False(r.IsMatch("z"));
+ }
+
+ [Theory]
+ [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
+ public async Task CharClassSubtraction_FourLevels(RegexEngine engine)
+ {
+ // [a-z-[b-y-[c-x-[d-w]]]]
+ // Level 0: a-z
+ // Level 1: subtract b-y => a, z
+ // Level 2: subtract c-x from b-y => add back c-x => a, c-x, z
+ // Level 3: subtract d-w from c-x => remove d-w again => a, c, x, z
+ Regex r = await RegexHelpers.GetRegexAsync(engine, "[a-z-[b-y-[c-x-[d-w]]]]");
+ Assert.True(r.IsMatch("a"));
+ Assert.True(r.IsMatch("c"));
+ Assert.True(r.IsMatch("x"));
+ Assert.True(r.IsMatch("z"));
+ Assert.False(r.IsMatch("b"));
+ Assert.False(r.IsMatch("d"));
+ Assert.False(r.IsMatch("m"));
+ Assert.False(r.IsMatch("w"));
+ Assert.False(r.IsMatch("y"));
+ }
+
public static IEnumerable