diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index c21987a7107ea7..53ca0dc28076de 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -1237,7 +1237,7 @@ public static bool CharInClass(char ch, string set, ref uint[]? asciiLazyCache) // For ASCII, lazily initialize. For non-ASCII, just compute the value. return ch < 128 ? InitializeValue(ch, set, ref asciiLazyCache) : - CharInClassRecursive(ch, set, 0); + CharInClassIterative(ch, set, 0); static bool InitializeValue(char ch, string set, ref uint[]? asciiLazyCache) { @@ -1269,27 +1269,31 @@ static bool InitializeValue(char ch, string set, ref uint[]? asciiLazyCache) /// Determines a character's membership in a character class (via the string representation of the class). /// public static bool CharInClass(char ch, string set) => - CharInClassRecursive(ch, set, 0); + CharInClassIterative(ch, set, 0); - private static bool CharInClassRecursive(char ch, string set, int start) + private static bool CharInClassIterative(char ch, string set, int start) { - int setLength = set[start + SetLengthIndex]; - int categoryLength = set[start + CategoryLengthIndex]; - int endPosition = start + SetStartIndex + setLength + categoryLength; + bool inClass = false; - bool inClass = CharInClassInternal(ch, set, start, setLength, categoryLength); - - // Note that we apply the negation *before* performing the subtraction. This is because - // the negation only applies to the first char class, not the entire subtraction. - if (IsNegated(set, start)) + while (true) { + int setLength = set[start + SetLengthIndex]; + int categoryLength = set[start + CategoryLengthIndex]; + int endPosition = start + SetStartIndex + setLength + categoryLength; + + if (CharInClassInternal(ch, set, start, setLength, categoryLength) == IsNegated(set, start)) + { + break; + } + inClass = !inClass; - } - // Subtract if necessary - if (inClass && set.Length > endPosition) - { - inClass = !CharInClassRecursive(ch, set, endPosition); + if (set.Length <= endPosition) + { + break; + } + + start = endPosition; } return inClass; @@ -1427,32 +1431,48 @@ private static bool CharInCategoryGroup(UnicodeCategory chcategory, ReadOnlySpan return result; } - public static RegexCharClass Parse(string charClass) => ParseRecursive(charClass, 0); - - private static RegexCharClass ParseRecursive(string charClass, int start) + public static RegexCharClass Parse(string charClass) { - int setLength = charClass[start + SetLengthIndex]; - int categoryLength = charClass[start + CategoryLengthIndex]; - int endPosition = start + SetStartIndex + setLength + categoryLength; + RegexCharClass? outermost = null; + RegexCharClass? current = null; - int i = start + SetStartIndex; - int end = i + setLength; + int pos = 0; + while (true) + { + int setLength = charClass[pos + SetLengthIndex]; + int categoryLength = charClass[pos + CategoryLengthIndex]; + int endPosition = pos + SetStartIndex + setLength + categoryLength; - List<(char First, char Last)>? ranges = ComputeRanges(charClass.AsSpan(start)); + List<(char First, char Last)>? ranges = ComputeRanges(charClass.AsSpan(pos)); - RegexCharClass? sub = null; - if (charClass.Length > endPosition) - { - sub = ParseRecursive(charClass, endPosition); - } + StringBuilder? categoriesBuilder = null; + if (categoryLength > 0) + { + int end = pos + SetStartIndex + setLength; + categoriesBuilder = new StringBuilder().Append(charClass.AsSpan(end, categoryLength)); + } - StringBuilder? categoriesBuilder = null; - if (categoryLength > 0) - { - categoriesBuilder = new StringBuilder().Append(charClass.AsSpan(end, categoryLength)); + var level = new RegexCharClass(IsNegated(charClass, pos), ranges, categoriesBuilder, subtraction: null); + + if (outermost is null) + { + outermost = level; + } + else + { + current!.AddSubtraction(level); + } + current = level; + + if (charClass.Length <= endPosition) + { + break; + } + + pos = endPosition; } - return new RegexCharClass(IsNegated(charClass, start), ranges, categoriesBuilder, sub); + return outermost!; } /// Computes a list of all of the character ranges in the set string. @@ -1591,51 +1611,52 @@ internal static string CharsToStringClass(ReadOnlySpan chars) public string ToStringClass() { var vsb = new ValueStringBuilder(stackalloc char[256]); - ToStringClass(ref vsb); - return vsb.ToString(); - } - private void ToStringClass(ref ValueStringBuilder vsb) - { - Canonicalize(); + RegexCharClass? current = this; + do + { + current.Canonicalize(); - int initialLength = vsb.Length; - int categoriesLength = _categories?.Length ?? 0; - Span headerSpan = vsb.AppendSpan(SetStartIndex); - headerSpan[FlagsIndex] = (char)(_negate ? 1 : 0); - headerSpan[SetLengthIndex] = '\0'; // (will be replaced once we know how long a range we've added) - headerSpan[CategoryLengthIndex] = (char)categoriesLength; + int initialLength = vsb.Length; + int categoriesLength = current._categories?.Length ?? 0; + Span headerSpan = vsb.AppendSpan(SetStartIndex); + headerSpan[FlagsIndex] = (char)(current._negate ? 1 : 0); + headerSpan[SetLengthIndex] = '\0'; // (will be replaced once we know how long a range we've added) + headerSpan[CategoryLengthIndex] = (char)categoriesLength; - // Append ranges - List<(char First, char Last)>? rangelist = _rangelist; - if (rangelist != null) - { - for (int i = 0; i < rangelist.Count; i++) + // Append ranges + List<(char First, char Last)>? rangelist = current._rangelist; + if (rangelist != null) { - (char First, char Last) currentRange = rangelist[i]; - vsb.Append(currentRange.First); - if (currentRange.Last != LastChar) + for (int i = 0; i < rangelist.Count; i++) { - vsb.Append((char)(currentRange.Last + 1)); + (char First, char Last) currentRange = rangelist[i]; + vsb.Append(currentRange.First); + if (currentRange.Last != LastChar) + { + vsb.Append((char)(currentRange.Last + 1)); + } } } - } - // Update the range length. The ValueStringBuilder may have already had some - // contents (if this is a subtactor), so we need to offset by the initial length. - vsb[initialLength + SetLengthIndex] = (char)(vsb.Length - initialLength - SetStartIndex); + // Update the range length. The ValueStringBuilder may have already had some + // contents (if this is a subtactor), so we need to offset by the initial length. + vsb[initialLength + SetLengthIndex] = (char)(vsb.Length - initialLength - SetStartIndex); - // Append categories - if (categoriesLength != 0) - { - foreach (ReadOnlyMemory chunk in _categories!.GetChunks()) + // Append categories + if (categoriesLength != 0) { - vsb.Append(chunk.Span); + foreach (ReadOnlyMemory chunk in current._categories!.GetChunks()) + { + vsb.Append(chunk.Span); + } } + + current = current._subtractor; } + while (current is not null); - // Append a subtractor if there is one. - _subtractor?.ToStringClass(ref vsb); + return vsb.ToString(); } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 69511af80983bd..4be6424aa3c590 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -543,7 +543,11 @@ private RegexNode ScanReplacement() return _concatenation; } - /// Scans contents of [] (not including []'s), and converts to a RegexCharClass + /// Scans contents of [] (not including []'s), and converts to a RegexCharClass. + /// + /// Character class subtractions (e.g. [a-z-[aeiou]]) are handled iteratively using an + /// explicit parent stack to avoid stack overflow with deeply nested subtractions. + /// private RegexCharClass? ScanCharClass(bool caseInsensitive, bool scanOnly) { char ch; @@ -551,6 +555,9 @@ private RegexNode ScanReplacement() bool inRange = false; bool firstChar = true; bool closed = false; + bool startingNewLevel = false; + + List? parents = null; RegexCharClass? charClass = scanOnly ? null : new RegexCharClass(); @@ -569,12 +576,60 @@ private RegexNode ScanReplacement() for (; _pos < _pattern.Length; firstChar = false) { + // When entering a new subtraction level, reset state for the nested character class. + if (startingNewLevel) + { + startingNewLevel = false; + firstChar = true; + if (_pos < _pattern.Length && _pattern[_pos] == '^') + { + _pos++; + if (!scanOnly) + { + charClass!.Negate = true; + } + if ((_options & RegexOptions.ECMAScript) != 0 && _pos < _pattern.Length && _pattern[_pos] == ']') + { + firstChar = false; + } + } + if (_pos >= _pattern.Length) + { + break; + } + } + bool translatedChar = false; ch = _pattern[_pos++]; if (ch == ']') { if (!firstChar) { + // Finalize this character class level. + if (!scanOnly && caseInsensitive) + { + charClass!.AddCaseEquivalences(_culture); + } + + // If there are parent levels, pop back to the parent and set the + // current class as its subtraction. + if (parents is { Count: > 0 }) + { + RegexCharClass? parent = parents[parents.Count - 1]; + parents.RemoveAt(parents.Count - 1); + if (!scanOnly) + { + parent!.AddSubtraction(charClass!); + + if (_pos < _pattern.Length && _pattern[_pos] != ']') + { + throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast); + } + } + charClass = parent; + continue; + } + closed = true; break; } @@ -675,14 +730,13 @@ private RegexNode ScanReplacement() { // We thought we were in a range, but we're actually starting a subtraction. // In that case, we'll add chPrev to our char class, skip the opening [, and - // scan the new character class recursively. + // scan the new character class iteratively. charClass!.AddChar(chPrev); - charClass.AddSubtraction(ScanCharClass(caseInsensitive, scanOnly)!); - - if (_pos < _pattern.Length && _pattern[_pos] != ']') - { - throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast); - } + (parents ??= new List()).Add(charClass); + charClass = new RegexCharClass(); + chPrev = '\0'; + startingNewLevel = true; + continue; } else { @@ -707,16 +761,14 @@ private RegexNode ScanReplacement() // we aren't in a range, and now there is a subtraction. Usually this happens // only when a subtraction follows a range, like [a-z-[b]] _pos++; - RegexCharClass? rcc = ScanCharClass(caseInsensitive, scanOnly); + (parents ??= new List()).Add(scanOnly ? null : charClass); if (!scanOnly) { - charClass!.AddSubtraction(rcc!); - - if (_pos < _pattern.Length && _pattern[_pos] != ']') - { - throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast); - } + charClass = new RegexCharClass(); } + chPrev = '\0'; + startingNewLevel = true; + continue; } else { @@ -732,11 +784,6 @@ private RegexNode ScanReplacement() throw MakeException(RegexParseError.UnterminatedBracket, SR.UnterminatedBracket); } - if (!scanOnly && caseInsensitive) - { - charClass!.AddCaseEquivalences(_culture); - } - return charClass; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index c03f22a893ac1a..0a1343384aa519 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2632,6 +2632,105 @@ public async Task StressTestDeepNestingOfLoops(RegexEngine engine, string begin, } } + [Theory] + [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework, "Fix is not available on .NET Framework")] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] + public async Task CharClassSubtraction_DeepNesting_DoesNotStackOverflow(RegexEngine engine) + { + // Build a pattern with deeply nested character class subtractions: [a-[a-[a-[...[a]...]]]] + // This previously caused a StackOverflowException due to unbounded recursion in the parser. + // Use a reduced depth for SourceGenerated to avoid overwhelming Roslyn compilation. + int depth = engine == RegexEngine.SourceGenerated ? 1_000 : 10_000; + var sb = new System.Text.StringBuilder(); + sb.Append('['); + for (int i = 0; i < depth; i++) + { + sb.Append("a-["); + } + sb.Append('a'); + sb.Append(']', depth + 1); + + Regex r = await RegexHelpers.GetRegexAsync(engine, sb.ToString()); + Assert.True(r.IsMatch("a")); + Assert.False(r.IsMatch("b")); + } + + [Theory] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] + public async Task CharClassSubtraction_Correctness(RegexEngine engine) + { + // [a-z-[d-w]] should match a-c and x-z + Regex r1 = await RegexHelpers.GetRegexAsync(engine, "[a-z-[d-w]]"); + Assert.True(r1.IsMatch("a")); + Assert.True(r1.IsMatch("c")); + Assert.True(r1.IsMatch("x")); + Assert.True(r1.IsMatch("z")); + Assert.False(r1.IsMatch("d")); + Assert.False(r1.IsMatch("m")); + Assert.False(r1.IsMatch("w")); + + // [a-z-[d-w-[m]]] should match a-c, m, and x-z + Regex r2 = await RegexHelpers.GetRegexAsync(engine, "[a-z-[d-w-[m]]]"); + Assert.True(r2.IsMatch("a")); + Assert.True(r2.IsMatch("m")); + Assert.True(r2.IsMatch("z")); + Assert.False(r2.IsMatch("d")); + Assert.False(r2.IsMatch("l")); + Assert.False(r2.IsMatch("n")); + Assert.False(r2.IsMatch("w")); + } + + [Theory] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] + public async Task CharClassSubtraction_CaseInsensitive(RegexEngine engine) + { + // [a-z-[D-W]] with IgnoreCase should behave like [a-z-[d-w]], matching a-c and x-z. + Regex r = await RegexHelpers.GetRegexAsync(engine, "[a-z-[D-W]]", RegexOptions.IgnoreCase); + Assert.True(r.IsMatch("a")); + Assert.True(r.IsMatch("c")); + Assert.True(r.IsMatch("x")); + Assert.True(r.IsMatch("z")); + Assert.False(r.IsMatch("d")); + Assert.False(r.IsMatch("D")); + Assert.False(r.IsMatch("m")); + Assert.False(r.IsMatch("w")); + } + + [Theory] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] + public async Task CharClassSubtraction_NegatedOuter(RegexEngine engine) + { + // [^a-z-[m-p]] = (NOT a-z) minus m-p. Since m-p is a subset of a-z, + // the subtraction has no effect: matches anything outside a-z. + Regex r = await RegexHelpers.GetRegexAsync(engine, "[^a-z-[m-p]]"); + Assert.True(r.IsMatch("A")); + Assert.True(r.IsMatch("5")); + Assert.False(r.IsMatch("a")); + Assert.False(r.IsMatch("m")); + Assert.False(r.IsMatch("z")); + } + + [Theory] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] + public async Task CharClassSubtraction_FourLevels(RegexEngine engine) + { + // [a-z-[b-y-[c-x-[d-w]]]] + // Level 0: a-z + // Level 1: subtract b-y => a, z + // Level 2: subtract c-x from b-y => add back c-x => a, c-x, z + // Level 3: subtract d-w from c-x => remove d-w again => a, c, x, z + Regex r = await RegexHelpers.GetRegexAsync(engine, "[a-z-[b-y-[c-x-[d-w]]]]"); + Assert.True(r.IsMatch("a")); + Assert.True(r.IsMatch("c")); + Assert.True(r.IsMatch("x")); + Assert.True(r.IsMatch("z")); + Assert.False(r.IsMatch("b")); + Assert.False(r.IsMatch("d")); + Assert.False(r.IsMatch("m")); + Assert.False(r.IsMatch("w")); + Assert.False(r.IsMatch("y")); + } + public static IEnumerable StressTestNfaMode_TestData() { yield return new object[] { "(?:a|aa|[abc]?[ab]?[abcd]).{20}$", "aaa01234567890123456789", 23 }; diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Tests.Common.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Tests.Common.cs index b0a9b6549492bc..601bdc85a39459 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Tests.Common.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Tests.Common.cs @@ -16,8 +16,6 @@ public static class RegexHelpers { public const string DefaultMatchTimeout_ConfigKeyName = "REGEX_DEFAULT_MATCH_TIMEOUT"; - public const int StressTestNestingDepth = 1000; - /// RegexOptions.NonBacktracking. /// Defined here to be able to reference the value by name even on .NET Framework test builds. public const RegexOptions RegexOptionNonBacktracking = (RegexOptions)0x400;