Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1237,7 +1237,7 @@ public static bool CharInClass(char ch, string set, ref uint[]? asciiLazyCache)
// For ASCII, lazily initialize. For non-ASCII, just compute the value.
return ch < 128 ?
InitializeValue(ch, set, ref asciiLazyCache) :
CharInClassRecursive(ch, set, 0);
CharInClassIterative(ch, set, 0);

static bool InitializeValue(char ch, string set, ref uint[]? asciiLazyCache)
{
Expand Down Expand Up @@ -1269,27 +1269,31 @@ static bool InitializeValue(char ch, string set, ref uint[]? asciiLazyCache)
/// Determines a character's membership in a character class (via the string representation of the class).
/// </summary>
public static bool CharInClass(char ch, string set) =>
CharInClassRecursive(ch, set, 0);
CharInClassIterative(ch, set, 0);

private static bool CharInClassRecursive(char ch, string set, int start)
private static bool CharInClassIterative(char ch, string set, int start)
{
int setLength = set[start + SetLengthIndex];
int categoryLength = set[start + CategoryLengthIndex];
int endPosition = start + SetStartIndex + setLength + categoryLength;
bool inClass = false;

bool inClass = CharInClassInternal(ch, set, start, setLength, categoryLength);

// Note that we apply the negation *before* performing the subtraction. This is because
// the negation only applies to the first char class, not the entire subtraction.
if (IsNegated(set, start))
while (true)
{
int setLength = set[start + SetLengthIndex];
int categoryLength = set[start + CategoryLengthIndex];
int endPosition = start + SetStartIndex + setLength + categoryLength;

if (CharInClassInternal(ch, set, start, setLength, categoryLength) == IsNegated(set, start))
{
break;
}

inClass = !inClass;
}

// Subtract if necessary
if (inClass && set.Length > endPosition)
{
inClass = !CharInClassRecursive(ch, set, endPosition);
if (set.Length <= endPosition)
{
break;
}

start = endPosition;
}

return inClass;
Expand Down Expand Up @@ -1427,32 +1431,48 @@ private static bool CharInCategoryGroup(UnicodeCategory chcategory, ReadOnlySpan
return result;
}

public static RegexCharClass Parse(string charClass) => ParseRecursive(charClass, 0);

private static RegexCharClass ParseRecursive(string charClass, int start)
public static RegexCharClass Parse(string charClass)
{
int setLength = charClass[start + SetLengthIndex];
int categoryLength = charClass[start + CategoryLengthIndex];
int endPosition = start + SetStartIndex + setLength + categoryLength;
RegexCharClass? outermost = null;
RegexCharClass? current = null;

int i = start + SetStartIndex;
int end = i + setLength;
int pos = 0;
while (true)
{
int setLength = charClass[pos + SetLengthIndex];
int categoryLength = charClass[pos + CategoryLengthIndex];
int endPosition = pos + SetStartIndex + setLength + categoryLength;

List<(char First, char Last)>? ranges = ComputeRanges(charClass.AsSpan(start));
List<(char First, char Last)>? ranges = ComputeRanges(charClass.AsSpan(pos));

RegexCharClass? sub = null;
if (charClass.Length > endPosition)
{
sub = ParseRecursive(charClass, endPosition);
}
StringBuilder? categoriesBuilder = null;
if (categoryLength > 0)
{
int end = pos + SetStartIndex + setLength;
categoriesBuilder = new StringBuilder().Append(charClass.AsSpan(end, categoryLength));
}

StringBuilder? categoriesBuilder = null;
if (categoryLength > 0)
{
categoriesBuilder = new StringBuilder().Append(charClass.AsSpan(end, categoryLength));
var level = new RegexCharClass(IsNegated(charClass, pos), ranges, categoriesBuilder, subtraction: null);

if (outermost is null)
{
outermost = level;
}
else
{
current!.AddSubtraction(level);
}
current = level;

if (charClass.Length <= endPosition)
{
break;
}

pos = endPosition;
}

return new RegexCharClass(IsNegated(charClass, start), ranges, categoriesBuilder, sub);
return outermost!;
}

/// <summary>Computes a list of all of the character ranges in the set string.</summary>
Expand Down Expand Up @@ -1591,51 +1611,52 @@ internal static string CharsToStringClass(ReadOnlySpan<char> chars)
public string ToStringClass()
{
var vsb = new ValueStringBuilder(stackalloc char[256]);
ToStringClass(ref vsb);
return vsb.ToString();
}

private void ToStringClass(ref ValueStringBuilder vsb)
{
Canonicalize();
RegexCharClass? current = this;
do
{
current.Canonicalize();

int initialLength = vsb.Length;
int categoriesLength = _categories?.Length ?? 0;
Span<char> headerSpan = vsb.AppendSpan(SetStartIndex);
headerSpan[FlagsIndex] = (char)(_negate ? 1 : 0);
headerSpan[SetLengthIndex] = '\0'; // (will be replaced once we know how long a range we've added)
headerSpan[CategoryLengthIndex] = (char)categoriesLength;
int initialLength = vsb.Length;
int categoriesLength = current._categories?.Length ?? 0;
Span<char> headerSpan = vsb.AppendSpan(SetStartIndex);
headerSpan[FlagsIndex] = (char)(current._negate ? 1 : 0);
headerSpan[SetLengthIndex] = '\0'; // (will be replaced once we know how long a range we've added)
headerSpan[CategoryLengthIndex] = (char)categoriesLength;

// Append ranges
List<(char First, char Last)>? rangelist = _rangelist;
if (rangelist != null)
{
for (int i = 0; i < rangelist.Count; i++)
// Append ranges
List<(char First, char Last)>? rangelist = current._rangelist;
if (rangelist != null)
{
(char First, char Last) currentRange = rangelist[i];
vsb.Append(currentRange.First);
if (currentRange.Last != LastChar)
for (int i = 0; i < rangelist.Count; i++)
{
vsb.Append((char)(currentRange.Last + 1));
(char First, char Last) currentRange = rangelist[i];
vsb.Append(currentRange.First);
if (currentRange.Last != LastChar)
{
vsb.Append((char)(currentRange.Last + 1));
}
}
}
}

// Update the range length. The ValueStringBuilder may have already had some
// contents (if this is a subtractor), so we need to offset by the initial length.
vsb[initialLength + SetLengthIndex] = (char)(vsb.Length - initialLength - SetStartIndex);
// Update the range length. The ValueStringBuilder may have already had some
// contents (if this is a subtractor), so we need to offset by the initial length.
vsb[initialLength + SetLengthIndex] = (char)(vsb.Length - initialLength - SetStartIndex);

// Append categories
if (categoriesLength != 0)
{
foreach (ReadOnlyMemory<char> chunk in _categories!.GetChunks())
// Append categories
if (categoriesLength != 0)
{
vsb.Append(chunk.Span);
foreach (ReadOnlyMemory<char> chunk in current._categories!.GetChunks())
{
vsb.Append(chunk.Span);
}
}

current = current._subtractor;
}
while (current is not null);

// Append a subtractor if there is one.
_subtractor?.ToStringClass(ref vsb);
return vsb.ToString();
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -543,14 +543,21 @@ private RegexNode ScanReplacement()
return _concatenation;
}

/// <summary>Scans contents of [] (not including []'s), and converts to a RegexCharClass</summary>
/// <summary>Scans contents of [] (not including []'s), and converts to a RegexCharClass.</summary>
/// <remarks>
/// Character class subtractions (e.g. [a-z-[aeiou]]) are handled iteratively using an
/// explicit parent stack to avoid stack overflow with deeply nested subtractions.
/// </remarks>
private RegexCharClass? ScanCharClass(bool caseInsensitive, bool scanOnly)
{
char ch;
char chPrev = '\0';
bool inRange = false;
bool firstChar = true;
bool closed = false;
bool startingNewLevel = false;

List<RegexCharClass?>? parents = null;

RegexCharClass? charClass = scanOnly ? null : new RegexCharClass();

Expand All @@ -569,12 +576,60 @@ private RegexNode ScanReplacement()

for (; _pos < _pattern.Length; firstChar = false)
{
// When entering a new subtraction level, reset state for the nested character class.
if (startingNewLevel)
{
startingNewLevel = false;
firstChar = true;
if (_pos < _pattern.Length && _pattern[_pos] == '^')
{
_pos++;
if (!scanOnly)
{
charClass!.Negate = true;
}
if ((_options & RegexOptions.ECMAScript) != 0 && _pos < _pattern.Length && _pattern[_pos] == ']')
{
firstChar = false;
}
}
if (_pos >= _pattern.Length)
{
break;
}
}

bool translatedChar = false;
ch = _pattern[_pos++];
if (ch == ']')
{
if (!firstChar)
{
// Finalize this character class level.
if (!scanOnly && caseInsensitive)
{
charClass!.AddCaseEquivalences(_culture);
}

// If there are parent levels, pop back to the parent and set the
// current class as its subtraction.
if (parents is { Count: > 0 })
{
RegexCharClass? parent = parents[parents.Count - 1];
parents.RemoveAt(parents.Count - 1);
if (!scanOnly)
{
parent!.AddSubtraction(charClass!);

if (_pos < _pattern.Length && _pattern[_pos] != ']')
{
throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast);
}
}
charClass = parent;
continue;
}

closed = true;
break;
}
Expand Down Expand Up @@ -675,14 +730,13 @@ private RegexNode ScanReplacement()
{
// We thought we were in a range, but we're actually starting a subtraction.
// In that case, we'll add chPrev to our char class, skip the opening [, and
// scan the new character class recursively.
// scan the new character class iteratively.
charClass!.AddChar(chPrev);
charClass.AddSubtraction(ScanCharClass(caseInsensitive, scanOnly)!);

if (_pos < _pattern.Length && _pattern[_pos] != ']')
{
throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast);
}
(parents ??= new List<RegexCharClass?>()).Add(charClass);
charClass = new RegexCharClass();
chPrev = '\0';
startingNewLevel = true;
continue;
}
else
{
Expand All @@ -707,16 +761,14 @@ private RegexNode ScanReplacement()
// we aren't in a range, and now there is a subtraction. Usually this happens
// only when a subtraction follows a range, like [a-z-[b]]
_pos++;
RegexCharClass? rcc = ScanCharClass(caseInsensitive, scanOnly);
(parents ??= new List<RegexCharClass?>()).Add(scanOnly ? null : charClass);
if (!scanOnly)
{
charClass!.AddSubtraction(rcc!);

if (_pos < _pattern.Length && _pattern[_pos] != ']')
{
throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast);
}
charClass = new RegexCharClass();
}
chPrev = '\0';
startingNewLevel = true;
continue;
}
else
{
Expand All @@ -732,11 +784,6 @@ private RegexNode ScanReplacement()
throw MakeException(RegexParseError.UnterminatedBracket, SR.UnterminatedBracket);
}

if (!scanOnly && caseInsensitive)
{
charClass!.AddCaseEquivalences(_culture);
}

return charClass;
}

Expand Down
Loading
Loading