Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1193,24 +1193,16 @@ static RegexNode ExtractCommonPrefixNode(RegexNode alternation)
return alternation;
}

// Only handle the case where each branch is a concatenation
foreach (RegexNode child in children)
{
if (child.Kind != RegexNodeKind.Concatenate || child.ChildCount() < 2)
{
return alternation;
}
}

for (int startingIndex = 0; startingIndex < children.Count - 1; startingIndex++)
{
Debug.Assert(children[startingIndex].Children is List<RegexNode> { Count: >= 2 });

// Only handle the case where each branch begins with the same One, Notone, Set (individual or loop), or Anchor.
// Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing
// it for non-atomic variable length loops could change behavior as each branch could otherwise have a
// different number of characters consumed by the loop based on what's after it.
RegexNode required = children[startingIndex].Child(0);
// A branch may be either a Concatenation (get its first child) or a single node (e.g., a Set
// that was reduced from a single-child Concatenation after prior prefix extraction).
RegexNode startingNode = children[startingIndex];
RegexNode required = startingNode.Kind == RegexNodeKind.Concatenate ? startingNode.Child(0) : startingNode;
switch (required.Kind)
{
case RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set:
Expand All @@ -1230,7 +1222,8 @@ or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary
int endingIndex = startingIndex + 1;
for (; endingIndex < children.Count; endingIndex++)
{
RegexNode other = children[endingIndex].Child(0);
RegexNode endingNode = children[endingIndex];
RegexNode other = endingNode.Kind == RegexNodeKind.Concatenate ? endingNode.Child(0) : endingNode;
if (required.Kind != other.Kind ||
required.Options != other.Options ||
required.M != other.M ||
Expand All @@ -1252,8 +1245,16 @@ or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary
var newAlternate = new RegexNode(RegexNodeKind.Alternate, alternation.Options);
for (int i = startingIndex; i < endingIndex; i++)
{
((List<RegexNode>)children[i].Children!).RemoveAt(0);
newAlternate.AddChild(children[i]);
if (children[i].Kind == RegexNodeKind.Concatenate)
{
((List<RegexNode>)children[i].Children!).RemoveAt(0);
newAlternate.AddChild(children[i]);
}
else
{
// The entire branch was the extracted prefix; what remains is Empty.
newAlternate.AddChild(new RegexNode(RegexNodeKind.Empty, children[i].Options));
}
}

// If this alternation is wrapped as atomic, we need to do the same for the new alternation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,11 @@ public static IEnumerable<object[]> Match_MemberData()
yield return (@"a\wc|\wgh|de\w", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper);
yield return (@"a\wc|\wgh|de\w", upper, RegexOptions.None, 0, input.Length, false, "");
}
// Alternation prefix extraction with IgnoreCase: matching must remain correct now that single-node branches are handled
yield return (@"(?:http|https)://foo", "HTTP://FOO", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 10, true, "HTTP://FOO");
yield return (@"(?:http|https)://foo", "HTTPS://FOO", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 11, true, "HTTPS://FOO");
yield return (@"(?:http|https)://foo", "ftp://foo", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 9, false, "");

yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest");
yield return (@"(\w+|\d+)a+[ab]+", "123123aa", RegexOptions.None, 0, 8, true, "123123aa");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,25 @@ public void TrailingAnchor(string pattern, int options, int expectedMode, int ex
[InlineData(@"(?<=cd)ab", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
[InlineData(@"\bab(?=\w)(?!=\d)c\b", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")]
[InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
// Alternation branches differing by one trailing character: prefix extraction should include all shared characters
[InlineData(@"(?:http|https)://foo", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "http")]
[InlineData(@"(?:http|https)://foo", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "http")]
// Alternation where shorter branch is just the shared prefix
[InlineData(@"(?:ab|abc)d", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")]
// Alternation where branches differ by more than one character
[InlineData(@"(?:abc|abcdef)g", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
[InlineData(@"(?:abc|abcdef)g", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")]
// Three-branch alternation with shared prefix and different lengths
[InlineData(@"(?:ab|abc|abcd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")]
[InlineData(@"(?:ab|abc|abcd)e", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")]
// Three-branch alternation with shared prefix and different trailing characters
[InlineData(@"(?:ab|abc|abd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")]
[InlineData(@"(?:ab|abc|abd)e", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")]
// Case-sensitive alternation whose branches differ by one trailing character (handled by ExtractCommonPrefixText rather than ExtractCommonPrefixNode; included to verify there is no regression)
[InlineData(@"(?:ab|abc)d", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")]
// Four-branch alternation mixing single-node and Concat branches after IgnoreCase prefix extraction
[InlineData(@"(?:abc|abcd|abce|abcfg)h", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
[InlineData(@"(?:abc|abcd|abce|abcfg)h", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")]
public void LeadingPrefix(string pattern, int options, int expectedMode, string expectedPrefix)
{
RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options);
Expand All @@ -138,6 +157,12 @@ public void LeadingPrefix(string pattern, int options, int expectedMode, string
[InlineData(@"ab|cd|ef|gh", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "bdfh")]
[InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)(RegexOptions.IgnoreCase | RegexOptions.RightToLeft), (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "Cc")]
[InlineData(@"ab|(abc)|(abcd)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "bcd")]
// Non-IgnoreCase Set-node branch: one branch consists solely of the character class that the other branch starts with
[InlineData(@"(?:[ab][0-9]|[ab])x", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "ab")]
// Single-node before Concat branch (reversed order)
[InlineData(@"(?:[ab]|[ab][0-9])x", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "ab")]
// IgnoreCase Set-node branch: prefix extraction across set-expanded branches
[InlineData(@"(?:a|ab)c", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "Aa")]
public void LeadingSet(string pattern, int options, int expectedMode, string expectedChars)
{
RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,18 @@ public class RegexReductionTests
[InlineData("(?i)\\d", "\\d")]
[InlineData("(?i).", ".")]
[InlineData("(?i)\\$", "\\$")]
// IgnoreCase node prefix extraction with single-node branch handling
[InlineData("(?i)(?:ab|abc)d", "(?i)ab(?>c?)d")]
[InlineData("(?i)(?:http|https)://foo", "[Hh](?>[Tt]{2})[Pp](?>[Ss]?)://[Ff](?>[Oo]{2})")]
[InlineData("(?i)(?:abc|abcd|abce|abcfg)h", "(?i)abc(?:|[de]|fg)h")]
[InlineData("(?i)(?:ab|abc|abcd)e", "(?i)ab(?:c(?>d?))??e")]
// Non-IgnoreCase node prefix extraction with single-node branch handling
[InlineData("(?:[ab][0-9]|[ab])x", "[ab](?>[0-9]?)x")]
[InlineData("(?:\\w\\d|\\w)x", "\\w(?>\\d?)x")]
// Non-IgnoreCase text prefix extraction (regression guards)
[InlineData("(?:http|https)://foo", "http(?>s?)://foo")]
[InlineData("(?:ab|abc)d", "ab(?>c?)d")]
[InlineData("(?:abc|abcd|abce|abcfg)h", "abc(?:|[de]|fg)h")]
public void PatternsReduceIdentically(string actual, string expected)
{
// NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project.
Expand Down Expand Up @@ -643,6 +655,9 @@ public void PatternsReduceIdentically(string actual, string expected)
[InlineData(@"\b\B", "\b")]
[InlineData(@"^$", "^")]
[InlineData(@"^$", "$")]
// After alternation prefix extraction, the extra characters of the longer branch must stay optional, so these patterns must not reduce identically to the non-optional pattern
[InlineData("(?i)(?:ab|abc)d", "(?i)abcd")]
[InlineData("(?:[ab][0-9]|[ab])x", "[ab][0-9]x")]
public void PatternsReduceDifferently(string actual, string expected)
{
// NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project.
Expand Down
Loading