diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 78d529b7baee72..fd1f97ec7a9bb5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -1193,24 +1193,16 @@ static RegexNode ExtractCommonPrefixNode(RegexNode alternation) return alternation; } - // Only handle the case where each branch is a concatenation - foreach (RegexNode child in children) - { - if (child.Kind != RegexNodeKind.Concatenate || child.ChildCount() < 2) - { - return alternation; - } - } - for (int startingIndex = 0; startingIndex < children.Count - 1; startingIndex++) { - Debug.Assert(children[startingIndex].Children is List { Count: >= 2 }); - // Only handle the case where each branch begins with the same One, Notone, Set (individual or loop), or Anchor. // Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing // it for non-atomic variable length loops could change behavior as each branch could otherwise have a // different number of characters consumed by the loop based on what's after it. - RegexNode required = children[startingIndex].Child(0); + // A branch may be either a Concatenation (get its first child) or a single node (e.g., a Set + // that was reduced from a single-child Concatenation after prior prefix extraction). + RegexNode startingNode = children[startingIndex]; + RegexNode required = startingNode.Kind == RegexNodeKind.Concatenate ? startingNode.Child(0) : startingNode; switch (required.Kind) { case RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set: @@ -1230,7 +1222,8 @@ or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary int endingIndex = startingIndex + 1; for (; endingIndex < children.Count; endingIndex++) { - RegexNode other = children[endingIndex].Child(0); + RegexNode endingNode = children[endingIndex]; + RegexNode other = endingNode.Kind == RegexNodeKind.Concatenate ? endingNode.Child(0) : endingNode; if (required.Kind != other.Kind || required.Options != other.Options || required.M != other.M || @@ -1252,8 +1245,16 @@ or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary var newAlternate = new RegexNode(RegexNodeKind.Alternate, alternation.Options); for (int i = startingIndex; i < endingIndex; i++) { - ((List)children[i].Children!).RemoveAt(0); - newAlternate.AddChild(children[i]); + if (children[i].Kind == RegexNodeKind.Concatenate) + { + ((List)children[i].Children!).RemoveAt(0); + newAlternate.AddChild(children[i]); + } + else + { + // The entire branch was the extracted prefix; what remains is Empty. + newAlternate.AddChild(new RegexNode(RegexNodeKind.Empty, children[i].Options)); + } } // If this alternation is wrapped as atomic, we need to do the same for the new alternation. diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index bc2cbc9536e1ff..2e3e1d1ee5350a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -849,6 +849,11 @@ public static IEnumerable Match_MemberData() yield return (@"a\wc|\wgh|de\w", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper); yield return (@"a\wc|\wgh|de\w", upper, RegexOptions.None, 0, input.Length, false, ""); } + // Alternation prefix extraction with IgnoreCase: correctness after single-node branch handling + yield return (@"(?:http|https)://foo", "HTTP://FOO", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 10, true, "HTTP://FOO"); + yield return (@"(?:http|https)://foo", "HTTPS://FOO", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 11, true, "HTTPS://FOO"); + yield return (@"(?:http|https)://foo", "ftp://foo", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 9, false, ""); + yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest"); yield return (@"(\w+|\d+)a+[ab]+", "123123aa", RegexOptions.None, 0, 8, true, "123123aa"); diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs index ca75e1abd4404f..c5e117440ed5b9 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs @@ -120,6 +120,25 @@ public void TrailingAnchor(string pattern, int options, int expectedMode, int ex [InlineData(@"(?<=cd)ab", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")] [InlineData(@"\bab(?=\w)(?!=\d)c\b", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")] [InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] + // Alternation branches differing by one trailing character: prefix extraction should include all shared characters + [InlineData(@"(?:http|https)://foo", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "http")] + [InlineData(@"(?:http|https)://foo", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "http")] + // Alternation where shorter branch is just the shared prefix + [InlineData(@"(?:ab|abc)d", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] + // Alternation where branches differ by more than one character + [InlineData(@"(?:abc|abcdef)g", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] + [InlineData(@"(?:abc|abcdef)g", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")] + // Three-branch alternation with shared prefix and different lengths + [InlineData(@"(?:ab|abc|abcd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] + [InlineData(@"(?:ab|abc|abcd)e", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")] + // Three-branch alternation with shared prefix and different trailing characters + [InlineData(@"(?:ab|abc|abd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] + [InlineData(@"(?:ab|abc|abd)e", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")] + // Case-sensitive alternation with branches differing by one (handled by ExtractCommonPrefixText, not Node, but verifies no regression) + [InlineData(@"(?:ab|abc)d", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")] + // Four-branch alternation mixing single-node and Concat branches after IgnoreCase prefix extraction + [InlineData(@"(?:abc|abcd|abce|abcfg)h", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] + [InlineData(@"(?:abc|abcd|abce|abcfg)h", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")] public void LeadingPrefix(string pattern, int options, int expectedMode, string expectedPrefix) { RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options); @@ -138,6 +157,12 @@ public void LeadingPrefix(string pattern, int options, int expectedMode, string [InlineData(@"ab|cd|ef|gh", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "bdfh")] [InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)(RegexOptions.IgnoreCase | RegexOptions.RightToLeft), (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "Cc")] [InlineData(@"ab|(abc)|(abcd)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "bcd")] + // Non-IgnoreCase Set-node branch: single-node branch after prefix extraction of character class + [InlineData(@"(?:[ab][0-9]|[ab])x", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "ab")] + // Single-node before Concat branch (reversed order) + [InlineData(@"(?:[ab]|[ab][0-9])x", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "ab")] + // IgnoreCase Set-node branch: prefix extraction across set-expanded branches + [InlineData(@"(?:a|ab)c", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "Aa")] public void LeadingSet(string pattern, int options, int expectedMode, string expectedChars) { RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options); diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index 14de90e63f5caa..f801a188d8c76b 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -463,6 +463,18 @@ public class RegexReductionTests [InlineData("(?i)\\d", "\\d")] [InlineData("(?i).", ".")] [InlineData("(?i)\\$", "\\$")] + // IgnoreCase node prefix extraction with single-node branch handling + [InlineData("(?i)(?:ab|abc)d", "(?i)ab(?>c?)d")] + [InlineData("(?i)(?:http|https)://foo", "[Hh](?>[Tt]{2})[Pp](?>[Ss]?)://[Ff](?>[Oo]{2})")] + [InlineData("(?i)(?:abc|abcd|abce|abcfg)h", "(?i)abc(?:|[de]|fg)h")] + [InlineData("(?i)(?:ab|abc|abcd)e", "(?i)ab(?:c(?>d?))??e")] + // Non-IgnoreCase node prefix extraction with single-node branch handling + [InlineData("(?:[ab][0-9]|[ab])x", "[ab](?>[0-9]?)x")] + [InlineData("(?:\\w\\d|\\w)x", "\\w(?>\\d?)x")] + // Non-IgnoreCase text prefix extraction (regression guards) + [InlineData("(?:http|https)://foo", "http(?>s?)://foo")] + [InlineData("(?:ab|abc)d", "ab(?>c?)d")] + [InlineData("(?:abc|abcd|abce|abcfg)h", "abc(?:|[de]|fg)h")] public void PatternsReduceIdentically(string actual, string expected) { // NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project. @@ -643,6 +655,9 @@ public void PatternsReduceIdentically(string actual, string expected) [InlineData(@"\b\B", "\b")] [InlineData(@"^$", "^")] [InlineData(@"^$", "$")] + // After alternation prefix extraction, optional patterns should differ from non-optional + [InlineData("(?i)(?:ab|abc)d", "(?i)abcd")] + [InlineData("(?:[ab][0-9]|[ab])x", "[ab][0-9]x")] public void PatternsReduceDifferently(string actual, string expected) { // NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project.