From 121454842bcab07e3edfeeeb2b41f01d46172757 Mon Sep 17 00:00:00 2001 From: Dan Moseley Date: Wed, 25 Feb 2026 14:32:12 -0700 Subject: [PATCH 1/3] Handle single-node branches in ExtractCommonPrefixNode When alternation branches are reduced to single nodes (e.g., Set[Pp] from a single-child Concatenation after prior prefix extraction), ExtractCommonPrefixNode previously bailed because it required all branches to be Concatenations. This caused IgnoreCase alternation prefix extraction to stop one character short (e.g., 'htt' instead of 'http' for (http|https)). Fix: remove the upfront gate check, and handle both Concatenation and single-node branches throughout the extraction loop. Fixes dotnet#124871 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Text/RegularExpressions/RegexNode.cs | 31 ++++++++++--------- .../UnitTests/RegexFindOptimizationsTests.cs | 12 +++++++ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 78d529b7baee72..fd1f97ec7a9bb5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -1193,24 +1193,16 @@ static RegexNode ExtractCommonPrefixNode(RegexNode alternation) return alternation; } - // Only handle the case where each branch is a concatenation - foreach (RegexNode child in children) - { - if (child.Kind != RegexNodeKind.Concatenate || child.ChildCount() < 2) - { - return alternation; - } - } - for (int startingIndex = 0; startingIndex < children.Count - 1; startingIndex++) { - Debug.Assert(children[startingIndex].Children is List { Count: >= 2 }); - // Only handle the case where each branch begins with the same One, Notone, Set (individual or loop), or Anchor. // Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing // it for non-atomic variable length loops could change behavior as each branch could otherwise have a // different number of characters consumed by the loop based on what's after it. - RegexNode required = children[startingIndex].Child(0); + // A branch may be either a Concatenation (get its first child) or a single node (e.g., a Set + // that was reduced from a single-child Concatenation after prior prefix extraction). + RegexNode startingNode = children[startingIndex]; + RegexNode required = startingNode.Kind == RegexNodeKind.Concatenate ? startingNode.Child(0) : startingNode; switch (required.Kind) { case RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set: @@ -1230,7 +1222,8 @@ or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary int endingIndex = startingIndex + 1; for (; endingIndex < children.Count; endingIndex++) { - RegexNode other = children[endingIndex].Child(0); + RegexNode endingNode = children[endingIndex]; + RegexNode other = endingNode.Kind == RegexNodeKind.Concatenate ? endingNode.Child(0) : endingNode; if (required.Kind != other.Kind || required.Options != other.Options || required.M != other.M || @@ -1252,8 +1245,16 @@ or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary var newAlternate = new RegexNode(RegexNodeKind.Alternate, alternation.Options); for (int i = startingIndex; i < endingIndex; i++) { - ((List)children[i].Children!).RemoveAt(0); - newAlternate.AddChild(children[i]); + if (children[i].Kind == RegexNodeKind.Concatenate) + { + ((List)children[i].Children!).RemoveAt(0); + newAlternate.AddChild(children[i]); + } + else + { + // The entire branch was the extracted prefix; what remains is Empty. + newAlternate.AddChild(new RegexNode(RegexNodeKind.Empty, children[i].Options)); + } } // If this alternation is wrapped as atomic, we need to do the same for the new alternation. diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs index ca75e1abd4404f..bd7c687aa05f8e 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs @@ -120,6 +120,18 @@ public void TrailingAnchor(string pattern, int options, int expectedMode, int ex [InlineData(@"(?<=cd)ab", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")] [InlineData(@"\bab(?=\w)(?!=\d)c\b", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")] [InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] + // Alternation branches differing by one trailing character: prefix extraction should include all shared characters + [InlineData(@"(?:http|https)://foo", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "http")] + // Alternation where shorter branch is just the shared prefix + [InlineData(@"(?:ab|abc)d", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] + // Alternation where branches differ by more than one character + [InlineData(@"(?:abc|abcdef)g", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] + // Three-branch alternation with shared prefix and different lengths + [InlineData(@"(?:ab|abc|abcd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] + // Three-branch alternation with shared prefix and different trailing characters + [InlineData(@"(?:ab|abc|abd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] + // Case-sensitive alternation with branches differing by one (handled by ExtractCommonPrefixText, not Node, but verifies no regression) + [InlineData(@"(?:ab|abc)d", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")] public void LeadingPrefix(string pattern, int options, int expectedMode, string expectedPrefix) { RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options); From 5aa5d8b8def8551a1aae64f8093b81f35da83d28 Mon Sep 17 00:00:00 2001 From: Dan Moseley Date: Wed, 25 Feb 2026 19:03:56 -0700 Subject: [PATCH 2/3] Add tests for single-node branch handling in ExtractCommonPrefixNode - Behavioral correctness test for (?:http|https)://foo with IgnoreCase - 4-branch regression test exercising single-node branch after recursive prefix extraction - Non-IgnoreCase Set-node branch test via character class alternation Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../tests/FunctionalTests/Regex.Match.Tests.cs | 5 +++++ .../tests/UnitTests/RegexFindOptimizationsTests.cs | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index bc2cbc9536e1ff..2e3e1d1ee5350a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -849,6 +849,11 @@ public static IEnumerable Match_MemberData() yield return (@"a\wc|\wgh|de\w", upper, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, input.Length, true, upper); yield return (@"a\wc|\wgh|de\w", upper, RegexOptions.None, 0, input.Length, false, ""); } + // Alternation prefix extraction with IgnoreCase: correctness after single-node branch handling + yield return (@"(?:http|https)://foo", "HTTP://FOO", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 10, true, "HTTP://FOO"); + yield return (@"(?:http|https)://foo", "HTTPS://FOO", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 11, true, "HTTPS://FOO"); + yield return (@"(?:http|https)://foo", "ftp://foo", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 9, false, ""); + yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest"); yield return (@"(\w+|\d+)a+[ab]+", "123123aa", RegexOptions.None, 0, 8, true, "123123aa"); diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs index bd7c687aa05f8e..8a609f3d96c368 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs @@ -132,6 +132,8 @@ public void TrailingAnchor(string pattern, int options, int expectedMode, int ex [InlineData(@"(?:ab|abc|abd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] // Case-sensitive alternation with branches differing by one (handled by ExtractCommonPrefixText, not Node, but verifies no regression) [InlineData(@"(?:ab|abc)d", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")] + // Four-branch alternation mixing single-node and Concat branches after IgnoreCase prefix extraction + [InlineData(@"(?:abc|abcd|abce|abcfg)h", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] public void LeadingPrefix(string pattern, int options, int expectedMode, string expectedPrefix) { RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options); @@ -150,6 +152,8 @@ public void LeadingPrefix(string pattern, int options, int expectedMode, string [InlineData(@"ab|cd|ef|gh", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "bdfh")] [InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)(RegexOptions.IgnoreCase | RegexOptions.RightToLeft), (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "Cc")] [InlineData(@"ab|(abc)|(abcd)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "bcd")] + // Non-IgnoreCase Set-node branch: single-node branch after prefix extraction of character class + [InlineData(@"(?:[ab][0-9]|[ab])x", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "ab")] public void LeadingSet(string pattern, int options, int expectedMode, string expectedChars) { RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options); From 1399ca2bfde0e593396c23832ae03a894611b5eb Mon Sep 17 00:00:00 2001 From: Dan Moseley Date: Thu, 26 Feb 2026 09:20:17 -0700 Subject: [PATCH 3/3] Add tree-shape and non-IgnoreCase tests for single-node branch handling Address review feedback: - Add PatternsReduceIdentically tests validating reduced tree shapes for IgnoreCase and non-IgnoreCase node prefix extraction - Add PatternsReduceDifferently tests confirming optional vs required patterns produce distinct trees - Add non-IgnoreCase variants of LeadingPrefix test cases - Add LeadingSet tests for reversed branch order and IgnoreCase sets Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../UnitTests/RegexFindOptimizationsTests.cs | 9 +++++++++ .../tests/UnitTests/RegexReductionTests.cs | 15 +++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs index 8a609f3d96c368..c5e117440ed5b9 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs @@ -122,18 +122,23 @@ public void TrailingAnchor(string pattern, int options, int expectedMode, int ex [InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] // Alternation branches differing by one trailing character: prefix extraction should include all shared characters [InlineData(@"(?:http|https)://foo", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "http")] + [InlineData(@"(?:http|https)://foo", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "http")] // Alternation where shorter branch is just the shared prefix [InlineData(@"(?:ab|abc)d", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] // Alternation where branches differ by more than one character [InlineData(@"(?:abc|abcdef)g", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] + [InlineData(@"(?:abc|abcdef)g", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")] // Three-branch alternation with shared prefix and different lengths [InlineData(@"(?:ab|abc|abcd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] + [InlineData(@"(?:ab|abc|abcd)e", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")] // Three-branch alternation with shared prefix and different trailing characters [InlineData(@"(?:ab|abc|abd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] + [InlineData(@"(?:ab|abc|abd)e", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")] // Case-sensitive alternation with branches differing by one (handled by ExtractCommonPrefixText, not Node, but verifies no regression) [InlineData(@"(?:ab|abc)d", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "ab")] // Four-branch alternation mixing single-node and Concat branches after IgnoreCase prefix extraction [InlineData(@"(?:abc|abcd|abce|abcfg)h", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] + [InlineData(@"(?:abc|abcd|abce|abcfg)h", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")] public void LeadingPrefix(string pattern, int options, int expectedMode, string expectedPrefix) { RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options); @@ -154,6 +159,10 @@ public void LeadingPrefix(string pattern, int options, int expectedMode, string [InlineData(@"ab|(abc)|(abcd)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingSet_RightToLeft, "bcd")] // Non-IgnoreCase Set-node branch: single-node branch after prefix extraction of character class [InlineData(@"(?:[ab][0-9]|[ab])x", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "ab")] + // Single-node before Concat branch (reversed order) + [InlineData(@"(?:[ab]|[ab][0-9])x", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "ab")] + // IgnoreCase Set-node branch: prefix extraction across set-expanded branches + [InlineData(@"(?:a|ab)c", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "Aa")] public void LeadingSet(string pattern, int options, int expectedMode, string expectedChars) { RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options); diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index 14de90e63f5caa..f801a188d8c76b 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -463,6 +463,18 @@ public class RegexReductionTests [InlineData("(?i)\\d", "\\d")] [InlineData("(?i).", ".")] [InlineData("(?i)\\$", "\\$")] + // IgnoreCase node prefix extraction with single-node branch handling + [InlineData("(?i)(?:ab|abc)d", "(?i)ab(?>c?)d")] + [InlineData("(?i)(?:http|https)://foo", "[Hh](?>[Tt]{2})[Pp](?>[Ss]?)://[Ff](?>[Oo]{2})")] + [InlineData("(?i)(?:abc|abcd|abce|abcfg)h", "(?i)abc(?:|[de]|fg)h")] + [InlineData("(?i)(?:ab|abc|abcd)e", "(?i)ab(?:c(?>d?))??e")] + // Non-IgnoreCase node prefix extraction with single-node branch handling + [InlineData("(?:[ab][0-9]|[ab])x", "[ab](?>[0-9]?)x")] + [InlineData("(?:\\w\\d|\\w)x", "\\w(?>\\d?)x")] + // Non-IgnoreCase text prefix extraction (regression guards) + [InlineData("(?:http|https)://foo", "http(?>s?)://foo")] + [InlineData("(?:ab|abc)d", "ab(?>c?)d")] + [InlineData("(?:abc|abcd|abce|abcfg)h", "abc(?:|[de]|fg)h")] public void PatternsReduceIdentically(string actual, string expected) { // NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project. @@ -643,6 +655,9 @@ public void PatternsReduceIdentically(string actual, string expected) [InlineData(@"\b\B", "\b")] [InlineData(@"^$", "^")] [InlineData(@"^$", "$")] + // After alternation prefix extraction, optional patterns should differ from non-optional + [InlineData("(?i)(?:ab|abc)d", "(?i)abcd")] + [InlineData("(?:[ab][0-9]|[ab])x", "[ab][0-9]x")] public void PatternsReduceDifferently(string actual, string expected) { // NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project.