From 3e92348d9dc4f2e08e3cdf97ece5d48fe30b80e6 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Sat, 26 Jul 2025 21:09:10 -0400 Subject: [PATCH] Remove positive lookarounds that wrap only zero-width assertions A positive lookahead effectively changes its contents to be zero-width. If the contents are already zero-width, the lookaround adds no value. --- .../Text/RegularExpressions/RegexNode.cs | 39 ++++++++++++++----- .../RegularExpressions/RegexPrefixAnalyzer.cs | 3 +- .../UnitTests/RegexFindOptimizationsTests.cs | 2 +- .../tests/UnitTests/RegexReductionTests.cs | 10 +++++ 4 files changed, 41 insertions(+), 13 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 7f010f3fdd62e5..572f097edcfdc0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2049,23 +2049,42 @@ private RegexNode ReduceLookaround() // eliminate any ending backtracking from it. EliminateEndingBacktracking(); - // A positive lookaround wrapped around an empty is a nop, and we can reduce it - // to simply Empty. A developer typically doesn't write this, but rather it evolves - // due to optimizations resulting in empty. - - // A negative lookaround wrapped around an empty child, i.e. (?!), is - // sometimes used as a way to insert a guaranteed no-match into the expression, - // often as part of a conditional. We can reduce it to simply Nothing. + RegexNode child = Child(0); - if (Child(0).Kind == RegexNodeKind.Empty) + // A positive lookahead that wraps a zero-width assertion is useless wrapping and can be removed. + // Similarly, a positive lookaround wrapped around an empty can be reduced simply to Empty. 
+ // A developer typically doesn't write this, but rather it evolves due to optimizations resulting in empty. + if (Kind is RegexNodeKind.PositiveLookaround) { - Kind = Kind == RegexNodeKind.PositiveLookaround ? RegexNodeKind.Empty : RegexNodeKind.Nothing; - Children = null; + if (((Options & RegexOptions.RightToLeft) == 0 && IsZeroWidthAssertion(child.Kind)) || + child.Kind is RegexNodeKind.Empty) + { + return child; + } + } + else if (Kind is RegexNodeKind.NegativeLookaround) + { + // A negative lookaround wrapped around an empty child, i.e. (?!), is + // sometimes used as a way to insert a guaranteed no-match into the expression, + // often as part of a conditional. We can reduce it to simply Nothing. + if (child.Kind is RegexNodeKind.Empty) + { + Kind = RegexNodeKind.Nothing; + Children = null; + } } return this; } + private static bool IsZeroWidthAssertion(RegexNodeKind kind) => kind is + RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround or + RegexNodeKind.Beginning or RegexNodeKind.Start or + RegexNodeKind.Bol or RegexNodeKind.Eol or + RegexNodeKind.End or RegexNodeKind.EndZ or + RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or + RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary; + /// Optimizations for backreference conditionals. 
private RegexNode ReduceBackreferenceConditional() { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 93ab67a2c392a0..34e5a84c85a5da 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -1354,8 +1354,6 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le case RegexNodeKind.Start: case RegexNodeKind.EndZ: case RegexNodeKind.End: - case RegexNodeKind.Boundary: - case RegexNodeKind.ECMABoundary: // Return any anchor found. return node.Kind; @@ -1389,6 +1387,7 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le { case RegexNodeKind.Empty or RegexNodeKind.NegativeLookaround: case RegexNodeKind.PositiveLookaround when ((node.Options | tmpChild.Options) & RegexOptions.RightToLeft) != 0: + case RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary: // Skip over zero-width assertions. 
continue; diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs index 8501700b31c180..856f3d85d52627 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs @@ -34,7 +34,7 @@ public class RegexFindOptimizationsTests [InlineData(@"(?=^)abc", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)] [InlineData(@"(?=.*$)abc", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight)] [InlineData(@"(?=^)abc", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft)] - [InlineData(@"abc(?=^)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft)] + [InlineData(@"abc(?=^)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning)] [InlineData(@"(?