From 7f8f2165776f48a827651ce60bf3f88544374d6c Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Wed, 24 Jan 2024 13:27:04 -0500 Subject: [PATCH] Fix handling of capture groups inside of negative lookarounds The Regex compiler and source generator weren't uncapturing captures inside of a negative lookaround. That then leads both to subsequent backreferences matching when they shouldn't. --- .../gen/RegexGenerator.Emitter.cs | 33 ++++++++++++- .../Text/RegularExpressions/RegexCompiler.cs | 49 ++++++++++++++++++- .../FunctionalTests/Regex.Match.Tests.cs | 6 +++ 3 files changed, 85 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 27dc17ff9d800d..a4c4d18d553c40 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -2574,7 +2574,8 @@ void EmitNegativeLookaroundAssertion(RegexNode node) string originalDoneLabel = doneLabel; // Save off pos. We'll need to reset this upon successful completion of the lookaround. - string startingPos = ReserveName((node.Options & RegexOptions.RightToLeft) != 0 ? "negativelookbehind_starting_pos" : "negativelookahead_starting_pos"); + string variablePrefix = (node.Options & RegexOptions.RightToLeft) != 0 ? "negativelookbehind_" : "negativelookahead_"; + string startingPos = ReserveName($"{variablePrefix}_starting_pos"); writer.WriteLine($"int {startingPos} = pos;"); int startingSliceStaticPos = sliceStaticPos; @@ -2585,8 +2586,30 @@ void EmitNegativeLookaroundAssertion(RegexNode node) // technically backtracking, it's appropriate to have a timeout check. EmitTimeoutCheckIfNeeded(writer, rm); - // Emit the child. RegexNode child = node.Child(0); + + // Ensure we're able to uncapture anything captured by the child. + bool isInLoop = false; + string? capturePos = null; + bool hasCaptures = rm.Analysis.MayContainCapture(child); + if (hasCaptures) + { + // If we're inside a loop, push the current crawl position onto the stack, + // so that each iteration tracks its own value. Otherwise, store it into a local. + isInLoop = rm.Analysis.IsInLoop(node); + if (isInLoop) + { + EmitStackPush("base.Crawlpos()"); + } + else + { + capturePos = ReserveName($"{variablePrefix}_capture_pos"); + additionalDeclarations.Add($"int {capturePos} = 0;"); + writer.WriteLine($"{capturePos} = base.Crawlpos();"); + } + } + + // Emit the child. if (rm.Analysis.MayBacktrack(child)) { // Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack. @@ -2611,6 +2634,12 @@ void EmitNegativeLookaroundAssertion(RegexNode node) SliceInputSpan(); sliceStaticPos = startingSliceStaticPos; + // And uncapture anything if necessary. Negative lookaround captures don't persist beyond the lookaround. + if (hasCaptures) + { + EmitUncaptureUntil(isInLoop ? StackPop() : capturePos!); + } + doneLabel = originalDoneLabel; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 45fb0e8a288821..779b8be2d897a2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -2577,8 +2577,41 @@ void EmitNegativeLookaroundAssertion(RegexNode node) // technically backtracking, it's appropriate to have a timeout check. EmitTimeoutCheckIfNeeded(); - // Emit the child. RegexNode child = node.Child(0); + + // Ensure we're able to uncapture anything captured by the child. + // Note that this differs ever so slightly from the source generator. The source + // generator only defines a local for capturePos if not in a loop (as it calls to a helper + // method where the argument acts implicitly as a local), but the compiler + // needs to store the popped stack value somewhere so that it can repeatedly compare + // that value against Crawlpos, so capturePos is always declared if there are captures. + bool isInLoop = false; + LocalBuilder? capturePos = analysis.MayContainCapture(child) ? DeclareInt32() : null; + if (capturePos is not null) + { + // If we're inside a loop, push the current crawl position onto the stack, + // so that each iteration tracks its own value. Otherwise, store it into a local. + isInLoop = analysis.IsInLoop(node); + if (isInLoop) + { + EmitStackResizeIfNeeded(1); + EmitStackPush(() => + { + // base.Crawlpos(); + Ldthis(); + Call(s_crawlposMethod); + }); + } + else + { + // capturePos = base.Crawlpos(); + Ldthis(); + Call(s_crawlposMethod); + Stloc(capturePos); + } + } + + // Emit the child. if (analysis.MayBacktrack(child)) { // Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack. @@ -2608,6 +2641,20 @@ void EmitNegativeLookaroundAssertion(RegexNode node) SliceInputSpan(); sliceStaticPos = startingTextSpanPos; + // And uncapture anything if necessary. Negative lookaround captures don't persist beyond the lookaround. + if (capturePos is not null) + { + if (isInLoop) + { + // capturepos = base.runstack[--stackpos]; + EmitStackPop(); + Stloc(capturePos); + } + + // while (base.Crawlpos() > capturepos) base.Uncapture(); + EmitUncaptureUntil(capturePos); + } + doneLabel = originalDoneLabel; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 63ae7584fdfb05..f72b477c82011e 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -88,6 +88,10 @@ public static IEnumerable Match_MemberData() { // Zero-width negative lookahead assertion yield return (@"abc(?!XXX)\w+", "abcXXXdef", RegexOptions.None, 0, 9, false, string.Empty); + yield return (@"(?!(b)b)\1", "ba", RegexOptions.None, 0, 2, false, string.Empty); + yield return (@"(?:(?!(b)b)\1a)+", "babababa", RegexOptions.None, 0, 8, false, string.Empty); + yield return (@"(?:(?!(b)b)\1a)*", "babababa", RegexOptions.None, 0, 8, true, string.Empty); + yield return (@"(.*?)a(?!(a+)b\2c)\2(.*)", "baaabaac", RegexOptions.None, 0, 8, false, string.Empty); // Zero-width positive lookbehind assertion yield return (@"(\w){6}(?<=XXX)def", "abcXXXdef", RegexOptions.None, 0, 9, true, "abcXXXdef"); @@ -114,6 +118,8 @@ public static IEnumerable Match_MemberData() yield return (@"(?<=\d?)a{4}", "123aaaaaaaaa", RegexOptions.None, 0, 12, true, "aaaa"); yield return (@"(?<=a{3,5}[ab]*)1234", "aaaaaaa1234", RegexOptions.None, 0, 11, true, "1234"); yield return (@"(\w)*?3(?<=33)$", "1233", RegexOptions.None, 0, 4, true, "1233"); + yield return (@"(?=(\d))4\1", "44", RegexOptions.None, 0, 2, true, "44"); + yield return (@"(?=(\d))4\1", "43", RegexOptions.None, 0, 2, false, ""); // Zero-width negative lookbehind assertion: Actual - "(\\w){6}(?