From e74a52fc625ec5228bc48670ae75388dbc5ddb2b Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Wed, 31 Jan 2024 00:13:33 -0500 Subject: [PATCH 1/2] Fix regex lazy loop handling of backtracking state at max iteration limit Upon entering a lazy loop, state is pushed onto the backtracking stack if the lazy loop might be backtracked into. That state is then dutifully popped off when unwinding the loop due to failure to match. However, if the loop successfully matches its maximum number of times but still fails because of a failure after the lazy loop, the state still needs to be popped off the stack, but isn't. This fixes that. --- .../gen/RegexGenerator.Emitter.cs | 12 +++++++++ .../Text/RegularExpressions/RegexCompiler.cs | 26 +++++++++++++++++-- .../FunctionalTests/Regex.Match.Tests.cs | 2 ++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 19488703f07f7e..efe4f0e40b4390 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -3770,6 +3770,17 @@ void EmitLazy(RegexNode node) using (clause) { + // We're backtracking, which could either be to something prior to the lazy loop or to something + // inside of the lazy loop. If it's to something inside of the lazy loop, then either the loop + // will eventually succeed or we'll eventually end up unwinding back through the iterations all + // the way back to the loop not matching at all, in which case the state we first pushed on at the + // beginning of the !isAtomic section will get popped off. But if here we're instead going to jump + // to something prior to the lazy loop, then we need to pop off that state here. + if (doneLabel == originalDoneLabel) + { + EmitAdd(writer, "stackpos", -entriesPerIteration); + } + if (iterationMayBeEmpty) { // If we saw empty, it must have been in the most recent iteration, as we wouldn't have @@ -3777,6 +3788,7 @@ void EmitLazy(RegexNode node) // false prior to backtracking / undoing that iteration. writer.WriteLine($"{sawEmpty} = 0; // false"); } + Goto(doneLabel); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 20ccc3afefcca6..fb710b26788732 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -4075,6 +4075,8 @@ void EmitLazy(RegexNode node) // Determine where to branch, either back to the lazy loop body to add an additional iteration, // or to the last backtracking label. + Label jumpToDone = DefineLabel(); + if (iterationMayBeEmpty) { // if (sawEmpty != 0) @@ -4093,7 +4095,7 @@ void EmitLazy(RegexNode node) Ldc(0); Stloc(sawEmpty!); - BrFar(doneLabel); + Br(jumpToDone); MarkLabel(sawEmptyZero); } @@ -4102,12 +4104,32 @@ void EmitLazy(RegexNode node) // if (iterationCount >= maxIterations) goto doneLabel; Ldloc(iterationCount); Ldc(maxIterations); - BgeFar(doneLabel); + Bge(jumpToDone); } // goto body; BrFar(body); + MarkLabel(jumpToDone); + + // We're backtracking, which could either be to something prior to the lazy loop or to something + // inside of the lazy loop. If it's to something inside of the lazy loop, then either the loop + // will eventually succeed or we'll eventually end up unwinding back through the iterations all + // the way back to the loop not matching at all, in which case the state we first pushed on at the + // beginning of the !isAtomic section will get popped off. But if here we're instead going to jump + // to something prior to the lazy loop, then we need to pop off that state here. + if (doneLabel == originalDoneLabel) + { + // stackpos -= entriesPerIteration; + Ldloc(stackpos); + Ldc(entriesPerIteration); + Sub(); + Stloc(stackpos); + } + + // goto done; + BrFar(doneLabel); + doneLabel = backtrack; MarkLabel(skipBacktrack); } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 43bd39f8f36779..dca759906c5283 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2374,6 +2374,8 @@ public static IEnumerable AllMatches_TestData() yield return new object[] { engine, "^(?:aaa|aa){1,5}?$", RegexOptions.None, "aaaaaaaa", new (int, int, string)[] { (0, 8, "aaaaaaaa") } }; yield return new object[] { engine, "^(?:aaa|aa){4}$", RegexOptions.None, "aaaaaaaa", new (int, int, string)[] { (0, 8, "aaaaaaaa") } }; yield return new object[] { engine, "^(?:aaa|aa){4}?$", RegexOptions.None, "aaaaaaaa", new (int, int, string)[] { (0, 8, "aaaaaaaa") } }; + yield return new object[] { engine, "^((?:(xx?,xx?)|xx?,xx?>xx?,xx?)-?(x)??){1,2}$", RegexOptions.None, "xx,xx>xx,xx", new (int, int, string)[] { (0, 11, "xx,xx>xx,xx") } }; + yield return new object[] { engine, "^(?:x|(x)??){2}$", RegexOptions.None, "x>", null }; // Mostly empty matches using unusual regexes consisting mostly of anchors only yield return new object[] { engine, "^", RegexOptions.None, "", new (int, int, string)[] { (0, 0, "") } }; From bd5da68e1e4c6f06051b97d8a2bf6f1530c6afad Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Sat, 3 Feb 2024 13:04:02 -0500 Subject: [PATCH 2/2] Add a few more test variations --- .../tests/FunctionalTests/Regex.Match.Tests.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index dca759906c5283..d09f45ac2064c4 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2375,6 +2375,9 @@ public static IEnumerable AllMatches_TestData() yield return new object[] { engine, "^(?:aaa|aa){4}$", RegexOptions.None, "aaaaaaaa", new (int, int, string)[] { (0, 8, "aaaaaaaa") } }; yield return new object[] { engine, "^(?:aaa|aa){4}?$", RegexOptions.None, "aaaaaaaa", new (int, int, string)[] { (0, 8, "aaaaaaaa") } }; yield return new object[] { engine, "^((?:(xx?,xx?)|xx?,xx?>xx?,xx?)-?(x)??){1,2}$", RegexOptions.None, "xx,xx>xx,xx", new (int, int, string)[] { (0, 11, "xx,xx>xx,xx") } }; + yield return new object[] { engine, "^((?:(xx?,xx?)|xx?,xx?>xx?,xx?)-?(x*)??){1,2}$", RegexOptions.None, "xx,xx>xx,xx", new (int, int, string)[] { (0, 11, "xx,xx>xx,xx") } }; + yield return new object[] { engine, "^((?:(xx?,xx?)|xx?,xx?>xx?,xx?)-?(x+)??){1,2}$", RegexOptions.None, "xx,xx>xx,xx", new (int, int, string)[] { (0, 11, "xx,xx>xx,xx") } }; + yield return new object[] { engine, "^((?:(x(x?),x(x?))|xx?,(x(x?)>x(x?)),(x((x))?))-?(x+?)??){1,2}$", RegexOptions.None, "xx,xx>xx,xx", new (int, int, string)[] { (0, 11, "xx,xx>xx,xx") } }; yield return new object[] { engine, "^(?:x|(x)??){2}$", RegexOptions.None, "x>", null }; // Mostly empty matches using unusual regexes consisting mostly of anchors only