-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Enable lookarounds to influence atomicity #118153
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
As part of our auto-atomicity handling, today we give up when the subsequent node is a lookaround. This improves it to support the case when the subsequent node is a positive lookahead.
|
@MihuBot regexdiff |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pull Request Overview
This PR enhances the auto-atomicity optimization in regex processing to support positive lookahead assertions. Previously, the system would give up on atomicity optimizations when encountering any lookaround construct, but this change specifically enables optimization for positive lookaheads.
- Adds support for positive lookahead assertions in the `CanBeMadeAtomic` method
- Extends test coverage for various combinations of quantifiers with lookaround assertions
- Improves regex performance by enabling atomic grouping in more scenarios
Reviewed Changes
Copilot reviewed 3 out of 3 changed files in this pull request and generated 1 comment.
| File | Description |
|---|---|
| RegexNode.cs | Adds logic to handle positive lookahead assertions in atomicity determination |
| RegexReductionTests.cs | Adds test cases verifying that patterns with lookarounds are correctly optimized to atomic groups |
| Regex.Match.Tests.cs | Extensive test coverage for various quantifier and lookaround combinations to ensure correct matching behavior |
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
Show resolved
Hide resolved
|
Tagging subscribers to this area: @dotnet/area-system-text-regularexpressions |
|
@MihuBot regexdiff |
|
85 out of 18857 patterns have generated source code changes.
Examples of GeneratedRegex source diffs
"(?<=\"code\":)(\\d+)(?=,)" (384 uses)
[GeneratedRegex("(?<=\"code\":)(\\d+)(?=,)")]
/// ○ Zero-width positive lookbehind.<br/>
/// ○ Match the string "\"code\":" right-to-left.<br/>
/// ○ 1st capture group.<br/>
- /// ○ Match a Unicode digit greedily at least once.<br/>
+ /// ○ Match a Unicode digit atomically at least once.<br/>
/// ○ Zero-width positive lookahead.<br/>
/// ○ Match ','.<br/>
/// </code>
int pos = base.runtextpos;
int matchStart = pos;
int capture_starting_pos = 0;
- int charloop_capture_pos = 0;
- int charloop_starting_pos = 0, charloop_ending_pos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
// Zero-width positive lookbehind.
}
// 1st capture group.
- //{
+ {
capture_starting_pos = pos;
- // Match a Unicode digit greedily at least once.
- //{
- charloop_starting_pos = pos;
-
+ // Match a Unicode digit atomically at least once.
+ {
int iteration = 0;
while ((uint)iteration < (uint)slice.Length && char.IsDigit(slice[iteration]))
{
slice = slice.Slice(iteration);
pos += iteration;
-
- charloop_ending_pos = pos;
- charloop_starting_pos++;
- goto CharLoopEnd;
-
- CharLoopBacktrack:
- UncaptureUntil(charloop_capture_pos);
-
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- if (charloop_starting_pos >= charloop_ending_pos ||
- (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, charloop_ending_pos - charloop_starting_pos).LastIndexOf(',')) < 0)
- {
- UncaptureUntil(0);
- return false; // The input didn't match.
- }
- charloop_ending_pos += charloop_starting_pos;
- pos = charloop_ending_pos;
- slice = inputSpan.Slice(pos);
-
- CharLoopEnd:
- charloop_capture_pos = base.Crawlpos();
- //}
+ }
base.Capture(1, capture_starting_pos, pos);
-
- goto CaptureSkipBacktrack;
-
- CaptureBacktrack:
- goto CharLoopBacktrack;
-
- CaptureSkipBacktrack:;
- //}
+ }
// Zero-width positive lookahead.
{
// Match ','.
if (slice.IsEmpty || slice[0] != ',')
{
- goto CaptureBacktrack;
+ UncaptureUntil(0);
+ return false; // The input didn't match.
}
pos = positivelookahead_starting_pos;
"[^\\.][\\w]+(?=,)" (364 uses)
[GeneratedRegex("[^\\.][\\w]+(?=,)")]
/// Explanation:<br/>
/// <code>
/// ○ Match any character other than '.'.<br/>
- /// ○ Match a word character greedily at least once.<br/>
+ /// ○ Match a word character atomically at least once.<br/>
/// ○ Zero-width positive lookahead.<br/>
/// ○ Match ','.<br/>
/// </code>
{
int pos = base.runtextpos;
int matchStart = pos;
- int charloop_starting_pos = 0, charloop_ending_pos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
// Match any character other than '.'.
return false; // The input didn't match.
}
- // Match a word character greedily at least once.
- //{
+ // Match a word character atomically at least once.
+ {
pos++;
slice = inputSpan.Slice(pos);
- charloop_starting_pos = pos;
-
int iteration = 0;
while ((uint)iteration < (uint)slice.Length && Utilities.IsWordChar(slice[iteration]))
{
slice = slice.Slice(iteration);
pos += iteration;
-
- charloop_ending_pos = pos;
- charloop_starting_pos++;
- goto CharLoopEnd;
-
- CharLoopBacktrack:
-
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- if (charloop_starting_pos >= charloop_ending_pos ||
- (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, charloop_ending_pos - charloop_starting_pos).LastIndexOf(',')) < 0)
- {
- return false; // The input didn't match.
- }
- charloop_ending_pos += charloop_starting_pos;
- pos = charloop_ending_pos;
- slice = inputSpan.Slice(pos);
-
- CharLoopEnd:
- //}
+ }
// Zero-width positive lookahead.
{
// Match ','.
if (slice.IsEmpty || slice[0] != ',')
{
- goto CharLoopBacktrack;
+ return false; // The input didn't match.
}
pos = positivelookahead_starting_pos;
"^\\*{2,}(?=[^/*])" (327 uses)
[GeneratedRegex("^\\*{2,}(?=[^/*])")]
/// Explanation:<br/>
/// <code>
/// ○ Match if at the beginning of the string.<br/>
- /// ○ Match '*' greedily at least twice.<br/>
+ /// ○ Match '*' atomically at least twice.<br/>
/// ○ Zero-width positive lookahead.<br/>
/// ○ Match a character in the set [^*/].<br/>
/// </code>
int pos = base.runtextpos;
int matchStart = pos;
char ch;
- int charloop_starting_pos = 0, charloop_ending_pos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
// Match if at the beginning of the string.
return false; // The input didn't match.
}
- // Match '*' greedily at least twice.
- //{
- charloop_starting_pos = pos;
-
+ // Match '*' atomically at least twice.
+ {
int iteration = slice.IndexOfAnyExcept('*');
if (iteration < 0)
{
slice = slice.Slice(iteration);
pos += iteration;
-
- charloop_ending_pos = pos;
- charloop_starting_pos += 2;
- goto CharLoopEnd;
-
- CharLoopBacktrack:
-
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- if (charloop_starting_pos >= charloop_ending_pos ||
- (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, charloop_ending_pos - charloop_starting_pos).LastIndexOfAnyExcept('*', '/')) < 0)
- {
- return false; // The input didn't match.
- }
- charloop_ending_pos += charloop_starting_pos;
- pos = charloop_ending_pos;
- slice = inputSpan.Slice(pos);
-
- CharLoopEnd:
- //}
+ }
// Zero-width positive lookahead.
{
// Match a character in the set [^*/].
if (slice.IsEmpty || (((ch = slice[0]) == '*') | (ch == '/')))
{
- goto CharLoopBacktrack;
+ return false; // The input didn't match.
}
pos = positivelookahead_starting_pos;
"(?<=\\w)/\\*([^*]|(\\*+[^*/]))*\\*+/\\s*(?=:)" (284 uses)
[GeneratedRegex("(?<=\\w)/\\*([^*]|(\\*+[^*/]))*\\*+/\\s*(?=:)", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
/// ○ Match a character in the set [^*/].<br/>
/// ○ Match '*' atomically at least once.<br/>
/// ○ Match '/'.<br/>
- /// ○ Match a whitespace character greedily any number of times.<br/>
+ /// ○ Match a whitespace character atomically any number of times.<br/>
/// ○ Zero-width positive lookahead.<br/>
/// ○ Match ':'.<br/>
/// </code>
int pos = base.runtextpos;
int matchStart = pos;
char ch;
- int charloop_capture_pos = 0;
- int charloop_starting_pos = 0, charloop_ending_pos = 0;
int loop_iteration = 0;
int stackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
goto LoopBacktrack;
}
- // Match a whitespace character greedily any number of times.
- //{
- pos++;
- slice = inputSpan.Slice(pos);
- charloop_starting_pos = pos;
-
- int iteration2 = 0;
+ // Match a whitespace character atomically any number of times.
+ {
+ int iteration2 = 1;
while ((uint)iteration2 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration2]))
{
iteration2++;
slice = slice.Slice(iteration2);
pos += iteration2;
-
- charloop_ending_pos = pos;
- goto CharLoopEnd;
-
- CharLoopBacktrack:
- UncaptureUntil(charloop_capture_pos);
-
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- if (charloop_starting_pos >= charloop_ending_pos ||
- (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, charloop_ending_pos - charloop_starting_pos).LastIndexOf(':')) < 0)
- {
- goto LoopBacktrack;
- }
- charloop_ending_pos += charloop_starting_pos;
- pos = charloop_ending_pos;
- slice = inputSpan.Slice(pos);
-
- CharLoopEnd:
- charloop_capture_pos = base.Crawlpos();
- //}
+ }
// Zero-width positive lookahead.
{
// Match ':'.
if (slice.IsEmpty || slice[0] != ':')
{
- goto CharLoopBacktrack;
+ goto LoopBacktrack;
}
pos = positivelookahead_starting_pos;
"(?<=\\w\\s+)/\\*([^*]|(\\*+[^*/]))*\\*+/\\s* ..." (284 uses)
[GeneratedRegex("(?<=\\w\\s+)/\\*([^*]|(\\*+[^*/]))*\\*+/\\s*(?=:)", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
/// ○ Match a character in the set [^*/].<br/>
/// ○ Match '*' atomically at least once.<br/>
/// ○ Match '/'.<br/>
- /// ○ Match a whitespace character greedily any number of times.<br/>
+ /// ○ Match a whitespace character atomically any number of times.<br/>
/// ○ Zero-width positive lookahead.<br/>
/// ○ Match ':'.<br/>
/// </code>
int matchStart = pos;
char ch;
int charloop_capture_pos = 0;
- int charloop_capture_pos1 = 0;
int charloop_starting_pos = 0, charloop_ending_pos = 0;
- int charloop_starting_pos1 = 0, charloop_ending_pos1 = 0;
int loop_iteration = 0;
int stackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
goto LoopBacktrack;
}
- // Match a whitespace character greedily any number of times.
- //{
- pos++;
- slice = inputSpan.Slice(pos);
- charloop_starting_pos1 = pos;
-
- int iteration3 = 0;
+ // Match a whitespace character atomically any number of times.
+ {
+ int iteration3 = 1;
while ((uint)iteration3 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration3]))
{
iteration3++;
slice = slice.Slice(iteration3);
pos += iteration3;
-
- charloop_ending_pos1 = pos;
- goto CharLoopEnd1;
-
- CharLoopBacktrack1:
- UncaptureUntil(charloop_capture_pos1);
-
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- if (charloop_starting_pos1 >= charloop_ending_pos1 ||
- (charloop_ending_pos1 = inputSpan.Slice(charloop_starting_pos1, charloop_ending_pos1 - charloop_starting_pos1).LastIndexOf(':')) < 0)
- {
- goto LoopBacktrack;
- }
- charloop_ending_pos1 += charloop_starting_pos1;
- pos = charloop_ending_pos1;
- slice = inputSpan.Slice(pos);
-
- CharLoopEnd1:
- charloop_capture_pos1 = base.Crawlpos();
- //}
+ }
// Zero-width positive lookahead.
{
// Match ':'.
if (slice.IsEmpty || slice[0] != ':')
{
- goto CharLoopBacktrack1;
+ goto LoopBacktrack;
}
pos = positivelookahead_starting_pos;
"(?<=\\W)(?<one>\\$?[a-zA-Z]{1,3}\\$?\\d{1,7} ..." (214 uses)
[GeneratedRegex("(?<=\\W)(?<one>\\$?[a-zA-Z]{1,3}\\$?\\d{1,7})(?=\\W)|(?<=\\W)(?<two>\\$?\\d{1,7}:\\$?\\d{1,7})(?=\\W)|(?<=\\W)(?<three>\\$?[a-zA-Z]{1,3}:\\$?[a-zA-Z]{1,3})(?=\\W)")]
/// ○ Match a character in the set [A-Za-z] atomically at least 1 and at most 3 times.<br/>
/// ○ Match ':'.<br/>
/// ○ Match '$' atomically, optionally.<br/>
- /// ○ Match a character in the set [A-Za-z] greedily at least 1 and at most 3 times.<br/>
+ /// ○ Match a character in the set [A-Za-z] atomically at least 1 and at most 3 times.<br/>
/// ○ Zero-width positive lookahead.<br/>
/// ○ Match any character other than a word character.<br/>
/// </code>
int capture_starting_pos2 = 0;
int charloop_capture_pos = 0;
int charloop_capture_pos1 = 0;
- int charloop_capture_pos2 = 0;
int charloop_starting_pos = 0, charloop_ending_pos = 0;
int charloop_starting_pos1 = 0, charloop_ending_pos1 = 0;
- int charloop_starting_pos2 = 0, charloop_ending_pos2 = 0;
int stackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
}
// "three" capture group.
- //{
+ {
capture_starting_pos2 = pos;
// Match '$' atomically, optionally.
}
}
- // Match a character in the set [A-Za-z] greedily at least 1 and at most 3 times.
- //{
+ // Match a character in the set [A-Za-z] atomically at least 1 and at most 3 times.
+ {
pos++;
slice = inputSpan.Slice(pos);
- charloop_starting_pos2 = pos;
-
int iteration5 = 0;
while (iteration5 < 3 && (uint)iteration5 < (uint)slice.Length && char.IsAsciiLetter(slice[iteration5]))
{
slice = slice.Slice(iteration5);
pos += iteration5;
-
- charloop_ending_pos2 = pos;
- charloop_starting_pos2++;
- goto CharLoopEnd2;
-
- CharLoopBacktrack2:
- UncaptureUntil(charloop_capture_pos2);
-
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- if (charloop_starting_pos2 >= charloop_ending_pos2)
- {
- UncaptureUntil(0);
- return false; // The input didn't match.
- }
- pos = --charloop_ending_pos2;
- slice = inputSpan.Slice(pos);
-
- CharLoopEnd2:
- charloop_capture_pos2 = base.Crawlpos();
- //}
+ }
base.Capture(3, capture_starting_pos2, pos);
-
- goto CaptureSkipBacktrack2;
-
- CaptureBacktrack2:
- goto CharLoopBacktrack2;
-
- CaptureSkipBacktrack2:;
- //}
+ }
// Zero-width positive lookahead.
{
// Match any character other than a word character.
if (slice.IsEmpty || Utilities.IsWordChar(slice[0]))
{
- goto CaptureBacktrack2;
+ UncaptureUntil(0);
+ return false; // The input didn't match.
}
pos = positivelookahead_starting_pos2;
"(?<=\\W)(\\$?[a-zA-Z]{1,3}\\$?\\d{1,7})(?=\\ ..." (213 uses)
[GeneratedRegex("(?<=\\W)(\\$?[a-zA-Z]{1,3}\\$?\\d{1,7})(?=\\W)|(?<=\\W)(\\$?\\d{1,7}:\\$?\\d{1,7})(?=\\W)|(?<=\\W)(\\$?[a-zA-Z]{1,3}:\\$?[a-zA-Z]{1,3})(?=\\W)")]
/// ○ Match a character in the set [A-Za-z] atomically at least 1 and at most 3 times.<br/>
/// ○ Match ':'.<br/>
/// ○ Match '$' atomically, optionally.<br/>
- /// ○ Match a character in the set [A-Za-z] greedily at least 1 and at most 3 times.<br/>
+ /// ○ Match a character in the set [A-Za-z] atomically at least 1 and at most 3 times.<br/>
/// ○ Zero-width positive lookahead.<br/>
/// ○ Match any character other than a word character.<br/>
/// </code>
int capture_starting_pos2 = 0;
int charloop_capture_pos = 0;
int charloop_capture_pos1 = 0;
- int charloop_capture_pos2 = 0;
int charloop_starting_pos = 0, charloop_ending_pos = 0;
int charloop_starting_pos1 = 0, charloop_ending_pos1 = 0;
- int charloop_starting_pos2 = 0, charloop_ending_pos2 = 0;
int stackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
}
// 3rd capture group.
- //{
+ {
capture_starting_pos2 = pos;
// Match '$' atomically, optionally.
}
}
- // Match a character in the set [A-Za-z] greedily at least 1 and at most 3 times.
- //{
+ // Match a character in the set [A-Za-z] atomically at least 1 and at most 3 times.
+ {
pos++;
slice = inputSpan.Slice(pos);
- charloop_starting_pos2 = pos;
-
int iteration5 = 0;
while (iteration5 < 3 && (uint)iteration5 < (uint)slice.Length && char.IsAsciiLetter(slice[iteration5]))
{
slice = slice.Slice(iteration5);
pos += iteration5;
-
- charloop_ending_pos2 = pos;
- charloop_starting_pos2++;
- goto CharLoopEnd2;
-
- CharLoopBacktrack2:
- UncaptureUntil(charloop_capture_pos2);
-
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- if (charloop_starting_pos2 >= charloop_ending_pos2)
- {
- UncaptureUntil(0);
- return false; // The input didn't match.
- }
- pos = --charloop_ending_pos2;
- slice = inputSpan.Slice(pos);
-
- CharLoopEnd2:
- charloop_capture_pos2 = base.Crawlpos();
- //}
+ }
base.Capture(3, capture_starting_pos2, pos);
-
- goto CaptureSkipBacktrack2;
-
- CaptureBacktrack2:
- goto CharLoopBacktrack2;
-
- CaptureSkipBacktrack2:;
- //}
+ }
// Zero-width positive lookahead.
{
// Match any character other than a word character.
if (slice.IsEmpty || Utilities.IsWordChar(slice[0]))
{
- goto CaptureBacktrack2;
+ UncaptureUntil(0);
+ return false; // The input didn't match.
}
pos = positivelookahead_starting_pos2;
For more diff examples, see https://gist.github.com/MihuBot/493a118db9f5c5624b0b894e757f271e
JIT assembly changes
For a list of JIT diff regressions, see Regressions.md
Sample source code for further analysis
const string JsonPath = "RegexResults-1318.json";
if (!File.Exists(JsonPath))
{
await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2xaEQLA");
using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}
using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");
record KnownPattern(string Pattern, RegexOptions Options, int Count);
sealed class RegexEntry
{
public required KnownPattern Regex { get; set; }
public required string MainSource { get; set; }
public required string PrSource { get; set; }
public string? FullDiff { get; set; }
public string? ShortDiff { get; set; }
public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}
Test |
src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs
Outdated
Show resolved
Hide resolved
…egexReductionTests.cs
As part of our auto-atomicity handling, today we give up when the subsequent node is a lookaround. This improves it to support the case when the subsequent node is a positive lookahead.