Skip to content

[RegexDiff X64] [stephentoub] Remove positive lookarounds that wrap only zer ... #1301

@MihuBot

Description

@MihuBot

Job completed in 17 minutes 48 seconds (remote runner delay: 54 seconds).
dotnet/runtime#118091
Using arguments: regexdiff

173 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"(((?<=\\W|^)-\\s*)|(?<=\\b))\\d+\\s*(B|b|m|t ..." (540 uses)
[GeneratedRegex("(((?<=\\W|^)-\\s*)|(?<=\\b))\\d+\\s*(B|b|m|t|g)(?=\\b)", RegexOptions.Singleline)]
  /// ○ Match a whitespace character atomically any number of times.<br/>
  /// ○ 3rd capture group.<br/>
  ///     ○ Match a character in the set [Bbgmt].<br/>
-   /// ○ Zero-width positive lookahead.<br/>
-   ///     ○ Match if at a word boundary.<br/>
+   /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      base.Capture(3, capture_starting_pos2, pos);
                  }
                  
-                   // Zero-width positive lookahead.
+                   // Match if at a word boundary.
+                   if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       slice = inputSpan.Slice(pos);
-                       int positivelookahead_starting_pos = pos;
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       // Match if at a word boundary.
-                       if (!Utilities.IsBoundary(inputSpan, pos))
-                       {
-                           goto CaptureBacktrack;
-                       }
-                       
-                       pos = positivelookahead_starting_pos;
-                       slice = inputSpan.Slice(pos);
+                       goto CaptureBacktrack;
                  }
                  
                  // The input matched.
"(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+,)))( ..." (338 uses)
[GeneratedRegex("(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+,)))(\\d+(,\\d+)?)\\^([+-]*[1-9]\\d*)(?=\\b)", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
  ///     ○ Match a character in the set [+\-] atomically any number of times.<br/>
  ///     ○ Match a character in the set [1-9].<br/>
  ///     ○ Match a Unicode digit greedily any number of times.<br/>
-   /// ○ Zero-width positive lookahead.<br/>
-   ///     ○ Match if at a word boundary.<br/>
+   /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      CaptureSkipBacktrack2:;
                  //}
                  
-                   // Zero-width positive lookahead.
+                   // Match if at a word boundary.
+                   if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       slice = inputSpan.Slice(pos);
-                       int positivelookahead_starting_pos = pos;
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       // Match if at a word boundary.
-                       if (!Utilities.IsBoundary(inputSpan, pos))
-                       {
-                           goto CaptureBacktrack2;
-                       }
-                       
-                       pos = positivelookahead_starting_pos;
-                       slice = inputSpan.Slice(pos);
+                       goto CaptureBacktrack2;
                  }
                  
                  // The input matched.
"(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+,)))( ..." (338 uses)
[GeneratedRegex("(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+,)))(\\d+(,\\d+)?)e([+-]*[1-9]\\d*)(?=\\b)", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
  ///     ○ Match a character in the set [+\-] atomically any number of times.<br/>
  ///     ○ Match a character in the set [1-9].<br/>
  ///     ○ Match a Unicode digit greedily any number of times.<br/>
-   /// ○ Zero-width positive lookahead.<br/>
-   ///     ○ Match if at a word boundary.<br/>
+   /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      CaptureSkipBacktrack2:;
                  //}
                  
-                   // Zero-width positive lookahead.
+                   // Match if at a word boundary.
+                   if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       slice = inputSpan.Slice(pos);
-                       int positivelookahead_starting_pos = pos;
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       // Match if at a word boundary.
-                       if (!Utilities.IsBoundary(inputSpan, pos))
-                       {
-                           goto CaptureBacktrack2;
-                       }
-                       
-                       pos = positivelookahead_starting_pos;
-                       slice = inputSpan.Slice(pos);
+                       goto CaptureBacktrack2;
                  }
                  
                  // The input matched.
"(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+\\,)) ..." (326 uses)
[GeneratedRegex("(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+\\,)))\\d+,\\d+\\s*(K|k|M|G|T)(?=\\b)", RegexOptions.Singleline)]
  /// ○ Match a whitespace character atomically any number of times.<br/>
  /// ○ 4th capture group.<br/>
  ///     ○ Match a character in the set [GKMTk].<br/>
-   /// ○ Zero-width positive lookahead.<br/>
-   ///     ○ Match if at a word boundary.<br/>
+   /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      base.Capture(4, capture_starting_pos3, pos);
                  }
                  
-                   // Zero-width positive lookahead.
+                   // Match if at a word boundary.
+                   if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       slice = inputSpan.Slice(pos);
-                       int positivelookahead_starting_pos = pos;
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       // Match if at a word boundary.
-                       if (!Utilities.IsBoundary(inputSpan, pos))
-                       {
-                           goto CaptureBacktrack;
-                       }
-                       
-                       pos = positivelookahead_starting_pos;
-                       slice = inputSpan.Slice(pos);
+                       goto CaptureBacktrack;
                  }
                  
                  // The input matched.
"(((?<=\\W|^)-\\s*)|(?<=\\b))\\d+\\s*(k|M|T|G ..." (325 uses)
[GeneratedRegex("(((?<=\\W|^)-\\s*)|(?<=\\b))\\d+\\s*(k|M|T|G)(?=\\b)", RegexOptions.Singleline)]
  /// ○ Match a whitespace character atomically any number of times.<br/>
  /// ○ 3rd capture group.<br/>
  ///     ○ Match a character in the set [GMTk].<br/>
-   /// ○ Zero-width positive lookahead.<br/>
-   ///     ○ Match if at a word boundary.<br/>
+   /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      base.Capture(3, capture_starting_pos2, pos);
                  }
                  
-                   // Zero-width positive lookahead.
+                   // Match if at a word boundary.
+                   if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       slice = inputSpan.Slice(pos);
-                       int positivelookahead_starting_pos = pos;
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       // Match if at a word boundary.
-                       if (!Utilities.IsBoundary(inputSpan, pos))
-                       {
-                           goto CaptureBacktrack;
-                       }
-                       
-                       pos = positivelookahead_starting_pos;
-                       slice = inputSpan.Slice(pos);
+                       goto CaptureBacktrack;
                  }
                  
                  // The input matched.
"(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+,)))( ..." (252 uses)
[GeneratedRegex("(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+,)))(\\d+(,\\d+)?)\\^([+-]*[1-9]\\d*)(?=\\b)", RegexOptions.ExplicitCapture | RegexOptions.Singleline)]
  /// ○ Match a character in the set [+\-] atomically any number of times.<br/>
  /// ○ Match a character in the set [1-9].<br/>
  /// ○ Match a Unicode digit greedily any number of times.<br/>
-   /// ○ Zero-width positive lookahead.<br/>
-   ///     ○ Match if at a word boundary.<br/>
+   /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      CharLoopEnd2:
                  //}
                  
-                   // Zero-width positive lookahead.
+                   // Match if at a word boundary.
+                   if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       slice = inputSpan.Slice(pos);
-                       int positivelookahead_starting_pos = pos;
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       // Match if at a word boundary.
-                       if (!Utilities.IsBoundary(inputSpan, pos))
-                       {
-                           goto CharLoopBacktrack2;
-                       }
-                       
-                       pos = positivelookahead_starting_pos;
-                       slice = inputSpan.Slice(pos);
+                       goto CharLoopBacktrack2;
                  }
                  
                  // The input matched.
"(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+,)))( ..." (252 uses)
[GeneratedRegex("(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+,)))(\\d+(,\\d+)?)e([+-]*[1-9]\\d*)(?=\\b)", RegexOptions.ExplicitCapture | RegexOptions.Singleline)]
  /// ○ Match a character in the set [+\-] atomically any number of times.<br/>
  /// ○ Match a character in the set [1-9].<br/>
  /// ○ Match a Unicode digit greedily any number of times.<br/>
-   /// ○ Zero-width positive lookahead.<br/>
-   ///     ○ Match if at a word boundary.<br/>
+   /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      CharLoopEnd2:
                  //}
                  
-                   // Zero-width positive lookahead.
+                   // Match if at a word boundary.
+                   if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       slice = inputSpan.Slice(pos);
-                       int positivelookahead_starting_pos = pos;
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       // Match if at a word boundary.
-                       if (!Utilities.IsBoundary(inputSpan, pos))
-                       {
-                           goto CharLoopBacktrack2;
-                       }
-                       
-                       pos = positivelookahead_starting_pos;
-                       slice = inputSpan.Slice(pos);
+                       goto CharLoopBacktrack2;
                  }
                  
                  // The input matched.
"(((?<!\\d+\\s*)-\\s*)|(?<=\\b))\\d+\\s*(K|k| ..." (215 uses)
[GeneratedRegex("(((?<!\\d+\\s*)-\\s*)|(?<=\\b))\\d+\\s*(K|k|M|T|G)(?=\\b)", RegexOptions.Singleline)]
  /// ○ Match a whitespace character atomically any number of times.<br/>
  /// ○ 3rd capture group.<br/>
  ///     ○ Match a character in the set [GKMTk].<br/>
-   /// ○ Zero-width positive lookahead.<br/>
-   ///     ○ Match if at a word boundary.<br/>
+   /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      base.Capture(3, capture_starting_pos2, pos);
                  }
                  
-                   // Zero-width positive lookahead.
+                   // Match if at a word boundary.
+                   if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       slice = inputSpan.Slice(pos);
-                       int positivelookahead_starting_pos = pos;
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       // Match if at a word boundary.
-                       if (!Utilities.IsBoundary(inputSpan, pos))
-                       {
-                           goto CaptureBacktrack;
-                       }
-                       
-                       pos = positivelookahead_starting_pos;
-                       slice = inputSpan.Slice(pos);
+                       goto CaptureBacktrack;
                  }
                  
                  // The input matched.
"(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+[\\., ..." (200 uses)
[GeneratedRegex("(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+[\\.,])))(\\d+([\\.,]\\d+)?)\\^([+-]*[1-9]\\d*)(?=\\b)", RegexOptions.ExplicitCapture | RegexOptions.Singleline)]
  /// ○ Match a character in the set [+\-] atomically any number of times.<br/>
  /// ○ Match a character in the set [1-9].<br/>
  /// ○ Match a Unicode digit greedily any number of times.<br/>
-   /// ○ Zero-width positive lookahead.<br/>
-   ///     ○ Match if at a word boundary.<br/>
+   /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      CharLoopEnd2:
                  //}
                  
-                   // Zero-width positive lookahead.
+                   // Match if at a word boundary.
+                   if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       slice = inputSpan.Slice(pos);
-                       int positivelookahead_starting_pos = pos;
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       // Match if at a word boundary.
-                       if (!Utilities.IsBoundary(inputSpan, pos))
-                       {
-                           goto CharLoopBacktrack2;
-                       }
-                       
-                       pos = positivelookahead_starting_pos;
-                       slice = inputSpan.Slice(pos);
+                       goto CharLoopBacktrack2;
                  }
                  
                  // The input matched.
"(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+[\\., ..." (200 uses)
[GeneratedRegex("(((?<!\\d+\\s*)-\\s*)|((?<=\\b)(?<!\\d+[\\.,])))(\\d+([\\.,]\\d+)?)e([+-]*[1-9]\\d*)(?=\\b)", RegexOptions.ExplicitCapture | RegexOptions.Singleline)]
  /// ○ Match a character in the set [+\-] atomically any number of times.<br/>
  /// ○ Match a character in the set [1-9].<br/>
  /// ○ Match a Unicode digit greedily any number of times.<br/>
-   /// ○ Zero-width positive lookahead.<br/>
-   ///     ○ Match if at a word boundary.<br/>
+   /// ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      CharLoopEnd2:
                  //}
                  
-                   // Zero-width positive lookahead.
+                   // Match if at a word boundary.
+                   if (!Utilities.IsBoundary(inputSpan, pos))
                  {
-                       slice = inputSpan.Slice(pos);
-                       int positivelookahead_starting_pos = pos;
-                       
-                       if (Utilities.s_hasTimeout)
-                       {
-                           base.CheckTimeout();
-                       }
-                       
-                       // Match if at a word boundary.
-                       if (!Utilities.IsBoundary(inputSpan, pos))
-                       {
-                           goto CharLoopBacktrack2;
-                       }
-                       
-                       pos = positivelookahead_starting_pos;
-                       slice = inputSpan.Slice(pos);
+                       goto CharLoopBacktrack2;
                  }
                  
                  // The input matched.

For more diff examples, see https://gist.github.com/MihuBot/262e88321889757cdd1ba8ed32fc4fd4

JIT assembly changes
Total bytes of base: 54186495
Total bytes of diff: 54183740
Total bytes of delta: -2755 (-0.01 % of base)
Total relative delta: -0.28
    diff is an improvement.
    relative diff is an improvement.

For a list of JIT diff regressions, see Regressions.md
For a list of JIT diff improvements, see Improvements.md

Sample source code for further analysis
// Local cache for the results JSON: the archive is only downloaded when this file does not exist yet.
const string JsonPath = "RegexResults-1301.json";
if (!File.Exists(JsonPath))
{
    // Own the HttpClient with a using declaration so its handler and connections are released
    // as soon as the archive has been extracted, like the stream and archive below.
    using var httpClient = new HttpClient();
    await using var archiveStream = await httpClient.GetStreamAsync("https://mihubot.xyz/r/E2myvU9A");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);

    // First(...) throws InvalidOperationException if the archive has no "Results.json" entry.
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

// IncludeFields = true lets System.Text.Json populate public fields as well as properties;
// RegexEntry carries value-tuple arrays, whose elements are fields.
using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");



/// <summary>
/// Identifies one regex in the results set by its pattern text and options.
/// </summary>
/// <param name="Pattern">The regular expression pattern string.</param>
/// <param name="Options">The <see cref="RegexOptions"/> paired with the pattern.</param>
/// <param name="Count">An integer count for this pattern/options pair (presumably how many times it is used; confirm against the data).</param>
record KnownPattern(string Pattern, RegexOptions Options, int Count);

/// <summary>
/// One element of the deserialized results array: a known regex together with
/// source text and optional diff text for it.
/// </summary>
sealed class RegexEntry
{
    /// <summary>The pattern, options, and count identifying this entry. Must be set on construction.</summary>
    public required KnownPattern Regex { get; set; }

    /// <summary>Source text labelled "main" (presumably the generator output from the main branch; confirm). Must be set on construction.</summary>
    public required string MainSource { get; set; }

    /// <summary>Source text labelled "PR" (presumably the generator output with the pull request applied; confirm). Must be set on construction.</summary>
    public required string PrSource { get; set; }

    /// <summary>Full diff text for this entry; may be <see langword="null"/>.</summary>
    public string? FullDiff { get; set; }

    /// <summary>Shortened diff text for this entry; may be <see langword="null"/>.</summary>
    public string? ShortDiff { get; set; }

    /// <summary>Name/values pairs of character search sets; may be <see langword="null"/>. Tuple elements are fields, so JSON deserialization needs field support enabled.</summary>
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }

    /// <summary>String search sets, each with its <see cref="StringComparison"/>; may be <see langword="null"/>. Tuple elements are fields, so JSON deserialization needs field support enabled.</summary>
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions