Skip to content

Transform regex X| into X? #118087

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

stephentoub
Copy link
Member

An alternation with two branches where the second is empty is the same as the first branch just being an optional loop. The latter way of it being expressed is better optimized elsewhere in the regex transforms, e.g. with loop coalescing, so we're better off with the optional representation.

An alternation with two branches where the second is empty is the same as the first branch just being an optional loop. The latter way of it being expressed is better optimized elsewhere in the regex transforms, e.g. with loop coalescing, so we're better off with the optional representation.
@stephentoub
Copy link
Member Author

@MihuBot regexdiff

Copy link
Contributor

Tagging subscribers to this area: @dotnet/area-system-text-regularexpressions
See info in area-owners.md if you want to be subscribed.

@MihuBot
Copy link

MihuBot commented Jul 26, 2025

471 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"\\r\\n|\\r" (823 uses)
[GeneratedRegex("\\r\\n|\\r")]
  /// Explanation:<br/>
  /// <code>
  /// ○ Match '\r'.<br/>
-   /// ○ Match with 2 alternative expressions, atomically.<br/>
-   ///     ○ Match '\n'.<br/>
-   ///     ○ Match an empty string.<br/>
+   /// ○ Match '\n' atomically, optionally.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      return false; // The input didn't match.
                  }
                  
-                   // Match with 2 alternative expressions, atomically.
+                   // Match '\n' atomically, optionally.
                  {
-                       int alternation_starting_pos = pos;
-                       
-                       // Branch 0
+                       if ((uint)slice.Length > (uint)1 && slice[1] == '\n')
                      {
-                           // Match '\n'.
-                           if ((uint)slice.Length < 2 || slice[1] != '\n')
-                           {
-                               goto AlternationBranch;
-                           }
-                           
-                           pos += 2;
-                           slice = inputSpan.Slice(pos);
-                           goto AlternationMatch;
-                           
-                           AlternationBranch:
-                           pos = alternation_starting_pos;
-                           slice = inputSpan.Slice(pos);
-                       }
-                       
-                       // Branch 1
-                       {
-                           
+                           slice = slice.Slice(1);
                          pos++;
-                           slice = inputSpan.Slice(pos);
                      }
-                       
-                       AlternationMatch:;
                  }
                  
                  // The input matched.
+                   pos++;
                  base.runtextpos = pos;
                  base.Capture(0, matchStart, pos);
                  return true;
"(\\b\\d+([\\.,]\\d*)?|\\b)(?<unit>or[ae]|hrs ..." (330 uses)
[GeneratedRegex("(\\b\\d+([\\.,]\\d*)?|\\b)(?<unit>or[ae]|hrs|hr|h|minut[oi]|mins|min|second[oi]|secs|sec)\\b", RegexOptions.ExplicitCapture | RegexOptions.Singleline)]
  ///             ○ Match a character in the set [ae].<br/>
  ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match 'h'.<br/>
-   ///             ○ Match with 2 alternative expressions.<br/>
-   ///                 ○ Match a sequence of expressions.<br/>
-   ///                     ○ Match 'r'.<br/>
-   ///                     ○ Match with 2 alternative expressions.<br/>
-   ///                         ○ Match 's'.<br/>
-   ///                         ○ Match an empty string.<br/>
-   ///                 ○ Match an empty string.<br/>
+   ///             ○ Optional (greedy).<br/>
+   ///                 ○ Match 'r'.<br/>
+   ///                 ○ Match 's' greedily, optionally.<br/>
  ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match the string "min".<br/>
  ///             ○ Match with 3 alternative expressions.<br/>
                  int alternation_branch1 = 0;
                  int alternation_branch2 = 0;
                  int alternation_branch3 = 0;
-                   int alternation_branch4 = 0;
-                   int alternation_branch5 = 0;
                  int alternation_starting_capturepos = 0;
                  int alternation_starting_capturepos1 = 0;
                  int alternation_starting_capturepos2 = 0;
                  int alternation_starting_capturepos3 = 0;
-                   int alternation_starting_capturepos4 = 0;
-                   int alternation_starting_capturepos5 = 0;
                  int alternation_starting_pos = 0;
                  int alternation_starting_pos1 = 0;
                  int alternation_starting_pos2 = 0;
                  int alternation_starting_pos3 = 0;
-                   int alternation_starting_pos4 = 0;
-                   int alternation_starting_pos5 = 0;
                  int capture_starting_pos = 0;
                  int charloop_capture_pos = 0;
                  int charloop_starting_pos = 0, charloop_ending_pos = 0;
+                   int charloop_starting_pos1 = 0, charloop_ending_pos1 = 0;
                  int loop_iteration = 0;
+                   int loop_iteration1 = 0;
                  int stackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                                  goto AlternationBranch2;
                              }
                              
-                               // Match with 2 alternative expressions.
+                               // Optional (greedy).
                              //{
-                                   alternation_starting_pos2 = pos;
-                                   alternation_starting_capturepos2 = base.Crawlpos();
+                                   pos++;
+                                   slice = inputSpan.Slice(pos);
+                                   loop_iteration1 = 0;
                                  
-                                   // Branch 0
-                                   //{
-                                       // Match 'r'.
-                                       if ((uint)slice.Length < 2 || slice[1] != 'r')
-                                       {
-                                           goto AlternationBranch3;
-                                       }
-                                       
-                                       // Match with 2 alternative expressions.
-                                       //{
-                                           alternation_starting_pos3 = pos;
-                                           alternation_starting_capturepos3 = base.Crawlpos();
-                                           
-                                           // Branch 0
-                                           //{
-                                               // Match 's'.
-                                               if ((uint)slice.Length < 3 || slice[2] != 's')
-                                               {
-                                                   goto AlternationBranch4;
-                                               }
-                                               
-                                               alternation_branch3 = 0;
-                                               pos += 3;
-                                               slice = inputSpan.Slice(pos);
-                                               goto AlternationMatch3;
-                                               
-                                               AlternationBranch4:
-                                               pos = alternation_starting_pos3;
-                                               slice = inputSpan.Slice(pos);
-                                               UncaptureUntil(alternation_starting_capturepos3);
-                                           //}
-                                           
-                                           // Branch 1
-                                           //{
-                                               
-                                               alternation_branch3 = 1;
-                                               pos += 2;
-                                               slice = inputSpan.Slice(pos);
-                                               goto AlternationMatch3;
-                                           //}
-                                           
-                                           AlternationBacktrack3:
-                                           if (Utilities.s_hasTimeout)
-                                           {
-                                               base.CheckTimeout();
-                                           }
-                                           
-                                           switch (alternation_branch3)
-                                           {
-                                               case 0:
-                                                   goto AlternationBranch4;
-                                               case 1:
-                                                   goto AlternationBranch3;
-                                           }
-                                           
-                                           AlternationMatch3:;
-                                       //}
-                                       
-                                       alternation_branch2 = 0;
-                                       goto AlternationMatch2;
-                                       
-                                       AlternationBranch3:
-                                       pos = alternation_starting_pos2;
-                                       slice = inputSpan.Slice(pos);
-                                       UncaptureUntil(alternation_starting_capturepos2);
-                                   //}
+                                   LoopBody1:
+                                   Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), pos);
                                  
-                                   // Branch 1
+                                   loop_iteration1++;
+                                   
+                                   // Match 'r'.
+                                   if (slice.IsEmpty || slice[0] != 'r')
+                                   {
+                                       goto LoopIterationNoMatch1;
+                                   }
+                                   
+                                   // Match 's' greedily, optionally.
                                  //{
-                                       
-                                       alternation_branch2 = 1;
                                      pos++;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch2;
+                                       charloop_starting_pos1 = pos;
+                                       
+                                       if (!slice.IsEmpty && slice[0] == 's')
+                                       {
+                                           slice = slice.Slice(1);
+                                           pos++;
+                                       }
+                                       
+                                       charloop_ending_pos1 = pos;
+                                       goto CharLoopEnd1;
+                                       
+                                       CharLoopBacktrack1:
+                                       UncaptureUntil(base.runstack![--stackpos]);
+                                       Utilities.StackPop(base.runstack!, ref stackpos, out charloop_ending_pos1, out charloop_starting_pos1);
+                                       
+                                       if (Utilities.s_hasTimeout)
+                                       {
+                                           base.CheckTimeout();
+                                       }
+                                       
+                                       if (charloop_starting_pos1 >= charloop_ending_pos1)
+                                       {
+                                           goto LoopIterationNoMatch1;
+                                       }
+                                       pos = --charloop_ending_pos1;
+                                       slice = inputSpan.Slice(pos);
+                                       
+                                       CharLoopEnd1:
+                                       Utilities.StackPush(ref base.runstack!, ref stackpos, charloop_starting_pos1, charloop_ending_pos1, base.Crawlpos());
                                  //}
                                  
-                                   AlternationBacktrack2:
+                                   
+                                   // The loop has an upper bound of 1. Continue iterating greedily if it hasn't yet been reached.
+                                   if (loop_iteration1 == 0)
+                                   {
+                                       goto LoopBody1;
+                                   }
+                                   goto LoopEnd1;
+                                   
+                                   // The loop iteration failed. Put state back to the way it was before the iteration.
+                                   LoopIterationNoMatch1:
+                                   if (--loop_iteration1 < 0)
+                                   {
+                                       // Unable to match the remainder of the expression after exhausting the loop.
+                                       goto AlternationBranch2;
+                                   }
+                                   pos = base.runstack![--stackpos];
+                                   UncaptureUntil(base.runstack![--stackpos]);
+                                   slice = inputSpan.Slice(pos);
+                                   goto LoopEnd1;
+                                   
+                                   LoopBacktrack:
                                  if (Utilities.s_hasTimeout)
                                  {
                                      base.CheckTimeout();
                                  }
                                  
-                                   switch (alternation_branch2)
+                                   if (loop_iteration1 == 0)
                                  {
-                                       case 0:
-                                           goto AlternationBacktrack3;
-                                       case 1:
-                                           goto AlternationBranch2;
+                                       // No iterations of the loop remain to backtrack into. Fail the loop.
+                                       goto AlternationBranch2;
                                  }
-                                   
-                                   AlternationMatch2:;
+                                   goto CharLoopBacktrack1;
+                                   LoopEnd1:;
                              //}
                              
                              alternation_branch1 = 1;
                              // Match the string "min".
                              if (!slice.StartsWith("min"))
                              {
-                                   goto AlternationBranch5;
+                                   goto AlternationBranch3;
                              }
                              
                              // Match with 3 alternative expressions.
                              //{
-                                   alternation_starting_pos4 = pos;
-                                   alternation_starting_capturepos4 = base.Crawlpos();
+                                   alternation_starting_pos2 = pos;
+                                   alternation_starting_capturepos2 = base.Crawlpos();
                                  
                                  // Branch 0
                                  //{
                                          !slice.Slice(3).StartsWith("ut") || // Match the string "ut".
                                          (((ch = slice[5]) != 'i') & (ch != 'o'))) // Match a character in the set [io].
                                      {
-                                           goto AlternationBranch6;
+                                           goto AlternationBranch4;
                                      }
                                      
-                                       alternation_branch4 = 0;
+                                       alternation_branch2 = 0;
                                      pos += 6;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch4;
+                                       goto AlternationMatch2;
                                      
-                                       AlternationBranch6:
-                                       pos = alternation_starting_pos4;
+                                       AlternationBranch4:
+                                       pos = alternation_starting_pos2;
                                      slice = inputSpan.Slice(pos);
-                                       UncaptureUntil(alternation_starting_capturepos4);
+                                       UncaptureUntil(alternation_starting_capturepos2);
                                  //}
                                  
                                  // Branch 1
                                      // Match 's'.
                                      if ((uint)slice.Length < 4 || slice[3] != 's')
                                      {
-                                           goto AlternationBranch7;
+                                           goto AlternationBranch5;
                                      }
                                      
-                                       alternation_branch4 = 1;
+                                       alternation_branch2 = 1;
                                      pos += 4;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch4;
+                                       goto AlternationMatch2;
                                      
-                                       AlternationBranch7:
-                                       pos = alternation_starting_pos4;
+                                       AlternationBranch5:
+                                       pos = alternation_starting_pos2;
                                      slice = inputSpan.Slice(pos);
-                                       UncaptureUntil(alternation_starting_capturepos4);
+                                       UncaptureUntil(alternation_starting_capturepos2);
                                  //}
                                  
                                  // Branch 2
                                  //{
                                      
-                                       alternation_branch4 = 2;
+                                       alternation_branch2 = 2;
                                      pos += 3;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch4;
+                                       goto AlternationMatch2;
                                  //}
                                  
-                                   AlternationBacktrack4:
+                                   AlternationBacktrack2:
                                  if (Utilities.s_hasTimeout)
                                  {
                                      base.CheckTimeout();
                                  }
                                  
-                                   switch (alternation_branch4)
+                                   switch (alternation_branch2)
                                  {
                                      case 0:
-                                           goto AlternationBranch6;
+                                           goto AlternationBranch4;
                                      case 1:
-                                           goto AlternationBranch7;
-                                       case 2:
                                          goto AlternationBranch5;
+                                       case 2:
+                                           goto AlternationBranch3;
                                  }
                                  
-                                   AlternationMatch4:;
+                                   AlternationMatch2:;
                              //}
                              
                              alternation_branch1 = 2;
                              goto AlternationMatch1;
                              
-                               AlternationBranch5:
+                               AlternationBranch3:
                              pos = alternation_starting_pos1;
                              slice = inputSpan.Slice(pos);
                              UncaptureUntil(alternation_starting_capturepos1);
                              
                              // Match with 3 alternative expressions.
                              //{
-                                   alternation_starting_pos5 = pos;
-                                   alternation_starting_capturepos5 = base.Crawlpos();
+                                   alternation_starting_pos3 = pos;
+                                   alternation_starting_capturepos3 = base.Crawlpos();
                                  
                                  // Branch 0
                                  //{
                                          !slice.Slice(3).StartsWith("ond") || // Match the string "ond".
                                          (((ch = slice[6]) != 'i') & (ch != 'o'))) // Match a character in the set [io].
                                      {
-                                           goto AlternationBranch8;
+                                           goto AlternationBranch6;
                                      }
                                      
-                                       alternation_branch5 = 0;
+                                       alternation_branch3 = 0;
                                      pos += 7;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch5;
+                                       goto AlternationMatch3;
                                      
-                                       AlternationBranch8:
-                                       pos = alternation_starting_pos5;
+                                       AlternationBranch6:
+                                       pos = alternation_starting_pos3;
                                      slice = inputSpan.Slice(pos);
-                                       UncaptureUntil(alternation_starting_capturepos5);
+                                       UncaptureUntil(alternation_starting_capturepos3);
                                  //}
                                  
                                  // Branch 1
                                      // Match 's'.
                                      if ((uint)slice.Length < 4 || slice[3] != 's')
                                      {
-                                           goto AlternationBranch9;
+                                           goto AlternationBranch7;
                                      }
                                      
-                                       alternation_branch5 = 1;
+                                       alternation_branch3 = 1;
                                      pos += 4;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch5;
+                                       goto AlternationMatch3;
                                      
-                                       AlternationBranch9:
-                                       pos = alternation_starting_pos5;
+                                       AlternationBranch7:
+                                       pos = alternation_starting_pos3;
                                      slice = inputSpan.Slice(pos);
-                                       UncaptureUntil(alternation_starting_capturepos5);
+                                       UncaptureUntil(alternation_starting_capturepos3);
                                  //}
                                  
                                  // Branch 2
                                  //{
                                      
-                                       alternation_branch5 = 2;
+                                       alternation_branch3 = 2;
                                      pos += 3;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch5;
+                                       goto AlternationMatch3;
                                  //}
                                  
-                                   AlternationBacktrack5:
+                                   AlternationBacktrack3:
                                  if (Utilities.s_hasTimeout)
                                  {
                                      base.CheckTimeout();
                                  }
                                  
-                                   switch (alternation_branch5)
+                                   switch (alternation_branch3)
                                  {
                                      case 0:
-                                           goto AlternationBranch8;
+                                           goto AlternationBranch6;
                                      case 1:
-                                           goto AlternationBranch9;
+                                           goto AlternationBranch7;
                                      case 2:
                                          goto AlternationBacktrack;
                                  }
                                  
-                                   AlternationMatch5:;
+                                   AlternationMatch3:;
                              //}
                              
                              alternation_branch1 = 3;
                              case 0:
                                  goto AlternationBranch1;
                              case 1:
-                                   goto AlternationBacktrack2;
+                                   goto LoopBacktrack;
                              case 2:
-                                   goto AlternationBacktrack4;
+                                   goto AlternationBacktrack2;
                              case 3:
-                                   goto AlternationBacktrack5;
+                                   goto AlternationBacktrack3;
                          }
                          
                          AlternationMatch1:;
              (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
      }
      
+       /// <summary>Pops 2 values from the backtracking stack.</summary>
+       [MethodImpl(MethodImplOptions.AggressiveInlining)]
+       internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
+       {
+           arg0 = stack[--pos];
+           arg1 = stack[--pos];
+       }
+       
      /// <summary>Pushes 2 values onto the backtracking stack.</summary>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1)
          }
      }
      
+       /// <summary>Pushes 3 values onto the backtracking stack.</summary>
+       [MethodImpl(MethodImplOptions.AggressiveInlining)]
+       internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
+       {
+           // If there's space available for all 3 values, store them.
+           int[] s = stack;
+           int p = pos;
+           if ((uint)(p + 2) < (uint)s.Length)
+           {
+               s[p] = arg0;
+               s[p + 1] = arg1;
+               s[p + 2] = arg2;
+               pos += 3;
+               return;
+           }
+       
+           // Otherwise, resize the stack to make room and try again.
+           WithResize(ref stack, ref pos, arg0, arg1, arg2);
+       
+           // <summary>Resize the backtracking stack array and push 3 values onto the stack.</summary>
+           [MethodImpl(MethodImplOptions.NoInlining)]
+           static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
+           {
+               Array.Resize(ref stack, (pos + 2) * 2);
+               StackPush(ref stack, ref pos, arg0, arg1, arg2);
+           }
+       }
+       
      /// <summary>Supports searching for characters in or not in "\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\t\n\v\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f !\"#$%&amp;'()*+,-./:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefgijklnpqrtuvwxyz{|}~\u007f".</summary>
      internal static readonly SearchValues<char> s_ascii_FFFFFFFFFFFF00FCFFFFFFFFFF5EF7FF = SearchValues.Create("\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\t\n\v\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f !\"#$%&'()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefgijklnpqrtuvwxyz{|}~\u007f");
  }

For more diff examples, see https://gist.github.com/MihuBot/a495e0c60796a8d6786b3298c93aa46a

JIT assembly changes
Total bytes of base: 54274415
Total bytes of diff: 54474333
Total bytes of delta: 199918 (0.37 % of base)
Total relative delta: 59.52
    diff is a regression.
    relative diff is a regression.

For a list of JIT diff regressions, see Regressions.md
For a list of JIT diff improvements, see Improvements.md

Sample source code for further analysis
const string JsonPath = "RegexResults-1300.json";
if (!File.Exists(JsonPath))
{
    await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2loz5g");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");



record KnownPattern(string Pattern, RegexOptions Options, int Count);

sealed class RegexEntry
{
    public required KnownPattern Regex { get; set; }
    public required string MainSource { get; set; }
    public required string PrSource { get; set; }
    public string? FullDiff { get; set; }
    public string? ShortDiff { get; set; }
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants