Skip to content

Transform regex X| into X? #118087

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 29, 2025
Merged

Conversation

stephentoub
Copy link
Member

An alternation with two branches where the second is empty is the same as the first branch just being an optional loop. The latter way of it being expressed is better optimized elsewhere in the regex transforms, e.g. with loop coalescing, so we're better off with the optional representation.

@stephentoub
Copy link
Member Author

@MihuBot regexdiff

Copy link
Contributor

Tagging subscribers to this area: @dotnet/area-system-text-regularexpressions
See info in area-owners.md if you want to be subscribed.

@MihuBot
Copy link

MihuBot commented Jul 26, 2025

471 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"\\r\\n|\\r" (823 uses)
[GeneratedRegex("\\r\\n|\\r")]
  /// Explanation:<br/>
  /// <code>
  /// ○ Match '\r'.<br/>
-   /// ○ Match with 2 alternative expressions, atomically.<br/>
-   ///     ○ Match '\n'.<br/>
-   ///     ○ Match an empty string.<br/>
+   /// ○ Match '\n' atomically, optionally.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      return false; // The input didn't match.
                  }
                  
-                   // Match with 2 alternative expressions, atomically.
+                   // Match '\n' atomically, optionally.
                  {
-                       int alternation_starting_pos = pos;
-                       
-                       // Branch 0
+                       if ((uint)slice.Length > (uint)1 && slice[1] == '\n')
                      {
-                           // Match '\n'.
-                           if ((uint)slice.Length < 2 || slice[1] != '\n')
-                           {
-                               goto AlternationBranch;
-                           }
-                           
-                           pos += 2;
-                           slice = inputSpan.Slice(pos);
-                           goto AlternationMatch;
-                           
-                           AlternationBranch:
-                           pos = alternation_starting_pos;
-                           slice = inputSpan.Slice(pos);
-                       }
-                       
-                       // Branch 1
-                       {
-                           
+                           slice = slice.Slice(1);
                          pos++;
-                           slice = inputSpan.Slice(pos);
                      }
-                       
-                       AlternationMatch:;
                  }
                  
                  // The input matched.
+                   pos++;
                  base.runtextpos = pos;
                  base.Capture(0, matchStart, pos);
                  return true;
"(\\b\\d+([\\.,]\\d*)?|\\b)(?<unit>or[ae]|hrs ..." (330 uses)
[GeneratedRegex("(\\b\\d+([\\.,]\\d*)?|\\b)(?<unit>or[ae]|hrs|hr|h|minut[oi]|mins|min|second[oi]|secs|sec)\\b", RegexOptions.ExplicitCapture | RegexOptions.Singleline)]
  ///             ○ Match a character in the set [ae].<br/>
  ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match 'h'.<br/>
-   ///             ○ Match with 2 alternative expressions.<br/>
-   ///                 ○ Match a sequence of expressions.<br/>
-   ///                     ○ Match 'r'.<br/>
-   ///                     ○ Match with 2 alternative expressions.<br/>
-   ///                         ○ Match 's'.<br/>
-   ///                         ○ Match an empty string.<br/>
-   ///                 ○ Match an empty string.<br/>
+   ///             ○ Optional (greedy).<br/>
+   ///                 ○ Match 'r'.<br/>
+   ///                 ○ Match 's' greedily, optionally.<br/>
  ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match the string "min".<br/>
  ///             ○ Match with 3 alternative expressions.<br/>
                  int alternation_branch1 = 0;
                  int alternation_branch2 = 0;
                  int alternation_branch3 = 0;
-                   int alternation_branch4 = 0;
-                   int alternation_branch5 = 0;
                  int alternation_starting_capturepos = 0;
                  int alternation_starting_capturepos1 = 0;
                  int alternation_starting_capturepos2 = 0;
                  int alternation_starting_capturepos3 = 0;
-                   int alternation_starting_capturepos4 = 0;
-                   int alternation_starting_capturepos5 = 0;
                  int alternation_starting_pos = 0;
                  int alternation_starting_pos1 = 0;
                  int alternation_starting_pos2 = 0;
                  int alternation_starting_pos3 = 0;
-                   int alternation_starting_pos4 = 0;
-                   int alternation_starting_pos5 = 0;
                  int capture_starting_pos = 0;
                  int charloop_capture_pos = 0;
                  int charloop_starting_pos = 0, charloop_ending_pos = 0;
+                   int charloop_starting_pos1 = 0, charloop_ending_pos1 = 0;
                  int loop_iteration = 0;
+                   int loop_iteration1 = 0;
                  int stackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                                  goto AlternationBranch2;
                              }
                              
-                               // Match with 2 alternative expressions.
+                               // Optional (greedy).
                              //{
-                                   alternation_starting_pos2 = pos;
-                                   alternation_starting_capturepos2 = base.Crawlpos();
+                                   pos++;
+                                   slice = inputSpan.Slice(pos);
+                                   loop_iteration1 = 0;
                                  
-                                   // Branch 0
-                                   //{
-                                       // Match 'r'.
-                                       if ((uint)slice.Length < 2 || slice[1] != 'r')
-                                       {
-                                           goto AlternationBranch3;
-                                       }
-                                       
-                                       // Match with 2 alternative expressions.
-                                       //{
-                                           alternation_starting_pos3 = pos;
-                                           alternation_starting_capturepos3 = base.Crawlpos();
-                                           
-                                           // Branch 0
-                                           //{
-                                               // Match 's'.
-                                               if ((uint)slice.Length < 3 || slice[2] != 's')
-                                               {
-                                                   goto AlternationBranch4;
-                                               }
-                                               
-                                               alternation_branch3 = 0;
-                                               pos += 3;
-                                               slice = inputSpan.Slice(pos);
-                                               goto AlternationMatch3;
-                                               
-                                               AlternationBranch4:
-                                               pos = alternation_starting_pos3;
-                                               slice = inputSpan.Slice(pos);
-                                               UncaptureUntil(alternation_starting_capturepos3);
-                                           //}
-                                           
-                                           // Branch 1
-                                           //{
-                                               
-                                               alternation_branch3 = 1;
-                                               pos += 2;
-                                               slice = inputSpan.Slice(pos);
-                                               goto AlternationMatch3;
-                                           //}
-                                           
-                                           AlternationBacktrack3:
-                                           if (Utilities.s_hasTimeout)
-                                           {
-                                               base.CheckTimeout();
-                                           }
-                                           
-                                           switch (alternation_branch3)
-                                           {
-                                               case 0:
-                                                   goto AlternationBranch4;
-                                               case 1:
-                                                   goto AlternationBranch3;
-                                           }
-                                           
-                                           AlternationMatch3:;
-                                       //}
-                                       
-                                       alternation_branch2 = 0;
-                                       goto AlternationMatch2;
-                                       
-                                       AlternationBranch3:
-                                       pos = alternation_starting_pos2;
-                                       slice = inputSpan.Slice(pos);
-                                       UncaptureUntil(alternation_starting_capturepos2);
-                                   //}
+                                   LoopBody1:
+                                   Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), pos);
                                  
-                                   // Branch 1
+                                   loop_iteration1++;
+                                   
+                                   // Match 'r'.
+                                   if (slice.IsEmpty || slice[0] != 'r')
+                                   {
+                                       goto LoopIterationNoMatch1;
+                                   }
+                                   
+                                   // Match 's' greedily, optionally.
                                  //{
-                                       
-                                       alternation_branch2 = 1;
                                      pos++;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch2;
+                                       charloop_starting_pos1 = pos;
+                                       
+                                       if (!slice.IsEmpty && slice[0] == 's')
+                                       {
+                                           slice = slice.Slice(1);
+                                           pos++;
+                                       }
+                                       
+                                       charloop_ending_pos1 = pos;
+                                       goto CharLoopEnd1;
+                                       
+                                       CharLoopBacktrack1:
+                                       UncaptureUntil(base.runstack![--stackpos]);
+                                       Utilities.StackPop(base.runstack!, ref stackpos, out charloop_ending_pos1, out charloop_starting_pos1);
+                                       
+                                       if (Utilities.s_hasTimeout)
+                                       {
+                                           base.CheckTimeout();
+                                       }
+                                       
+                                       if (charloop_starting_pos1 >= charloop_ending_pos1)
+                                       {
+                                           goto LoopIterationNoMatch1;
+                                       }
+                                       pos = --charloop_ending_pos1;
+                                       slice = inputSpan.Slice(pos);
+                                       
+                                       CharLoopEnd1:
+                                       Utilities.StackPush(ref base.runstack!, ref stackpos, charloop_starting_pos1, charloop_ending_pos1, base.Crawlpos());
                                  //}
                                  
-                                   AlternationBacktrack2:
+                                   
+                                   // The loop has an upper bound of 1. Continue iterating greedily if it hasn't yet been reached.
+                                   if (loop_iteration1 == 0)
+                                   {
+                                       goto LoopBody1;
+                                   }
+                                   goto LoopEnd1;
+                                   
+                                   // The loop iteration failed. Put state back to the way it was before the iteration.
+                                   LoopIterationNoMatch1:
+                                   if (--loop_iteration1 < 0)
+                                   {
+                                       // Unable to match the remainder of the expression after exhausting the loop.
+                                       goto AlternationBranch2;
+                                   }
+                                   pos = base.runstack![--stackpos];
+                                   UncaptureUntil(base.runstack![--stackpos]);
+                                   slice = inputSpan.Slice(pos);
+                                   goto LoopEnd1;
+                                   
+                                   LoopBacktrack:
                                  if (Utilities.s_hasTimeout)
                                  {
                                      base.CheckTimeout();
                                  }
                                  
-                                   switch (alternation_branch2)
+                                   if (loop_iteration1 == 0)
                                  {
-                                       case 0:
-                                           goto AlternationBacktrack3;
-                                       case 1:
-                                           goto AlternationBranch2;
+                                       // No iterations of the loop remain to backtrack into. Fail the loop.
+                                       goto AlternationBranch2;
                                  }
-                                   
-                                   AlternationMatch2:;
+                                   goto CharLoopBacktrack1;
+                                   LoopEnd1:;
                              //}
                              
                              alternation_branch1 = 1;
                              // Match the string "min".
                              if (!slice.StartsWith("min"))
                              {
-                                   goto AlternationBranch5;
+                                   goto AlternationBranch3;
                              }
                              
                              // Match with 3 alternative expressions.
                              //{
-                                   alternation_starting_pos4 = pos;
-                                   alternation_starting_capturepos4 = base.Crawlpos();
+                                   alternation_starting_pos2 = pos;
+                                   alternation_starting_capturepos2 = base.Crawlpos();
                                  
                                  // Branch 0
                                  //{
                                          !slice.Slice(3).StartsWith("ut") || // Match the string "ut".
                                          (((ch = slice[5]) != 'i') & (ch != 'o'))) // Match a character in the set [io].
                                      {
-                                           goto AlternationBranch6;
+                                           goto AlternationBranch4;
                                      }
                                      
-                                       alternation_branch4 = 0;
+                                       alternation_branch2 = 0;
                                      pos += 6;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch4;
+                                       goto AlternationMatch2;
                                      
-                                       AlternationBranch6:
-                                       pos = alternation_starting_pos4;
+                                       AlternationBranch4:
+                                       pos = alternation_starting_pos2;
                                      slice = inputSpan.Slice(pos);
-                                       UncaptureUntil(alternation_starting_capturepos4);
+                                       UncaptureUntil(alternation_starting_capturepos2);
                                  //}
                                  
                                  // Branch 1
                                      // Match 's'.
                                      if ((uint)slice.Length < 4 || slice[3] != 's')
                                      {
-                                           goto AlternationBranch7;
+                                           goto AlternationBranch5;
                                      }
                                      
-                                       alternation_branch4 = 1;
+                                       alternation_branch2 = 1;
                                      pos += 4;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch4;
+                                       goto AlternationMatch2;
                                      
-                                       AlternationBranch7:
-                                       pos = alternation_starting_pos4;
+                                       AlternationBranch5:
+                                       pos = alternation_starting_pos2;
                                      slice = inputSpan.Slice(pos);
-                                       UncaptureUntil(alternation_starting_capturepos4);
+                                       UncaptureUntil(alternation_starting_capturepos2);
                                  //}
                                  
                                  // Branch 2
                                  //{
                                      
-                                       alternation_branch4 = 2;
+                                       alternation_branch2 = 2;
                                      pos += 3;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch4;
+                                       goto AlternationMatch2;
                                  //}
                                  
-                                   AlternationBacktrack4:
+                                   AlternationBacktrack2:
                                  if (Utilities.s_hasTimeout)
                                  {
                                      base.CheckTimeout();
                                  }
                                  
-                                   switch (alternation_branch4)
+                                   switch (alternation_branch2)
                                  {
                                      case 0:
-                                           goto AlternationBranch6;
+                                           goto AlternationBranch4;
                                      case 1:
-                                           goto AlternationBranch7;
-                                       case 2:
                                          goto AlternationBranch5;
+                                       case 2:
+                                           goto AlternationBranch3;
                                  }
                                  
-                                   AlternationMatch4:;
+                                   AlternationMatch2:;
                              //}
                              
                              alternation_branch1 = 2;
                              goto AlternationMatch1;
                              
-                               AlternationBranch5:
+                               AlternationBranch3:
                              pos = alternation_starting_pos1;
                              slice = inputSpan.Slice(pos);
                              UncaptureUntil(alternation_starting_capturepos1);
                              
                              // Match with 3 alternative expressions.
                              //{
-                                   alternation_starting_pos5 = pos;
-                                   alternation_starting_capturepos5 = base.Crawlpos();
+                                   alternation_starting_pos3 = pos;
+                                   alternation_starting_capturepos3 = base.Crawlpos();
                                  
                                  // Branch 0
                                  //{
                                          !slice.Slice(3).StartsWith("ond") || // Match the string "ond".
                                          (((ch = slice[6]) != 'i') & (ch != 'o'))) // Match a character in the set [io].
                                      {
-                                           goto AlternationBranch8;
+                                           goto AlternationBranch6;
                                      }
                                      
-                                       alternation_branch5 = 0;
+                                       alternation_branch3 = 0;
                                      pos += 7;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch5;
+                                       goto AlternationMatch3;
                                      
-                                       AlternationBranch8:
-                                       pos = alternation_starting_pos5;
+                                       AlternationBranch6:
+                                       pos = alternation_starting_pos3;
                                      slice = inputSpan.Slice(pos);
-                                       UncaptureUntil(alternation_starting_capturepos5);
+                                       UncaptureUntil(alternation_starting_capturepos3);
                                  //}
                                  
                                  // Branch 1
                                      // Match 's'.
                                      if ((uint)slice.Length < 4 || slice[3] != 's')
                                      {
-                                           goto AlternationBranch9;
+                                           goto AlternationBranch7;
                                      }
                                      
-                                       alternation_branch5 = 1;
+                                       alternation_branch3 = 1;
                                      pos += 4;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch5;
+                                       goto AlternationMatch3;
                                      
-                                       AlternationBranch9:
-                                       pos = alternation_starting_pos5;
+                                       AlternationBranch7:
+                                       pos = alternation_starting_pos3;
                                      slice = inputSpan.Slice(pos);
-                                       UncaptureUntil(alternation_starting_capturepos5);
+                                       UncaptureUntil(alternation_starting_capturepos3);
                                  //}
                                  
                                  // Branch 2
                                  //{
                                      
-                                       alternation_branch5 = 2;
+                                       alternation_branch3 = 2;
                                      pos += 3;
                                      slice = inputSpan.Slice(pos);
-                                       goto AlternationMatch5;
+                                       goto AlternationMatch3;
                                  //}
                                  
-                                   AlternationBacktrack5:
+                                   AlternationBacktrack3:
                                  if (Utilities.s_hasTimeout)
                                  {
                                      base.CheckTimeout();
                                  }
                                  
-                                   switch (alternation_branch5)
+                                   switch (alternation_branch3)
                                  {
                                      case 0:
-                                           goto AlternationBranch8;
+                                           goto AlternationBranch6;
                                      case 1:
-                                           goto AlternationBranch9;
+                                           goto AlternationBranch7;
                                      case 2:
                                          goto AlternationBacktrack;
                                  }
                                  
-                                   AlternationMatch5:;
+                                   AlternationMatch3:;
                              //}
                              
                              alternation_branch1 = 3;
                              case 0:
                                  goto AlternationBranch1;
                              case 1:
-                                   goto AlternationBacktrack2;
+                                   goto LoopBacktrack;
                              case 2:
-                                   goto AlternationBacktrack4;
+                                   goto AlternationBacktrack2;
                              case 3:
-                                   goto AlternationBacktrack5;
+                                   goto AlternationBacktrack3;
                          }
                          
                          AlternationMatch1:;
              (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
      }
      
+       /// <summary>Pops 2 values from the backtracking stack.</summary>
+       [MethodImpl(MethodImplOptions.AggressiveInlining)]
+       internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
+       {
+           arg0 = stack[--pos];
+           arg1 = stack[--pos];
+       }
+       
      /// <summary>Pushes 2 values onto the backtracking stack.</summary>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1)
          }
      }
      
+       /// <summary>Pushes 3 values onto the backtracking stack.</summary>
+       [MethodImpl(MethodImplOptions.AggressiveInlining)]
+       internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
+       {
+           // If there's space available for all 3 values, store them.
+           int[] s = stack;
+           int p = pos;
+           if ((uint)(p + 2) < (uint)s.Length)
+           {
+               s[p] = arg0;
+               s[p + 1] = arg1;
+               s[p + 2] = arg2;
+               pos += 3;
+               return;
+           }
+       
+           // Otherwise, resize the stack to make room and try again.
+           WithResize(ref stack, ref pos, arg0, arg1, arg2);
+       
+           // <summary>Resize the backtracking stack array and push 3 values onto the stack.</summary>
+           [MethodImpl(MethodImplOptions.NoInlining)]
+           static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
+           {
+               Array.Resize(ref stack, (pos + 2) * 2);
+               StackPush(ref stack, ref pos, arg0, arg1, arg2);
+           }
+       }
+       
      /// <summary>Supports searching for characters in or not in "\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\t\n\v\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f !\"#$%&amp;'()*+,-./:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefgijklnpqrtuvwxyz{|}~\u007f".</summary>
      internal static readonly SearchValues<char> s_ascii_FFFFFFFFFFFF00FCFFFFFFFFFF5EF7FF = SearchValues.Create("\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\t\n\v\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f !\"#$%&'()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefgijklnpqrtuvwxyz{|}~\u007f");
  }

For more diff examples, see https://gist.github.com/MihuBot/a495e0c60796a8d6786b3298c93aa46a

JIT assembly changes
Total bytes of base: 54274415
Total bytes of diff: 54474333
Total bytes of delta: 199918 (0.37 % of base)
Total relative delta: 59.52
    diff is a regression.
    relative diff is a regression.

For a list of JIT diff regressions, see Regressions.md
For a list of JIT diff improvements, see Improvements.md

Sample source code for further analysis
const string JsonPath = "RegexResults-1300.json";
if (!File.Exists(JsonPath))
{
    await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2loz5g");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");



record KnownPattern(string Pattern, RegexOptions Options, int Count);

sealed class RegexEntry
{
    public required KnownPattern Regex { get; set; }
    public required string MainSource { get; set; }
    public required string PrSource { get; set; }
    public string? FullDiff { get; set; }
    public string? ShortDiff { get; set; }
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

@stephentoub
Copy link
Member Author

@MihuBot regexdiff

@stephentoub
Copy link
Member Author

@MihuBot regexdiff

@MihuBot
Copy link

MihuBot commented Jul 28, 2025

659 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"\\r\\n|\\r" (823 uses)
[GeneratedRegex("\\r\\n|\\r")]
  /// Explanation:<br/>
  /// <code>
  /// ○ Match '\r'.<br/>
-   /// ○ Match with 2 alternative expressions, atomically.<br/>
-   ///     ○ Match '\n'.<br/>
-   ///     ○ Match an empty string.<br/>
+   /// ○ Match '\n' atomically, optionally.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      return false; // The input didn't match.
                  }
                  
-                   // Match with 2 alternative expressions, atomically.
+                   // Match '\n' atomically, optionally.
                  {
-                       int alternation_starting_pos = pos;
-                       
-                       // Branch 0
+                       if ((uint)slice.Length > (uint)1 && slice[1] == '\n')
                      {
-                           // Match '\n'.
-                           if ((uint)slice.Length < 2 || slice[1] != '\n')
-                           {
-                               goto AlternationBranch;
-                           }
-                           
-                           pos += 2;
-                           slice = inputSpan.Slice(pos);
-                           goto AlternationMatch;
-                           
-                           AlternationBranch:
-                           pos = alternation_starting_pos;
-                           slice = inputSpan.Slice(pos);
-                       }
-                       
-                       // Branch 1
-                       {
-                           
+                           slice = slice.Slice(1);
                          pos++;
-                           slice = inputSpan.Slice(pos);
                      }
-                       
-                       AlternationMatch:;
                  }
                  
                  // The input matched.
+                   pos++;
                  base.runtextpos = pos;
                  base.Capture(0, matchStart, pos);
                  return true;
"(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+" (479 uses)
[GeneratedRegex("(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+")]
  /// <code>
  /// ○ 1st capture group.<br/>
  ///     ○ Match the string "http".<br/>
-   ///     ○ Match with 2 alternative expressions.<br/>
-   ///         ○ Match an empty string.<br/>
-   ///         ○ Match 's'.<br/>
+   ///     ○ Match 's' atomically, optionally.<br/>
  /// ○ Match the string "://".<br/>
  /// ○ Match a character in the set [\-_\w] atomically at least once.<br/>
  /// ○ Loop greedily and atomically at least once.<br/>
                  int pos = base.runtextpos;
                  int matchStart = pos;
                  char ch;
-                   int alternation_branch = 0;
-                   int alternation_starting_capturepos = 0;
-                   int alternation_starting_pos = 0;
                  int capture_starting_pos = 0;
                  int loop_iteration = 0;
                  int stackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // 1st capture group.
-                   //{
+                   {
                      capture_starting_pos = pos;
                      
                      // Match the string "http".
                          return false; // The input didn't match.
                      }
                      
-                       // Match with 2 alternative expressions.
-                       //{
-                           alternation_starting_pos = pos;
-                           alternation_starting_capturepos = base.Crawlpos();
-                           
-                           // Branch 0
-                           //{
-                               
-                               alternation_branch = 0;
-                               pos += 4;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                               
-                               AlternationBranch:
-                               pos = alternation_starting_pos;
-                               slice = inputSpan.Slice(pos);
-                               UncaptureUntil(alternation_starting_capturepos);
-                           //}
-                           
-                           // Branch 1
-                           //{
-                               // Match 's'.
-                               if ((uint)slice.Length < 5 || slice[4] != 's')
-                               {
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                               }
-                               
-                               alternation_branch = 1;
-                               pos += 5;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                           //}
-                           
-                           AlternationBacktrack:
-                           if (Utilities.s_hasTimeout)
+                       // Match 's' atomically, optionally.
+                       {
+                           if ((uint)slice.Length > (uint)4 && slice[4] == 's')
                          {
-                               base.CheckTimeout();
+                               slice = slice.Slice(1);
+                               pos++;
                          }
-                           
-                           switch (alternation_branch)
-                           {
-                               case 0:
-                                   goto AlternationBranch;
-                               case 1:
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                           }
-                           
-                           AlternationMatch:;
-                       //}
+                       }
                      
+                       pos += 4;
+                       slice = inputSpan.Slice(pos);
                      base.Capture(1, capture_starting_pos, pos);
-                       
-                       goto CaptureSkipBacktrack;
-                       
-                       CaptureBacktrack:
-                       goto AlternationBacktrack;
-                       
-                       CaptureSkipBacktrack:;
-                   //}
+                   }
                  
                  // Match the string "://".
                  if (!slice.StartsWith("://"))
                  {
-                       goto CaptureBacktrack;
+                       UncaptureUntil(0);
+                       return false; // The input didn't match.
                  }
                  
                  // Match a character in the set [\-_\w] atomically at least once.
                      
                      if (iteration == 0)
                      {
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      slice = slice.Slice(iteration);
                      if (--loop_iteration < 0)
                      {
                          // Unable to match the remainder of the expression after exhausting the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      pos = base.runstack![--stackpos];
                      UncaptureUntil(base.runstack![--stackpos]);
                      if (loop_iteration == 0)
                      {
                          // All possible iterations have matched, but it's below the required minimum of 1. Fail the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      LoopEnd:
"(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+) ..." (479 uses)
[GeneratedRegex("(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,/~\\+#]*)?")]
  /// <code>
  /// ○ 1st capture group.<br/>
  ///     ○ Match the string "http".<br/>
-   ///     ○ Match with 2 alternative expressions.<br/>
-   ///         ○ Match an empty string.<br/>
-   ///         ○ Match 's'.<br/>
+   ///     ○ Match 's' atomically, optionally.<br/>
  /// ○ Match the string "://".<br/>
  /// ○ Match a character in the set [\-_\w] atomically at least once.<br/>
  /// ○ Loop greedily at least once.<br/>
                  int pos = base.runtextpos;
                  int matchStart = pos;
                  char ch;
-                   int alternation_branch = 0;
-                   int alternation_starting_capturepos = 0;
-                   int alternation_starting_pos = 0;
                  int capture_starting_pos = 0;
                  int charloop_starting_pos = 0, charloop_ending_pos = 0;
                  int loop_iteration = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // 1st capture group.
-                   //{
+                   {
                      capture_starting_pos = pos;
                      
                      // Match the string "http".
                          return false; // The input didn't match.
                      }
                      
-                       // Match with 2 alternative expressions.
-                       //{
-                           alternation_starting_pos = pos;
-                           alternation_starting_capturepos = base.Crawlpos();
-                           
-                           // Branch 0
-                           //{
-                               
-                               alternation_branch = 0;
-                               pos += 4;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                               
-                               AlternationBranch:
-                               pos = alternation_starting_pos;
-                               slice = inputSpan.Slice(pos);
-                               UncaptureUntil(alternation_starting_capturepos);
-                           //}
-                           
-                           // Branch 1
-                           //{
-                               // Match 's'.
-                               if ((uint)slice.Length < 5 || slice[4] != 's')
-                               {
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                               }
-                               
-                               alternation_branch = 1;
-                               pos += 5;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                           //}
-                           
-                           AlternationBacktrack:
-                           if (Utilities.s_hasTimeout)
+                       // Match 's' atomically, optionally.
+                       {
+                           if ((uint)slice.Length > (uint)4 && slice[4] == 's')
                          {
-                               base.CheckTimeout();
+                               slice = slice.Slice(1);
+                               pos++;
                          }
-                           
-                           switch (alternation_branch)
-                           {
-                               case 0:
-                                   goto AlternationBranch;
-                               case 1:
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                           }
-                           
-                           AlternationMatch:;
-                       //}
+                       }
                      
+                       pos += 4;
+                       slice = inputSpan.Slice(pos);
                      base.Capture(1, capture_starting_pos, pos);
-                       
-                       goto CaptureSkipBacktrack;
-                       
-                       CaptureBacktrack:
-                       goto AlternationBacktrack;
-                       
-                       CaptureSkipBacktrack:;
-                   //}
+                   }
                  
                  // Match the string "://".
                  if (!slice.StartsWith("://"))
                  {
-                       goto CaptureBacktrack;
+                       UncaptureUntil(0);
+                       return false; // The input didn't match.
                  }
                  
                  // Match a character in the set [\-_\w] atomically at least once.
                      
                      if (iteration == 0)
                      {
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      slice = slice.Slice(iteration);
                          base.Capture(2, capture_starting_pos1, pos);
                          
                          Utilities.StackPush(ref base.runstack!, ref stackpos, capture_starting_pos1);
-                           goto CaptureSkipBacktrack1;
+                           goto CaptureSkipBacktrack;
                          
-                           CaptureBacktrack1:
+                           CaptureBacktrack:
                          capture_starting_pos1 = base.runstack![--stackpos];
                          goto CharLoopBacktrack;
                          
-                           CaptureSkipBacktrack1:;
+                           CaptureSkipBacktrack:;
                      //}
                      
                      
                      if (--loop_iteration < 0)
                      {
                          // Unable to match the remainder of the expression after exhausting the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      pos = base.runstack![--stackpos];
                      UncaptureUntil(base.runstack![--stackpos]);
                      if (loop_iteration == 0)
                      {
                          // No iterations have been matched to backtrack into. Fail the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      goto LoopEnd;
                      if (loop_iteration == 0)
                      {
                          // No iterations of the loop remain to backtrack into. Fail the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
-                       goto CaptureBacktrack1;
+                       goto CaptureBacktrack;
                      LoopEnd:;
                  //}
"(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+) ..." (479 uses)
[GeneratedRegex("(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,/~\\+#]*)?/")]
  /// <code>
  /// ○ 1st capture group.<br/>
  ///     ○ Match the string "http".<br/>
-   ///     ○ Match with 2 alternative expressions.<br/>
-   ///         ○ Match an empty string.<br/>
-   ///         ○ Match 's'.<br/>
+   ///     ○ Match 's' atomically, optionally.<br/>
  /// ○ Match the string "://".<br/>
  /// ○ Match a character in the set [\-_\w] atomically at least once.<br/>
  /// ○ Loop greedily at least once.<br/>
                  int pos = base.runtextpos;
                  int matchStart = pos;
                  char ch;
-                   int alternation_branch = 0;
-                   int alternation_starting_capturepos = 0;
-                   int alternation_starting_pos = 0;
                  int capture_starting_pos = 0;
                  int charloop_starting_pos = 0, charloop_ending_pos = 0;
                  int charloop_starting_pos1 = 0, charloop_ending_pos1 = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // 1st capture group.
-                   //{
+                   {
                      capture_starting_pos = pos;
                      
                      // Match the string "http".
                          return false; // The input didn't match.
                      }
                      
-                       // Match with 2 alternative expressions.
-                       //{
-                           alternation_starting_pos = pos;
-                           alternation_starting_capturepos = base.Crawlpos();
-                           
-                           // Branch 0
-                           //{
-                               
-                               alternation_branch = 0;
-                               pos += 4;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                               
-                               AlternationBranch:
-                               pos = alternation_starting_pos;
-                               slice = inputSpan.Slice(pos);
-                               UncaptureUntil(alternation_starting_capturepos);
-                           //}
-                           
-                           // Branch 1
-                           //{
-                               // Match 's'.
-                               if ((uint)slice.Length < 5 || slice[4] != 's')
-                               {
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                               }
-                               
-                               alternation_branch = 1;
-                               pos += 5;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                           //}
-                           
-                           AlternationBacktrack:
-                           if (Utilities.s_hasTimeout)
+                       // Match 's' atomically, optionally.
+                       {
+                           if ((uint)slice.Length > (uint)4 && slice[4] == 's')
                          {
-                               base.CheckTimeout();
+                               slice = slice.Slice(1);
+                               pos++;
                          }
-                           
-                           switch (alternation_branch)
-                           {
-                               case 0:
-                                   goto AlternationBranch;
-                               case 1:
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                           }
-                           
-                           AlternationMatch:;
-                       //}
+                       }
                      
+                       pos += 4;
+                       slice = inputSpan.Slice(pos);
                      base.Capture(1, capture_starting_pos, pos);
-                       
-                       goto CaptureSkipBacktrack;
-                       
-                       CaptureBacktrack:
-                       goto AlternationBacktrack;
-                       
-                       CaptureSkipBacktrack:;
-                   //}
+                   }
                  
                  // Match the string "://".
                  if (!slice.StartsWith("://"))
                  {
-                       goto CaptureBacktrack;
+                       UncaptureUntil(0);
+                       return false; // The input didn't match.
                  }
                  
                  // Match a character in the set [\-_\w] atomically at least once.
                      
                      if (iteration == 0)
                      {
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      slice = slice.Slice(iteration);
                          base.Capture(2, capture_starting_pos1, pos);
                          
                          Utilities.StackPush(ref base.runstack!, ref stackpos, capture_starting_pos1);
-                           goto CaptureSkipBacktrack1;
+                           goto CaptureSkipBacktrack;
                          
-                           CaptureBacktrack1:
+                           CaptureBacktrack:
                          capture_starting_pos1 = base.runstack![--stackpos];
                          goto CharLoopBacktrack;
                          
-                           CaptureSkipBacktrack1:;
+                           CaptureSkipBacktrack:;
                      //}
                      
                      
                      if (--loop_iteration < 0)
                      {
                          // Unable to match the remainder of the expression after exhausting the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      pos = base.runstack![--stackpos];
                      UncaptureUntil(base.runstack![--stackpos]);
                      if (loop_iteration == 0)
                      {
                          // No iterations have been matched to backtrack into. Fail the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      goto LoopEnd;
                      if (loop_iteration == 0)
                      {
                          // No iterations of the loop remain to backtrack into. Fail the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
-                       goto CaptureBacktrack1;
+                       goto CaptureBacktrack;
                      LoopEnd:;
                  //}
                  
                          base.Capture(3, capture_starting_pos2, pos);
                          
                          Utilities.StackPush(ref base.runstack!, ref stackpos, capture_starting_pos2);
-                           goto CaptureSkipBacktrack2;
+                           goto CaptureSkipBacktrack1;
                          
-                           CaptureBacktrack2:
+                           CaptureBacktrack1:
                          capture_starting_pos2 = base.runstack![--stackpos];
                          goto CharLoopBacktrack1;
                          
-                           CaptureSkipBacktrack2:;
+                           CaptureSkipBacktrack1:;
                      //}
                      
                      
                          // No iterations of the loop remain to backtrack into. Fail the loop.
                          goto LoopBacktrack;
                      }
-                       goto CaptureBacktrack2;
+                       goto CaptureBacktrack1;
                      LoopEnd1:;
                  //}
"^ps_(?<major>1|2|3|4|5)_(?<minor>0|1|)$" (308 uses)
[GeneratedRegex("^ps_(?<major>1|2|3|4|5)_(?<minor>0|1|)$")]
  ///     ○ Match a character in the set [1-5].<br/>
  /// ○ Match '_'.<br/>
  /// ○ "minor" capture group.<br/>
-   ///     ○ Match with 2 alternative expressions.<br/>
-   ///         ○ Match a character in the set [01].<br/>
-   ///         ○ Match an empty string.<br/>
+   ///     ○ Match a character in the set [01] atomically, optionally.<br/>
  /// ○ Match if at the end of the string or if before an ending newline.<br/>
  /// </code>
  /// </remarks>
              {
                  int pos = base.runtextpos;
                  int matchStart = pos;
-                   int alternation_branch = 0;
-                   int alternation_starting_capturepos = 0;
-                   int alternation_starting_pos = 0;
                  int capture_starting_pos = 0;
                  int capture_starting_pos1 = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  }
                  
                  // "minor" capture group.
-                   //{
+                   {
                      pos++;
                      slice = inputSpan.Slice(pos);
                      capture_starting_pos1 = pos;
                      
-                       // Match with 2 alternative expressions.
-                       //{
-                           alternation_starting_pos = pos;
-                           alternation_starting_capturepos = base.Crawlpos();
-                           
-                           // Branch 0
-                           //{
-                               // Match a character in the set [01].
-                               if (slice.IsEmpty || !char.IsBetween(slice[0], '0', '1'))
-                               {
-                                   goto AlternationBranch;
-                               }
-                               
-                               alternation_branch = 0;
+                       // Match a character in the set [01] atomically, optionally.
+                       {
+                           if (!slice.IsEmpty && char.IsBetween(slice[0], '0', '1'))
+                           {
+                               slice = slice.Slice(1);
                              pos++;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                               
-                               AlternationBranch:
-                               pos = alternation_starting_pos;
-                               slice = inputSpan.Slice(pos);
-                               UncaptureUntil(alternation_starting_capturepos);
-                           //}
-                           
-                           // Branch 1
-                           //{
-                               
-                               alternation_branch = 1;
-                               goto AlternationMatch;
-                           //}
-                           
-                           AlternationBacktrack:
-                           if (Utilities.s_hasTimeout)
-                           {
-                               base.CheckTimeout();
                          }
-                           
-                           switch (alternation_branch)
-                           {
-                               case 0:
-                                   goto AlternationBranch;
-                               case 1:
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                           }
-                           
-                           AlternationMatch:;
-                       //}
+                       }
                      
                      base.Capture(2, capture_starting_pos1, pos);
-                       
-                       goto CaptureSkipBacktrack;
-                       
-                       CaptureBacktrack:
-                       goto AlternationBacktrack;
-                       
-                       CaptureSkipBacktrack:;
-                   //}
+                   }
                  
                  // Match if at the end of the string or if before an ending newline.
                  if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n'))
                  {
-                       goto CaptureBacktrack;
+                       UncaptureUntil(0);
+                       return false; // The input didn't match.
                  }
                  
                  // The input matched.

For more diff examples, see https://gist.github.com/MihuBot/9cd26902240a71cfb7e4de17b43e6f9c

JIT assembly changes
Total bytes of base: 53732389
Total bytes of diff: 53928517
Total bytes of delta: 196128 (0.37 % of base)
Total relative delta: 53.96
    diff is a regression.
    relative diff is a regression.

For a list of JIT diff regressions, see Regressions.md
For a list of JIT diff improvements, see Improvements.md

Sample source code for further analysis
const string JsonPath = "RegexResults-1313.json";
if (!File.Exists(JsonPath))
{
    await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2v-Wpg");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");



record KnownPattern(string Pattern, RegexOptions Options, int Count);

sealed class RegexEntry
{
    public required KnownPattern Regex { get; set; }
    public required string MainSource { get; set; }
    public required string PrSource { get; set; }
    public string? FullDiff { get; set; }
    public string? ShortDiff { get; set; }
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

@stephentoub stephentoub marked this pull request as ready for review July 28, 2025 21:54
@stephentoub stephentoub requested review from Copilot and MihaZupan July 28, 2025 21:54
Copy link
Contributor

@Copilot Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull Request Overview

This PR implements an optimization for regular expressions that transforms alternations with empty branches (like X|) into optional quantifiers (like X?). This transformation enables better downstream optimizations, particularly loop coalescing.

Key changes:

  • Transforms X| patterns into X? and |X patterns into X?? (lazy optional)
  • Ensures re-reduction when new optimization opportunities are exposed
  • Adds comprehensive test coverage for various alternation-to-quantifier transformations

Reviewed Changes

Copilot reviewed 2 out of 2 changed files in this pull request and generated 1 comment.

File Description
RegexNode.cs Implements the core transformation logic in ReduceAlternation() and forces re-reduction when opportunities are detected
RegexReductionTests.cs Adds test cases validating the transformation of alternations with empty branches to optional quantifiers

An alternation with two branches where the second is empty is the same as the first branch just being an optional loop; similarly, when the first branch is empty, it's a lazy optional loop of the second branch. Expressing as an optional is better optimized elsewhere in the regex transforms, e.g. with loop coalescing, so we're better off with the optional representation.
@stephentoub stephentoub force-pushed the alternationoptional branch from c588f84 to a5b020e Compare July 28, 2025 21:57
@stephentoub
Copy link
Member Author

@MihuBot regexdiff

@MihuBot
Copy link

MihuBot commented Jul 28, 2025

659 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"\\r\\n|\\r" (823 uses)
[GeneratedRegex("\\r\\n|\\r")]
  /// Explanation:<br/>
  /// <code>
  /// ○ Match '\r'.<br/>
-   /// ○ Match with 2 alternative expressions, atomically.<br/>
-   ///     ○ Match '\n'.<br/>
-   ///     ○ Match an empty string.<br/>
+   /// ○ Match '\n' atomically, optionally.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
                      return false; // The input didn't match.
                  }
                  
-                   // Match with 2 alternative expressions, atomically.
+                   // Match '\n' atomically, optionally.
                  {
-                       int alternation_starting_pos = pos;
-                       
-                       // Branch 0
+                       if ((uint)slice.Length > (uint)1 && slice[1] == '\n')
                      {
-                           // Match '\n'.
-                           if ((uint)slice.Length < 2 || slice[1] != '\n')
-                           {
-                               goto AlternationBranch;
-                           }
-                           
-                           pos += 2;
-                           slice = inputSpan.Slice(pos);
-                           goto AlternationMatch;
-                           
-                           AlternationBranch:
-                           pos = alternation_starting_pos;
-                           slice = inputSpan.Slice(pos);
-                       }
-                       
-                       // Branch 1
-                       {
-                           
+                           slice = slice.Slice(1);
                          pos++;
-                           slice = inputSpan.Slice(pos);
                      }
-                       
-                       AlternationMatch:;
                  }
                  
                  // The input matched.
+                   pos++;
                  base.runtextpos = pos;
                  base.Capture(0, matchStart, pos);
                  return true;
"(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+" (479 uses)
[GeneratedRegex("(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+")]
  /// <code>
  /// ○ 1st capture group.<br/>
  ///     ○ Match the string "http".<br/>
-   ///     ○ Match with 2 alternative expressions.<br/>
-   ///         ○ Match an empty string.<br/>
-   ///         ○ Match 's'.<br/>
+   ///     ○ Match 's' atomically, optionally.<br/>
  /// ○ Match the string "://".<br/>
  /// ○ Match a character in the set [\-_\w] atomically at least once.<br/>
  /// ○ Loop greedily and atomically at least once.<br/>
                  int pos = base.runtextpos;
                  int matchStart = pos;
                  char ch;
-                   int alternation_branch = 0;
-                   int alternation_starting_capturepos = 0;
-                   int alternation_starting_pos = 0;
                  int capture_starting_pos = 0;
                  int loop_iteration = 0;
                  int stackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // 1st capture group.
-                   //{
+                   {
                      capture_starting_pos = pos;
                      
                      // Match the string "http".
                          return false; // The input didn't match.
                      }
                      
-                       // Match with 2 alternative expressions.
-                       //{
-                           alternation_starting_pos = pos;
-                           alternation_starting_capturepos = base.Crawlpos();
-                           
-                           // Branch 0
-                           //{
-                               
-                               alternation_branch = 0;
-                               pos += 4;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                               
-                               AlternationBranch:
-                               pos = alternation_starting_pos;
-                               slice = inputSpan.Slice(pos);
-                               UncaptureUntil(alternation_starting_capturepos);
-                           //}
-                           
-                           // Branch 1
-                           //{
-                               // Match 's'.
-                               if ((uint)slice.Length < 5 || slice[4] != 's')
-                               {
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                               }
-                               
-                               alternation_branch = 1;
-                               pos += 5;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                           //}
-                           
-                           AlternationBacktrack:
-                           if (Utilities.s_hasTimeout)
+                       // Match 's' atomically, optionally.
+                       {
+                           if ((uint)slice.Length > (uint)4 && slice[4] == 's')
                          {
-                               base.CheckTimeout();
+                               slice = slice.Slice(1);
+                               pos++;
                          }
-                           
-                           switch (alternation_branch)
-                           {
-                               case 0:
-                                   goto AlternationBranch;
-                               case 1:
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                           }
-                           
-                           AlternationMatch:;
-                       //}
+                       }
                      
+                       pos += 4;
+                       slice = inputSpan.Slice(pos);
                      base.Capture(1, capture_starting_pos, pos);
-                       
-                       goto CaptureSkipBacktrack;
-                       
-                       CaptureBacktrack:
-                       goto AlternationBacktrack;
-                       
-                       CaptureSkipBacktrack:;
-                   //}
+                   }
                  
                  // Match the string "://".
                  if (!slice.StartsWith("://"))
                  {
-                       goto CaptureBacktrack;
+                       UncaptureUntil(0);
+                       return false; // The input didn't match.
                  }
                  
                  // Match a character in the set [\-_\w] atomically at least once.
                      
                      if (iteration == 0)
                      {
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      slice = slice.Slice(iteration);
                      if (--loop_iteration < 0)
                      {
                          // Unable to match the remainder of the expression after exhausting the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      pos = base.runstack![--stackpos];
                      UncaptureUntil(base.runstack![--stackpos]);
                      if (loop_iteration == 0)
                      {
                          // All possible iterations have matched, but it's below the required minimum of 1. Fail the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      LoopEnd:
"(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+) ..." (479 uses)
[GeneratedRegex("(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,/~\\+#]*)?")]
  /// <code>
  /// ○ 1st capture group.<br/>
  ///     ○ Match the string "http".<br/>
-   ///     ○ Match with 2 alternative expressions.<br/>
-   ///         ○ Match an empty string.<br/>
-   ///         ○ Match 's'.<br/>
+   ///     ○ Match 's' atomically, optionally.<br/>
  /// ○ Match the string "://".<br/>
  /// ○ Match a character in the set [\-_\w] atomically at least once.<br/>
  /// ○ Loop greedily at least once.<br/>
                  int pos = base.runtextpos;
                  int matchStart = pos;
                  char ch;
-                   int alternation_branch = 0;
-                   int alternation_starting_capturepos = 0;
-                   int alternation_starting_pos = 0;
                  int capture_starting_pos = 0;
                  int charloop_starting_pos = 0, charloop_ending_pos = 0;
                  int loop_iteration = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // 1st capture group.
-                   //{
+                   {
                      capture_starting_pos = pos;
                      
                      // Match the string "http".
                          return false; // The input didn't match.
                      }
                      
-                       // Match with 2 alternative expressions.
-                       //{
-                           alternation_starting_pos = pos;
-                           alternation_starting_capturepos = base.Crawlpos();
-                           
-                           // Branch 0
-                           //{
-                               
-                               alternation_branch = 0;
-                               pos += 4;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                               
-                               AlternationBranch:
-                               pos = alternation_starting_pos;
-                               slice = inputSpan.Slice(pos);
-                               UncaptureUntil(alternation_starting_capturepos);
-                           //}
-                           
-                           // Branch 1
-                           //{
-                               // Match 's'.
-                               if ((uint)slice.Length < 5 || slice[4] != 's')
-                               {
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                               }
-                               
-                               alternation_branch = 1;
-                               pos += 5;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                           //}
-                           
-                           AlternationBacktrack:
-                           if (Utilities.s_hasTimeout)
+                       // Match 's' atomically, optionally.
+                       {
+                           if ((uint)slice.Length > (uint)4 && slice[4] == 's')
                          {
-                               base.CheckTimeout();
+                               slice = slice.Slice(1);
+                               pos++;
                          }
-                           
-                           switch (alternation_branch)
-                           {
-                               case 0:
-                                   goto AlternationBranch;
-                               case 1:
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                           }
-                           
-                           AlternationMatch:;
-                       //}
+                       }
                      
+                       pos += 4;
+                       slice = inputSpan.Slice(pos);
                      base.Capture(1, capture_starting_pos, pos);
-                       
-                       goto CaptureSkipBacktrack;
-                       
-                       CaptureBacktrack:
-                       goto AlternationBacktrack;
-                       
-                       CaptureSkipBacktrack:;
-                   //}
+                   }
                  
                  // Match the string "://".
                  if (!slice.StartsWith("://"))
                  {
-                       goto CaptureBacktrack;
+                       UncaptureUntil(0);
+                       return false; // The input didn't match.
                  }
                  
                  // Match a character in the set [\-_\w] atomically at least once.
                      
                      if (iteration == 0)
                      {
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      slice = slice.Slice(iteration);
                          base.Capture(2, capture_starting_pos1, pos);
                          
                          Utilities.StackPush(ref base.runstack!, ref stackpos, capture_starting_pos1);
-                           goto CaptureSkipBacktrack1;
+                           goto CaptureSkipBacktrack;
                          
-                           CaptureBacktrack1:
+                           CaptureBacktrack:
                          capture_starting_pos1 = base.runstack![--stackpos];
                          goto CharLoopBacktrack;
                          
-                           CaptureSkipBacktrack1:;
+                           CaptureSkipBacktrack:;
                      //}
                      
                      
                      if (--loop_iteration < 0)
                      {
                          // Unable to match the remainder of the expression after exhausting the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      pos = base.runstack![--stackpos];
                      UncaptureUntil(base.runstack![--stackpos]);
                      if (loop_iteration == 0)
                      {
                          // No iterations have been matched to backtrack into. Fail the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      goto LoopEnd;
                      if (loop_iteration == 0)
                      {
                          // No iterations of the loop remain to backtrack into. Fail the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
-                       goto CaptureBacktrack1;
+                       goto CaptureBacktrack;
                      LoopEnd:;
                  //}
"(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+) ..." (479 uses)
[GeneratedRegex("(http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,/~\\+#]*)?/")]
  /// <code>
  /// ○ 1st capture group.<br/>
  ///     ○ Match the string "http".<br/>
-   ///     ○ Match with 2 alternative expressions.<br/>
-   ///         ○ Match an empty string.<br/>
-   ///         ○ Match 's'.<br/>
+   ///     ○ Match 's' atomically, optionally.<br/>
  /// ○ Match the string "://".<br/>
  /// ○ Match a character in the set [\-_\w] atomically at least once.<br/>
  /// ○ Loop greedily at least once.<br/>
                  int pos = base.runtextpos;
                  int matchStart = pos;
                  char ch;
-                   int alternation_branch = 0;
-                   int alternation_starting_capturepos = 0;
-                   int alternation_starting_pos = 0;
                  int capture_starting_pos = 0;
                  int charloop_starting_pos = 0, charloop_ending_pos = 0;
                  int charloop_starting_pos1 = 0, charloop_ending_pos1 = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // 1st capture group.
-                   //{
+                   {
                      capture_starting_pos = pos;
                      
                      // Match the string "http".
                          return false; // The input didn't match.
                      }
                      
-                       // Match with 2 alternative expressions.
-                       //{
-                           alternation_starting_pos = pos;
-                           alternation_starting_capturepos = base.Crawlpos();
-                           
-                           // Branch 0
-                           //{
-                               
-                               alternation_branch = 0;
-                               pos += 4;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                               
-                               AlternationBranch:
-                               pos = alternation_starting_pos;
-                               slice = inputSpan.Slice(pos);
-                               UncaptureUntil(alternation_starting_capturepos);
-                           //}
-                           
-                           // Branch 1
-                           //{
-                               // Match 's'.
-                               if ((uint)slice.Length < 5 || slice[4] != 's')
-                               {
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                               }
-                               
-                               alternation_branch = 1;
-                               pos += 5;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                           //}
-                           
-                           AlternationBacktrack:
-                           if (Utilities.s_hasTimeout)
+                       // Match 's' atomically, optionally.
+                       {
+                           if ((uint)slice.Length > (uint)4 && slice[4] == 's')
                          {
-                               base.CheckTimeout();
+                               slice = slice.Slice(1);
+                               pos++;
                          }
-                           
-                           switch (alternation_branch)
-                           {
-                               case 0:
-                                   goto AlternationBranch;
-                               case 1:
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                           }
-                           
-                           AlternationMatch:;
-                       //}
+                       }
                      
+                       pos += 4;
+                       slice = inputSpan.Slice(pos);
                      base.Capture(1, capture_starting_pos, pos);
-                       
-                       goto CaptureSkipBacktrack;
-                       
-                       CaptureBacktrack:
-                       goto AlternationBacktrack;
-                       
-                       CaptureSkipBacktrack:;
-                   //}
+                   }
                  
                  // Match the string "://".
                  if (!slice.StartsWith("://"))
                  {
-                       goto CaptureBacktrack;
+                       UncaptureUntil(0);
+                       return false; // The input didn't match.
                  }
                  
                  // Match a character in the set [\-_\w] atomically at least once.
                      
                      if (iteration == 0)
                      {
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      slice = slice.Slice(iteration);
                          base.Capture(2, capture_starting_pos1, pos);
                          
                          Utilities.StackPush(ref base.runstack!, ref stackpos, capture_starting_pos1);
-                           goto CaptureSkipBacktrack1;
+                           goto CaptureSkipBacktrack;
                          
-                           CaptureBacktrack1:
+                           CaptureBacktrack:
                          capture_starting_pos1 = base.runstack![--stackpos];
                          goto CharLoopBacktrack;
                          
-                           CaptureSkipBacktrack1:;
+                           CaptureSkipBacktrack:;
                      //}
                      
                      
                      if (--loop_iteration < 0)
                      {
                          // Unable to match the remainder of the expression after exhausting the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      pos = base.runstack![--stackpos];
                      UncaptureUntil(base.runstack![--stackpos]);
                      if (loop_iteration == 0)
                      {
                          // No iterations have been matched to backtrack into. Fail the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
                      
                      goto LoopEnd;
                      if (loop_iteration == 0)
                      {
                          // No iterations of the loop remain to backtrack into. Fail the loop.
-                           goto CaptureBacktrack;
+                           UncaptureUntil(0);
+                           return false; // The input didn't match.
                      }
-                       goto CaptureBacktrack1;
+                       goto CaptureBacktrack;
                      LoopEnd:;
                  //}
                  
                          base.Capture(3, capture_starting_pos2, pos);
                          
                          Utilities.StackPush(ref base.runstack!, ref stackpos, capture_starting_pos2);
-                           goto CaptureSkipBacktrack2;
+                           goto CaptureSkipBacktrack1;
                          
-                           CaptureBacktrack2:
+                           CaptureBacktrack1:
                          capture_starting_pos2 = base.runstack![--stackpos];
                          goto CharLoopBacktrack1;
                          
-                           CaptureSkipBacktrack2:;
+                           CaptureSkipBacktrack1:;
                      //}
                      
                      
                          // No iterations of the loop remain to backtrack into. Fail the loop.
                          goto LoopBacktrack;
                      }
-                       goto CaptureBacktrack2;
+                       goto CaptureBacktrack1;
                      LoopEnd1:;
                  //}
"^ps_(?<major>1|2|3|4|5)_(?<minor>0|1|)$" (308 uses)
[GeneratedRegex("^ps_(?<major>1|2|3|4|5)_(?<minor>0|1|)$")]
  ///     ○ Match a character in the set [1-5].<br/>
  /// ○ Match '_'.<br/>
  /// ○ "minor" capture group.<br/>
-   ///     ○ Match with 2 alternative expressions.<br/>
-   ///         ○ Match a character in the set [01].<br/>
-   ///         ○ Match an empty string.<br/>
+   ///     ○ Match a character in the set [01] atomically, optionally.<br/>
  /// ○ Match if at the end of the string or if before an ending newline.<br/>
  /// </code>
  /// </remarks>
              {
                  int pos = base.runtextpos;
                  int matchStart = pos;
-                   int alternation_branch = 0;
-                   int alternation_starting_capturepos = 0;
-                   int alternation_starting_pos = 0;
                  int capture_starting_pos = 0;
                  int capture_starting_pos1 = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  }
                  
                  // "minor" capture group.
-                   //{
+                   {
                      pos++;
                      slice = inputSpan.Slice(pos);
                      capture_starting_pos1 = pos;
                      
-                       // Match with 2 alternative expressions.
-                       //{
-                           alternation_starting_pos = pos;
-                           alternation_starting_capturepos = base.Crawlpos();
-                           
-                           // Branch 0
-                           //{
-                               // Match a character in the set [01].
-                               if (slice.IsEmpty || !char.IsBetween(slice[0], '0', '1'))
-                               {
-                                   goto AlternationBranch;
-                               }
-                               
-                               alternation_branch = 0;
+                       // Match a character in the set [01] atomically, optionally.
+                       {
+                           if (!slice.IsEmpty && char.IsBetween(slice[0], '0', '1'))
+                           {
+                               slice = slice.Slice(1);
                              pos++;
-                               slice = inputSpan.Slice(pos);
-                               goto AlternationMatch;
-                               
-                               AlternationBranch:
-                               pos = alternation_starting_pos;
-                               slice = inputSpan.Slice(pos);
-                               UncaptureUntil(alternation_starting_capturepos);
-                           //}
-                           
-                           // Branch 1
-                           //{
-                               
-                               alternation_branch = 1;
-                               goto AlternationMatch;
-                           //}
-                           
-                           AlternationBacktrack:
-                           if (Utilities.s_hasTimeout)
-                           {
-                               base.CheckTimeout();
                          }
-                           
-                           switch (alternation_branch)
-                           {
-                               case 0:
-                                   goto AlternationBranch;
-                               case 1:
-                                   UncaptureUntil(0);
-                                   return false; // The input didn't match.
-                           }
-                           
-                           AlternationMatch:;
-                       //}
+                       }
                      
                      base.Capture(2, capture_starting_pos1, pos);
-                       
-                       goto CaptureSkipBacktrack;
-                       
-                       CaptureBacktrack:
-                       goto AlternationBacktrack;
-                       
-                       CaptureSkipBacktrack:;
-                   //}
+                   }
                  
                  // Match if at the end of the string or if before an ending newline.
                  if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n'))
                  {
-                       goto CaptureBacktrack;
+                       UncaptureUntil(0);
+                       return false; // The input didn't match.
                  }
                  
                  // The input matched.

For more diff examples, see https://gist.github.com/MihuBot/3bce07f425cf5dd2f9385e5952306c0c

JIT assembly changes
Total bytes of base: 53732389
Total bytes of diff: 53928517
Total bytes of delta: 196128 (0.37 % of base)
Total relative delta: 53.96
    diff is a regression.
    relative diff is a regression.

For a list of JIT diff regressions, see Regressions.md
For a list of JIT diff improvements, see Improvements.md

Sample source code for further analysis
const string JsonPath = "RegexResults-1316.json";
if (!File.Exists(JsonPath))
{
    await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2wTWV0");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");



record KnownPattern(string Pattern, RegexOptions Options, int Count);

sealed class RegexEntry
{
    public required KnownPattern Regex { get; set; }
    public required string MainSource { get; set; }
    public required string PrSource { get; set; }
    public string? FullDiff { get; set; }
    public string? ShortDiff { get; set; }
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

@stephentoub stephentoub merged commit 27cc30f into dotnet:main Jul 29, 2025
85 of 87 checks passed
@stephentoub stephentoub deleted the alternationoptional branch July 29, 2025 00:37
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants