-
Notifications
You must be signed in to change notification settings - Fork 5.1k
Transform regex X|
into X?
#118087
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Transform regex X|
into X?
#118087
Conversation
An alternation with two branches where the second is empty is the same as the first branch just being an optional loop. The latter way of it being expressed is better optimized elsewhere in the regex transforms, e.g. with loop coalescing, so we're better off with the optional representation.
@MihuBot regexdiff |
Tagging subscribers to this area: @dotnet/area-system-text-regularexpressions |
471 out of 18857 patterns have generated source code changes. Examples of GeneratedRegex source diffs"\\r\\n|\\r" (823 uses)[GeneratedRegex("\\r\\n|\\r")] /// Explanation:<br/>
/// <code>
/// ○ Match '\r'.<br/>
- /// ○ Match with 2 alternative expressions, atomically.<br/>
- /// ○ Match '\n'.<br/>
- /// ○ Match an empty string.<br/>
+ /// ○ Match '\n' atomically, optionally.<br/>
/// </code>
/// </remarks>
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
return false; // The input didn't match.
}
- // Match with 2 alternative expressions, atomically.
+ // Match '\n' atomically, optionally.
{
- int alternation_starting_pos = pos;
-
- // Branch 0
+ if ((uint)slice.Length > (uint)1 && slice[1] == '\n')
{
- // Match '\n'.
- if ((uint)slice.Length < 2 || slice[1] != '\n')
- {
- goto AlternationBranch;
- }
-
- pos += 2;
- slice = inputSpan.Slice(pos);
- goto AlternationMatch;
-
- AlternationBranch:
- pos = alternation_starting_pos;
- slice = inputSpan.Slice(pos);
- }
-
- // Branch 1
- {
-
+ slice = slice.Slice(1);
pos++;
- slice = inputSpan.Slice(pos);
}
-
- AlternationMatch:;
}
// The input matched.
+ pos++;
base.runtextpos = pos;
base.Capture(0, matchStart, pos);
return true; "(\\b\\d+([\\.,]\\d*)?|\\b)(?<unit>or[ae]|hrs ..." (330 uses)[GeneratedRegex("(\\b\\d+([\\.,]\\d*)?|\\b)(?<unit>or[ae]|hrs|hr|h|minut[oi]|mins|min|second[oi]|secs|sec)\\b", RegexOptions.ExplicitCapture | RegexOptions.Singleline)] /// ○ Match a character in the set [ae].<br/>
/// ○ Match a sequence of expressions.<br/>
/// ○ Match 'h'.<br/>
- /// ○ Match with 2 alternative expressions.<br/>
- /// ○ Match a sequence of expressions.<br/>
- /// ○ Match 'r'.<br/>
- /// ○ Match with 2 alternative expressions.<br/>
- /// ○ Match 's'.<br/>
- /// ○ Match an empty string.<br/>
- /// ○ Match an empty string.<br/>
+ /// ○ Optional (greedy).<br/>
+ /// ○ Match 'r'.<br/>
+ /// ○ Match 's' greedily, optionally.<br/>
/// ○ Match a sequence of expressions.<br/>
/// ○ Match the string "min".<br/>
/// ○ Match with 3 alternative expressions.<br/>
int alternation_branch1 = 0;
int alternation_branch2 = 0;
int alternation_branch3 = 0;
- int alternation_branch4 = 0;
- int alternation_branch5 = 0;
int alternation_starting_capturepos = 0;
int alternation_starting_capturepos1 = 0;
int alternation_starting_capturepos2 = 0;
int alternation_starting_capturepos3 = 0;
- int alternation_starting_capturepos4 = 0;
- int alternation_starting_capturepos5 = 0;
int alternation_starting_pos = 0;
int alternation_starting_pos1 = 0;
int alternation_starting_pos2 = 0;
int alternation_starting_pos3 = 0;
- int alternation_starting_pos4 = 0;
- int alternation_starting_pos5 = 0;
int capture_starting_pos = 0;
int charloop_capture_pos = 0;
int charloop_starting_pos = 0, charloop_ending_pos = 0;
+ int charloop_starting_pos1 = 0, charloop_ending_pos1 = 0;
int loop_iteration = 0;
+ int loop_iteration1 = 0;
int stackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
goto AlternationBranch2;
}
- // Match with 2 alternative expressions.
+ // Optional (greedy).
//{
- alternation_starting_pos2 = pos;
- alternation_starting_capturepos2 = base.Crawlpos();
+ pos++;
+ slice = inputSpan.Slice(pos);
+ loop_iteration1 = 0;
- // Branch 0
- //{
- // Match 'r'.
- if ((uint)slice.Length < 2 || slice[1] != 'r')
- {
- goto AlternationBranch3;
- }
-
- // Match with 2 alternative expressions.
- //{
- alternation_starting_pos3 = pos;
- alternation_starting_capturepos3 = base.Crawlpos();
-
- // Branch 0
- //{
- // Match 's'.
- if ((uint)slice.Length < 3 || slice[2] != 's')
- {
- goto AlternationBranch4;
- }
-
- alternation_branch3 = 0;
- pos += 3;
- slice = inputSpan.Slice(pos);
- goto AlternationMatch3;
-
- AlternationBranch4:
- pos = alternation_starting_pos3;
- slice = inputSpan.Slice(pos);
- UncaptureUntil(alternation_starting_capturepos3);
- //}
-
- // Branch 1
- //{
-
- alternation_branch3 = 1;
- pos += 2;
- slice = inputSpan.Slice(pos);
- goto AlternationMatch3;
- //}
-
- AlternationBacktrack3:
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- switch (alternation_branch3)
- {
- case 0:
- goto AlternationBranch4;
- case 1:
- goto AlternationBranch3;
- }
-
- AlternationMatch3:;
- //}
-
- alternation_branch2 = 0;
- goto AlternationMatch2;
-
- AlternationBranch3:
- pos = alternation_starting_pos2;
- slice = inputSpan.Slice(pos);
- UncaptureUntil(alternation_starting_capturepos2);
- //}
+ LoopBody1:
+ Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), pos);
- // Branch 1
+ loop_iteration1++;
+
+ // Match 'r'.
+ if (slice.IsEmpty || slice[0] != 'r')
+ {
+ goto LoopIterationNoMatch1;
+ }
+
+ // Match 's' greedily, optionally.
//{
-
- alternation_branch2 = 1;
pos++;
slice = inputSpan.Slice(pos);
- goto AlternationMatch2;
+ charloop_starting_pos1 = pos;
+
+ if (!slice.IsEmpty && slice[0] == 's')
+ {
+ slice = slice.Slice(1);
+ pos++;
+ }
+
+ charloop_ending_pos1 = pos;
+ goto CharLoopEnd1;
+
+ CharLoopBacktrack1:
+ UncaptureUntil(base.runstack![--stackpos]);
+ Utilities.StackPop(base.runstack!, ref stackpos, out charloop_ending_pos1, out charloop_starting_pos1);
+
+ if (Utilities.s_hasTimeout)
+ {
+ base.CheckTimeout();
+ }
+
+ if (charloop_starting_pos1 >= charloop_ending_pos1)
+ {
+ goto LoopIterationNoMatch1;
+ }
+ pos = --charloop_ending_pos1;
+ slice = inputSpan.Slice(pos);
+
+ CharLoopEnd1:
+ Utilities.StackPush(ref base.runstack!, ref stackpos, charloop_starting_pos1, charloop_ending_pos1, base.Crawlpos());
//}
- AlternationBacktrack2:
+
+ // The loop has an upper bound of 1. Continue iterating greedily if it hasn't yet been reached.
+ if (loop_iteration1 == 0)
+ {
+ goto LoopBody1;
+ }
+ goto LoopEnd1;
+
+ // The loop iteration failed. Put state back to the way it was before the iteration.
+ LoopIterationNoMatch1:
+ if (--loop_iteration1 < 0)
+ {
+ // Unable to match the remainder of the expression after exhausting the loop.
+ goto AlternationBranch2;
+ }
+ pos = base.runstack![--stackpos];
+ UncaptureUntil(base.runstack![--stackpos]);
+ slice = inputSpan.Slice(pos);
+ goto LoopEnd1;
+
+ LoopBacktrack:
if (Utilities.s_hasTimeout)
{
base.CheckTimeout();
}
- switch (alternation_branch2)
+ if (loop_iteration1 == 0)
{
- case 0:
- goto AlternationBacktrack3;
- case 1:
- goto AlternationBranch2;
+ // No iterations of the loop remain to backtrack into. Fail the loop.
+ goto AlternationBranch2;
}
-
- AlternationMatch2:;
+ goto CharLoopBacktrack1;
+ LoopEnd1:;
//}
alternation_branch1 = 1;
// Match the string "min".
if (!slice.StartsWith("min"))
{
- goto AlternationBranch5;
+ goto AlternationBranch3;
}
// Match with 3 alternative expressions.
//{
- alternation_starting_pos4 = pos;
- alternation_starting_capturepos4 = base.Crawlpos();
+ alternation_starting_pos2 = pos;
+ alternation_starting_capturepos2 = base.Crawlpos();
// Branch 0
//{
!slice.Slice(3).StartsWith("ut") || // Match the string "ut".
(((ch = slice[5]) != 'i') & (ch != 'o'))) // Match a character in the set [io].
{
- goto AlternationBranch6;
+ goto AlternationBranch4;
}
- alternation_branch4 = 0;
+ alternation_branch2 = 0;
pos += 6;
slice = inputSpan.Slice(pos);
- goto AlternationMatch4;
+ goto AlternationMatch2;
- AlternationBranch6:
- pos = alternation_starting_pos4;
+ AlternationBranch4:
+ pos = alternation_starting_pos2;
slice = inputSpan.Slice(pos);
- UncaptureUntil(alternation_starting_capturepos4);
+ UncaptureUntil(alternation_starting_capturepos2);
//}
// Branch 1
// Match 's'.
if ((uint)slice.Length < 4 || slice[3] != 's')
{
- goto AlternationBranch7;
+ goto AlternationBranch5;
}
- alternation_branch4 = 1;
+ alternation_branch2 = 1;
pos += 4;
slice = inputSpan.Slice(pos);
- goto AlternationMatch4;
+ goto AlternationMatch2;
- AlternationBranch7:
- pos = alternation_starting_pos4;
+ AlternationBranch5:
+ pos = alternation_starting_pos2;
slice = inputSpan.Slice(pos);
- UncaptureUntil(alternation_starting_capturepos4);
+ UncaptureUntil(alternation_starting_capturepos2);
//}
// Branch 2
//{
- alternation_branch4 = 2;
+ alternation_branch2 = 2;
pos += 3;
slice = inputSpan.Slice(pos);
- goto AlternationMatch4;
+ goto AlternationMatch2;
//}
- AlternationBacktrack4:
+ AlternationBacktrack2:
if (Utilities.s_hasTimeout)
{
base.CheckTimeout();
}
- switch (alternation_branch4)
+ switch (alternation_branch2)
{
case 0:
- goto AlternationBranch6;
+ goto AlternationBranch4;
case 1:
- goto AlternationBranch7;
- case 2:
goto AlternationBranch5;
+ case 2:
+ goto AlternationBranch3;
}
- AlternationMatch4:;
+ AlternationMatch2:;
//}
alternation_branch1 = 2;
goto AlternationMatch1;
- AlternationBranch5:
+ AlternationBranch3:
pos = alternation_starting_pos1;
slice = inputSpan.Slice(pos);
UncaptureUntil(alternation_starting_capturepos1);
// Match with 3 alternative expressions.
//{
- alternation_starting_pos5 = pos;
- alternation_starting_capturepos5 = base.Crawlpos();
+ alternation_starting_pos3 = pos;
+ alternation_starting_capturepos3 = base.Crawlpos();
// Branch 0
//{
!slice.Slice(3).StartsWith("ond") || // Match the string "ond".
(((ch = slice[6]) != 'i') & (ch != 'o'))) // Match a character in the set [io].
{
- goto AlternationBranch8;
+ goto AlternationBranch6;
}
- alternation_branch5 = 0;
+ alternation_branch3 = 0;
pos += 7;
slice = inputSpan.Slice(pos);
- goto AlternationMatch5;
+ goto AlternationMatch3;
- AlternationBranch8:
- pos = alternation_starting_pos5;
+ AlternationBranch6:
+ pos = alternation_starting_pos3;
slice = inputSpan.Slice(pos);
- UncaptureUntil(alternation_starting_capturepos5);
+ UncaptureUntil(alternation_starting_capturepos3);
//}
// Branch 1
// Match 's'.
if ((uint)slice.Length < 4 || slice[3] != 's')
{
- goto AlternationBranch9;
+ goto AlternationBranch7;
}
- alternation_branch5 = 1;
+ alternation_branch3 = 1;
pos += 4;
slice = inputSpan.Slice(pos);
- goto AlternationMatch5;
+ goto AlternationMatch3;
- AlternationBranch9:
- pos = alternation_starting_pos5;
+ AlternationBranch7:
+ pos = alternation_starting_pos3;
slice = inputSpan.Slice(pos);
- UncaptureUntil(alternation_starting_capturepos5);
+ UncaptureUntil(alternation_starting_capturepos3);
//}
// Branch 2
//{
- alternation_branch5 = 2;
+ alternation_branch3 = 2;
pos += 3;
slice = inputSpan.Slice(pos);
- goto AlternationMatch5;
+ goto AlternationMatch3;
//}
- AlternationBacktrack5:
+ AlternationBacktrack3:
if (Utilities.s_hasTimeout)
{
base.CheckTimeout();
}
- switch (alternation_branch5)
+ switch (alternation_branch3)
{
case 0:
- goto AlternationBranch8;
+ goto AlternationBranch6;
case 1:
- goto AlternationBranch9;
+ goto AlternationBranch7;
case 2:
goto AlternationBacktrack;
}
- AlternationMatch5:;
+ AlternationMatch3:;
//}
alternation_branch1 = 3;
case 0:
goto AlternationBranch1;
case 1:
- goto AlternationBacktrack2;
+ goto LoopBacktrack;
case 2:
- goto AlternationBacktrack4;
+ goto AlternationBacktrack2;
case 3:
- goto AlternationBacktrack5;
+ goto AlternationBacktrack3;
}
AlternationMatch1:;
(WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
}
+ /// <summary>Pops 2 values from the backtracking stack.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
+ {
+ arg0 = stack[--pos];
+ arg1 = stack[--pos];
+ }
+
/// <summary>Pushes 2 values onto the backtracking stack.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1)
}
}
+ /// <summary>Pushes 3 values onto the backtracking stack.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
+ {
+ // If there's space available for all 3 values, store them.
+ int[] s = stack;
+ int p = pos;
+ if ((uint)(p + 2) < (uint)s.Length)
+ {
+ s[p] = arg0;
+ s[p + 1] = arg1;
+ s[p + 2] = arg2;
+ pos += 3;
+ return;
+ }
+
+ // Otherwise, resize the stack to make room and try again.
+ WithResize(ref stack, ref pos, arg0, arg1, arg2);
+
+ // <summary>Resize the backtracking stack array and push 3 values onto the stack.</summary>
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
+ {
+ Array.Resize(ref stack, (pos + 2) * 2);
+ StackPush(ref stack, ref pos, arg0, arg1, arg2);
+ }
+ }
+
/// <summary>Supports searching for characters in or not in "\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\t\n\v\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f !\"#$%&'()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefgijklnpqrtuvwxyz{|}~\u007f".</summary>
internal static readonly SearchValues<char> s_ascii_FFFFFFFFFFFF00FCFFFFFFFFFF5EF7FF = SearchValues.Create("\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\t\n\v\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f !\"#$%&'()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefgijklnpqrtuvwxyz{|}~\u007f");
} For more diff examples, see https://gist.github.com/MihuBot/a495e0c60796a8d6786b3298c93aa46a JIT assembly changes
For a list of JIT diff regressions, see Regressions.md Sample source code for further analysisconst string JsonPath = "RegexResults-1300.json";
if (!File.Exists(JsonPath))
{
await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2loz5g");
using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}
using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");
record KnownPattern(string Pattern, RegexOptions Options, int Count);
sealed class RegexEntry
{
public required KnownPattern Regex { get; set; }
public required string MainSource { get; set; }
public required string PrSource { get; set; }
public string? FullDiff { get; set; }
public string? ShortDiff { get; set; }
public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
} |
An alternation with two branches where the second is empty is the same as the first branch just being an optional loop. The latter way of it being expressed is better optimized elsewhere in the regex transforms, e.g. with loop coalescing, so we're better off with the optional representation.