diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 57ab539a01337f..e77904c35e7ed6 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -223,7 +223,7 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine(); writer.WriteLine(" // Description:"); - DescribeExpression(writer, rm.Code.Tree.Root.Child(0), " // "); // skip implicit root capture + DescribeExpression(writer, rm.Code.Tree.Root.Child(0), " // ", rm.Code); // skip implicit root capture writer.WriteLine(); writer.WriteLine($" protected override bool FindFirstChar()"); @@ -903,11 +903,11 @@ void EmitSwitchedBranches() sliceStaticPos = startingSliceStaticPos; RegexNode child = node.Child(i); - Debug.Assert(child.Type is RegexNode.One or RegexNode.Multi or RegexNode.Concatenate, DescribeNode(child)); + Debug.Assert(child.Type is RegexNode.One or RegexNode.Multi or RegexNode.Concatenate, DescribeNode(child, rm.Code)); Debug.Assert(child.Type is not RegexNode.Concatenate || (child.ChildCount() >= 2 && child.Child(0).Type is RegexNode.One or RegexNode.Multi)); RegexNode? childStart = child.FindBranchOneOrMultiStart(); - Debug.Assert(childStart is not null, DescribeNode(child)); + Debug.Assert(childStart is not null, DescribeNode(child, rm.Code)); writer.WriteLine($"case {Literal(childStart.FirstCharOfOneOrMulti())}:"); writer.Indent++; @@ -1140,7 +1140,7 @@ void EmitBackreference(RegexNode node) // in which case per ECMA 262 section 21.2.2.9 the backreference should succeed. 
if ((node.Options & RegexOptions.ECMAScript) != 0) { - writer.WriteLine($"// If the {DescribeNonNegative(node.M)} capture hasn't matched, the backreference matches with RegexOptions.ECMAScript rules."); + writer.WriteLine($"// If the {DescribeCapture(node.M, rm.Code)} hasn't matched, the backreference matches with RegexOptions.ECMAScript rules."); using (EmitBlock(writer, $"if (base.IsMatched({capnum}))")) { EmitWhenHasCapture(); @@ -1148,7 +1148,7 @@ void EmitBackreference(RegexNode node) } else { - writer.WriteLine($"// If the {DescribeNonNegative(node.M)} capture hasn't matched, the backreference doesn't match."); + writer.WriteLine($"// If the {DescribeCapture(node.M, rm.Code)} hasn't matched, the backreference doesn't match."); using (EmitBlock(writer, $"if (!base.IsMatched({capnum}))")) { writer.WriteLine($"goto {doneLabel};"); @@ -1225,7 +1225,7 @@ void EmitBackreferenceConditional(RegexNode node) { using (EmitBlock(writer, $"if (base.IsMatched({capnum}))")) { - writer.WriteLine($"// The {DescribeNonNegative(node.M)} capture group captured a value. Match the first branch."); + writer.WriteLine($"// The {DescribeCapture(node.M, rm.Code)} captured a value. Match the first branch."); EmitNode(yesBranch); writer.WriteLine(); TransferSliceStaticPosToPos(); // make sure sliceStaticPos is 0 after each branch @@ -1702,7 +1702,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck // Put the node's code into its own scope. If the node contains labels that may need to // be visible outside of its scope, the scope is still emitted for clarity but is commented out. 
- using (EmitScope(writer, DescribeNode(node), faux: PossiblyBacktracks(node) && !node.IsAtomicByParent())) + using (EmitScope(writer, DescribeNode(node, rm.Code), faux: PossiblyBacktracks(node) && !node.IsAtomicByParent())) { switch (node.Type) { @@ -1857,7 +1857,7 @@ void WriteSingleCharChild(RegexNode child, bool includeDescription = true) writer.Write("if ("); } EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true); - prevDescription = includeDescription ? DescribeNode(child) : null; + prevDescription = includeDescription ? DescribeNode(child, rm.Code) : null; wroteClauses = true; } @@ -3395,7 +3395,7 @@ private static string Literal(RegexOptions options) } /// Gets a textual description of the node fit for rendering in a comment in source. - private static string DescribeNode(RegexNode node) => + private static string DescribeNode(RegexNode node, RegexCode regexCode) => node.Type switch { RegexNode.Alternate => $"Match with {node.ChildCount()} alternative expressions{(node.IsAtomicByParent() ? ", atomically" : "")}.", @@ -3403,8 +3403,9 @@ private static string DescribeNode(RegexNode node) => RegexNode.Beginning => "Match if at the beginning of the string.", RegexNode.Bol => "Match if at the beginning of a line.", RegexNode.Boundary => $"Match if at a word boundary.", - RegexNode.Capture when node.N != -1 => $"{DescribeNonNegative(node.M)} capturing group. Uncaptures the {DescribeNonNegative(node.N)} capturing group.", - RegexNode.Capture when node.N == -1 => $"{DescribeNonNegative(node.M)} capturing group.", + RegexNode.Capture when node.M == -1 && node.N != -1 => $"Non-capturing balancing group. Uncaptures the {DescribeCapture(node.N, regexCode)}.", + RegexNode.Capture when node.N != -1 => $"Balancing group. 
Captures the {DescribeCapture(node.M, regexCode)} and uncaptures the {DescribeCapture(node.N, regexCode)}.", + RegexNode.Capture when node.N == -1 => $"{DescribeCapture(node.M, regexCode)}.", RegexNode.Concatenate => "Match a sequence of expressions.", RegexNode.ECMABoundary => $"Match if at a word boundary (according to ECMAScript rules).", RegexNode.Empty => $"Match an empty string.", @@ -3421,23 +3422,51 @@ private static string DescribeNode(RegexNode node) => RegexNode.One => $"Match {Literal(node.Ch)}.", RegexNode.Oneloop or RegexNode.Oneloopatomic or RegexNode.Onelazy => $"Match {Literal(node.Ch)} {DescribeLoop(node)}.", RegexNode.Prevent => $"Zero-width negative lookahead assertion.", - RegexNode.Ref => $"Match the same text as matched by the {DescribeNonNegative(node.M)} capture group.", + RegexNode.Ref => $"Match the same text as matched by the {DescribeCapture(node.M, regexCode)}.", RegexNode.Require => $"Zero-width positive lookahead assertion.", RegexNode.Set => $"Match a character in the set {RegexCharClass.SetDescription(node.Str!)}.", RegexNode.Setloop or RegexNode.Setloopatomic or RegexNode.Setlazy => $"Match a character in the set {RegexCharClass.SetDescription(node.Str!)} {DescribeLoop(node)}.", RegexNode.Start => "Match if at the start position.", RegexNode.Testgroup => $"Conditionally match one of two expressions depending on whether an initial expression matches.", - RegexNode.Testref => $"Conditionally match one of two expressions depending on whether the {DescribeNonNegative(node.M)} capture group matched.", + RegexNode.Testref => $"Conditionally match one of two expressions depending on whether the {DescribeCapture(node.M, regexCode)} matched.", RegexNode.UpdateBumpalong => $"Advance the next matching position.", _ => $"Unknown node type {node.Type}", }; + /// Gets an identifier to describe a capture group. 
+ private static string DescribeCapture(int capNum, RegexCode regexCode) + { + // If we can get a capture name from the captures collection and it's not just a numerical representation of the group, use it. + string name = RegexParser.GroupNameFromNumber(regexCode.Caps, regexCode.Tree.CapsList, regexCode.CapSize, capNum); + if (!string.IsNullOrEmpty(name) && + (!int.TryParse(name, out int id) || id != capNum)) + { + name = Literal(name); + } + else + { + // Otherwise, create a numerical description of the capture group. + int tens = capNum % 10; + name = tens is >= 1 and <= 3 && capNum % 100 is < 10 or > 20 ? // Ends in 1, 2, 3 but not 11, 12, or 13 + tens switch + { + 1 => $"{capNum}st", + 2 => $"{capNum}nd", + _ => $"{capNum}rd", + } : + $"{capNum}th"; + } + + return $"{name} capture group"; + } + /// Writes a textual description of the node tree fit for rending in source. /// The writer to which the description should be written. /// The node being written. /// The prefix to write at the beginning of every line, including a "//" for a comment. + /// The RegexCode whose capture tables are used to resolve capture group numbers to names. /// The depth of the current node. - private static void DescribeExpression(TextWriter writer, RegexNode node, string prefix, int depth = 0) + private static void DescribeExpression(TextWriter writer, RegexNode node, string prefix, RegexCode regexCode, int depth = 0) { bool skip = node.Type switch { @@ -3468,7 +3497,7 @@ RegexNode.Testref when node.Next.Child(1) == node => "Not Matched: ", // Write out the line for the node. const char BulletPoint = '\u25CB'; - writer.WriteLine($"{prefix}{new string(' ', depth * 4)}{BulletPoint} {tag}{DescribeNode(node)}"); + writer.WriteLine($"{prefix}{new string(' ', depth * 4)}{BulletPoint} {tag}{DescribeNode(node, regexCode)}"); } // Recur into each of its children. @@ -3476,27 +3505,8 @@ RegexNode.Testref when node.Next.Child(1) == node => "Not Matched: ", for (int i = 0; i < childCount; i++) { int childDepth = skip ? 
depth : depth + 1; - DescribeExpression(writer, node.Child(i), prefix, childDepth); - } - } - - /// Gets a textual description of a number, e.g. 3 => "3rd". - private static string DescribeNonNegative(int n) - { - if (n < 0) - { - return n.ToString(CultureInfo.InvariantCulture); + DescribeExpression(writer, node.Child(i), prefix, regexCode, childDepth); } - - int tens = n % 10; - return tens is >= 1 and <= 3 && n % 100 is < 10 or > 20 ? // Ends in 1, 2, 3 but not 11, 12, or 13 - tens switch - { - 1 => $"{n}st", - 2 => $"{n}nd", - _ => $"{n}rd", - } : - $"{n}th"; } /// Gets a textual description of a loop's style and bounds. diff --git a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj index 8e1ec70d99d6cc..56f7899f081db8 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj +++ b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj @@ -42,6 +42,7 @@ + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 284c8647f31f43..9d108abfb8f024 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -340,18 +340,7 @@ public int[] GetGroupNumbers() /// public string GroupNameFromNumber(int i) { - if (capslist is null) - { - return (uint)i < (uint)capsize ? - ((uint)i).ToString() : - string.Empty; - } - else - { - return caps != null && !caps.TryGetValue(i, out i) ? string.Empty : - (uint)i < (uint)capslist.Length ? 
capslist[i] : - string.Empty; - } + return RegexParser.GroupNameFromNumber(caps, capslist, capsize, i); } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 070acc36abcc4c..bcc83926945696 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -2346,5 +2346,27 @@ private RegexParseException MakeException(RegexParseError error, string message) /// Number of characters to the right of the current parsing position. private int CharsRight() => _pattern.Length - _currentPos; + + /// Gets the name of the capture group with the specified number, or an empty string when the number is out of range or is not found in caps. + internal static string GroupNameFromNumber(Hashtable? caps, string[]? capslist, int capsize, int i) + { + if (capslist is null) + { + if ((uint)i < (uint)capsize) + { + return ((uint)i).ToString(); + } + } + else + { + if ((caps is null || caps.TryGetValue(i, out i)) && + (uint)i < (uint)capslist.Length) + { + return capslist[i]; + } + } + + return string.Empty; + } } }