diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 28d3e2850ca518..529e9d37e43033 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -98,6 +98,7 @@ internal abstract class RegexCompiler private static MethodInfo SpanSliceIntIntMethod => field ??= typeof(ReadOnlySpan).GetMethod("Slice", [typeof(int), typeof(int)])!; private static MethodInfo SpanStartsWithSpanMethod => field ??= typeof(MemoryExtensions).GetMethod("StartsWith", [typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0))])!.MakeGenericMethod(typeof(char)); private static MethodInfo SpanStartsWithSpanComparisonMethod => field ??= typeof(MemoryExtensions).GetMethod("StartsWith", [typeof(ReadOnlySpan), typeof(ReadOnlySpan), typeof(StringComparison)])!; + private static MethodInfo SpanSequenceEqualSpanMethod => field ??= typeof(MemoryExtensions).GetMethod("SequenceEqual", [typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0))])!.MakeGenericMethod(typeof(char)); private static MethodInfo StringAsSpanMethod => field ??= typeof(MemoryExtensions).GetMethod("AsSpan", [typeof(string)])!; private static MethodInfo StringGetCharsMethod => field ??= typeof(string).GetMethod("get_Chars", [typeof(int)])!; private static MethodInfo ArrayResizeMethod => field ??= typeof(Array).GetMethod("Resize")!.MakeGenericMethod(typeof(int)); @@ -2112,8 +2113,6 @@ void EmitBackreference(RegexNode node) BrfalseFar((node.Options & RegexOptions.ECMAScript) == 0 ? doneLabel : backreferenceEnd); using RentedLocalBuilder matchLength = RentInt32Local(); - using RentedLocalBuilder matchIndex = RentInt32Local(); - using RentedLocalBuilder i = RentInt32Local(); // int matchLength = base.MatchLength(capnum); Ldthis(); @@ -2135,64 +2134,68 @@ void EmitBackreference(RegexNode node) Ldloc(matchLength); BltFar(doneLabel); - // int matchIndex = base.MatchIndex(capnum); - Ldthis(); - Ldc(capnum); - Call(MatchIndexMethod); - Stloc(matchIndex); + if ((node.Options & RegexOptions.IgnoreCase) != 0) + { + // For case-insensitive, we need to compare character-by-character with case equivalence checks. + using RentedLocalBuilder matchIndex = RentInt32Local(); + using RentedLocalBuilder i = RentInt32Local(); - Label condition = DefineLabel(); - Label body = DefineLabel(); - Label charactersMatched = DefineLabel(); - LocalBuilder backreferenceCharacter = _ilg!.DeclareLocal(typeof(char)); - LocalBuilder currentCharacter = _ilg.DeclareLocal(typeof(char)); + // int matchIndex = base.MatchIndex(capnum); + Ldthis(); + Ldc(capnum); + Call(MatchIndexMethod); + Stloc(matchIndex); - // for (int i = 0; ...) - Ldc(0); - Stloc(i); - Br(condition); + Label condition = DefineLabel(); + Label body = DefineLabel(); + Label charactersMatched = DefineLabel(); + LocalBuilder backreferenceCharacter = _ilg!.DeclareLocal(typeof(char)); + LocalBuilder currentCharacter = _ilg.DeclareLocal(typeof(char)); - MarkLabel(body); + // for (int i = 0; ...) + Ldc(0); + Stloc(i); + Br(condition); - // char backreferenceChar = inputSpan[matchIndex + i]; - Ldloca(inputSpan); - Ldloc(matchIndex); - Ldloc(i); - Add(); - Call(SpanGetItemMethod); - LdindU2(); - Stloc(backreferenceCharacter); - if (!rtl) - { - // char currentChar = slice[i]; - Ldloca(slice); - Ldloc(i); - } - else - { - // char currentChar = inputSpan[pos - matchLength + i]; + MarkLabel(body); + + // char backreferenceChar = inputSpan[matchIndex + i]; Ldloca(inputSpan); - Ldloc(pos); - Ldloc(matchLength); - Sub(); + Ldloc(matchIndex); Ldloc(i); Add(); - } - Call(SpanGetItemMethod); - LdindU2(); - Stloc(currentCharacter); + Call(SpanGetItemMethod); + LdindU2(); + Stloc(backreferenceCharacter); + if (!rtl) + { + // char currentChar = slice[i]; + Ldloca(slice); + Ldloc(i); + } + else + { + // char currentChar = inputSpan[pos - matchLength + i]; + Ldloca(inputSpan); + Ldloc(pos); + Ldloc(matchLength); + Sub(); + Ldloc(i); + Add(); + } + Call(SpanGetItemMethod); + LdindU2(); + Stloc(currentCharacter); - if ((node.Options & RegexOptions.IgnoreCase) != 0) - { LocalBuilder caseEquivalences = DeclareReadOnlySpanChar(); - // if (backreferenceChar != currentChar) + // if (backreferenceChar == currentChar) goto charactersMatched; Ldloc(backreferenceCharacter); Ldloc(currentCharacter); Ceq(); BrtrueFar(charactersMatched); - // if (RegexCaseEquivalences.TryFindCaseEquivalencesForCharWithIBehavior(backreferenceChar, _culture, ref _caseBehavior, out ReadOnlySpan equivalences)) + // if (!RegexCaseEquivalences.TryFindCaseEquivalencesForCharWithIBehavior(backreferenceChar, _culture, ref _caseBehavior, out ReadOnlySpan equivalences)) goto doneLabel; Ldloc(backreferenceCharacter); Ldthisfld(CultureField); Ldthisflda(CaseBehaviorField); @@ -2200,53 +2203,66 @@ void EmitBackreference(RegexNode node) Call(RegexCaseEquivalencesTryFindCaseEquivalencesForCharWithIBehaviorMethod); BrfalseFar(doneLabel); - // if (equivalences.IndexOf(slice[i]) < 0) // Or if (equivalences.IndexOf(inputSpan[pos - matchLength + i]) < 0) when rtl + // if (equivalences.IndexOf(currentCharacter) < 0) goto doneLabel; Ldloc(caseEquivalences); + Ldloc(currentCharacter); + Call(SpanIndexOfCharMethod); + Ldc(0); + BltFar(doneLabel); + + MarkLabel(charactersMatched); + + // for (...; ...; i++) + Ldloc(i); + Ldc(1); + Add(); + Stloc(i); + + // for (...; i < matchLength; ...) + MarkLabel(condition); + Ldloc(i); + Ldloc(matchLength); + Blt(body); + } + else + { + // For case-sensitive, we can use SequenceEqual for efficient comparison. + // if (!inputSpan.Slice(base.MatchIndex(capnum), matchLength).SequenceEqual(slice.Slice(0, matchLength))) goto doneLabel; + // or for RTL: + // if (!inputSpan.Slice(base.MatchIndex(capnum), matchLength).SequenceEqual(inputSpan.Slice(pos - matchLength, matchLength))) goto doneLabel; + + // inputSpan.Slice(base.MatchIndex(capnum), matchLength) + Ldloca(inputSpan); + Ldthis(); + Ldc(capnum); + Call(MatchIndexMethod); + Ldloc(matchLength); + Call(SpanSliceIntIntMethod); + if (!rtl) { + // slice.Slice(0, matchLength) Ldloca(slice); - Ldloc(i); + Ldc(0); + Ldloc(matchLength); + Call(SpanSliceIntIntMethod); } else { + // inputSpan.Slice(pos - matchLength, matchLength) Ldloca(inputSpan); Ldloc(pos); Ldloc(matchLength); Sub(); - Ldloc(i); - Add(); + Ldloc(matchLength); + Call(SpanSliceIntIntMethod); } - Call(SpanGetItemMethod); - LdindU2(); - Call(SpanIndexOfCharMethod); - Ldc(0); - // return false; // input didn't match. - BltFar(doneLabel); - } - else - { - // if (backreferenceCharacter != currentCharacter) - Ldloc(backreferenceCharacter); - Ldloc(currentCharacter); - Ceq(); - // return false; // input didn't match. + + // .SequenceEqual(...) + Call(SpanSequenceEqualSpanMethod); BrfalseFar(doneLabel); } - MarkLabel(charactersMatched); - - // for (...; ...; i++) - Ldloc(i); - Ldc(1); - Add(); - Stloc(i); - - // for (...; i < matchLength; ...) - MarkLabel(condition); - Ldloc(i); - Ldloc(matchLength); - Blt(body); - // pos += matchLength; // or -= for rtl Ldloc(pos); Ldloc(matchLength); diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index bd62dafaa040c8..f605da6f546d70 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -1940,6 +1940,72 @@ public static IEnumerable Match_Advanced_TestData() } }; + // Backreferences with RightToLeft + // Note: For RTL, the pattern is processed right-to-left, so the group must come + // AFTER the backreference in the pattern (i.e., to the right of \1) + yield return new object[] + { + engine, + @"\1(\w)", "aa", RegexOptions.RightToLeft, 2, 2, + new CaptureData[] + { + new CaptureData("aa", 0, 2), + new CaptureData("a", 1, 1), + } + }; + yield return new object[] + { + engine, + @"\1(\w+)", "abcabc", RegexOptions.RightToLeft, 6, 6, + new CaptureData[] + { + new CaptureData("abcabc", 0, 6), + new CaptureData("abc", 3, 3), + } + }; + yield return new object[] + { + engine, + @"\1(\w)", "abba", RegexOptions.RightToLeft, 4, 4, + new CaptureData[] + { + new CaptureData("bb", 1, 2), + new CaptureData("b", 2, 1), + } + }; + + // Backreferences with RightToLeft and IgnoreCase + yield return new object[] + { + engine, + @"\1(\w)", "aA", RegexOptions.RightToLeft | RegexOptions.IgnoreCase, 2, 2, + new CaptureData[] + { + new CaptureData("aA", 0, 2), + new CaptureData("A", 1, 1), + } + }; + yield return new object[] + { + engine, + @"\1(\w+)", "abcABC", RegexOptions.RightToLeft | RegexOptions.IgnoreCase, 6, 6, + new CaptureData[] + { + new CaptureData("abcABC", 0, 6), + new CaptureData("ABC", 3, 3), + } + }; + yield return new object[] + { + engine, + @"\1(\w)", "aBBa", RegexOptions.RightToLeft | RegexOptions.IgnoreCase, 4, 4, + new CaptureData[] + { + new CaptureData("BB", 1, 2), + new CaptureData("B", 2, 1), + } + }; + // Actual - "(?<1>\\d+)abc(?(1)222|111)" yield return new object[] {