diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/Strings.resx b/src/libraries/System.Text.RegularExpressions/gen/Resources/Strings.resx
index 8c2c008f7b7806..4edd9a7dc7dbb7 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/Strings.resx
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/Strings.resx
@@ -303,4 +303,7 @@
Unterminated (?#...) comment.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.cs.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.cs.xlf
index 898a8b49d7fa00..5c933786208a19 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.cs.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.cs.xlf
@@ -192,6 +192,11 @@
Výsledek nelze volat pro shodu, která se nezdařila.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.Kolekce je jen pro čtení.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.de.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.de.xlf
index e20c53d2614c76..821165f07545cd 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.de.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.de.xlf
@@ -192,6 +192,11 @@
Das Ergebnis kann nicht für eine fehlgeschlagene Übereinstimmung aufgerufen werden.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.Die Sammlung ist schreibgeschützt.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.es.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.es.xlf
index 503face0253a4e..f0f919acc6e5bd 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.es.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.es.xlf
@@ -192,6 +192,11 @@
No se puede llamar al resultado si no se encuentra ninguna coincidencia.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.La colección es de sólo lectura.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.fr.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.fr.xlf
index efb0bb67f03002..0a9bb527192bd5 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.fr.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.fr.xlf
@@ -192,6 +192,11 @@
Le résultat ne peut pas être appelé sur un Match ayant échoué.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.La collection est en lecture seule.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.it.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.it.xlf
index b4b166fc3bd702..30d0b81bd4b420 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.it.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.it.xlf
@@ -192,6 +192,11 @@
Impossibile chiamare Result su un Match non riuscito.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.La raccolta è di sola lettura.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ja.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ja.xlf
index 8e392f02b4bb40..fe5540dc87d23a 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ja.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ja.xlf
@@ -192,6 +192,11 @@
失敗した Match で Result を呼び出すことはできません。
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.コレクションは読み取り専用です。
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ko.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ko.xlf
index 08578afc7a53a1..6654610f3f303c 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ko.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ko.xlf
@@ -192,6 +192,11 @@
실패한 Match에서 결과를 호출할 수 없습니다.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.읽기 전용 컬렉션입니다.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pl.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pl.xlf
index 474f5be3212a82..f84553cffe9b58 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pl.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pl.xlf
@@ -192,6 +192,11 @@
Nie można wywołać wyniku błędnego dopasowania.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.Kolekcja jest tylko do odczytu.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pt-BR.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pt-BR.xlf
index b3f2489ecf7236..ee284582bf53e2 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pt-BR.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.pt-BR.xlf
@@ -192,6 +192,11 @@
Não é possível chamar resultado quando há falha na correspondência.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.A coleção é somente leitura.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ru.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ru.xlf
index 7939800aa28a5b..99b880cc7b40ad 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ru.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.ru.xlf
@@ -192,6 +192,11 @@
Вызов результата невозможен при сбойном соответствии.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.Данная коллекция предназначена только для чтения.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.tr.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.tr.xlf
index 567b9974605a9e..bb6c8e1863a10c 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.tr.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.tr.xlf
@@ -192,6 +192,11 @@
Sonuç, başarısız Eşleştirmede çağrılamaz.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.Koleksiyon salt okunur.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hans.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hans.xlf
index 793005782d3ff3..a17a2102fd7d33 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hans.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hans.xlf
@@ -192,6 +192,11 @@
不能对失败的匹配调用结果。
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.集合是只读的。
diff --git a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hant.xlf b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hant.xlf
index b50bd83c543e43..6c552441ce0e9f 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hant.xlf
+++ b/src/libraries/System.Text.RegularExpressions/gen/Resources/xlf/Strings.zh-Hant.xlf
@@ -192,6 +192,11 @@
無法在已失敗的對應 (Match) 上呼叫結果。
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+ Collection is read-only.集合是唯讀的。
diff --git a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs
index c1fbfcad03a9d1..9c0062d5b810c8 100644
--- a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs
+++ b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs
@@ -255,6 +255,7 @@ public enum RegexOptions
RightToLeft = 64,
ECMAScript = 256,
CultureInvariant = 512,
+ NonBacktracking = 1024,
}
public enum RegexParseError
{
diff --git a/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx b/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx
index 9d3c5d417f9eb8..4f4fef7a861e7e 100644
--- a/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx
+++ b/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx
@@ -1,4 +1,5 @@
-
+
+
@@ -153,9 +154,6 @@
Collection is read-only.
-
- This operation is only allowed once per object.
-
This platform does not support writing compiled regular expressions to an assembly. Use RegexGeneratorAttribute with the regular expression source generator instead.
@@ -225,4 +223,37 @@
Unterminated (?#...) comment.
+
+ Regex replacements with substitutions of groups are not supported with RegexOptions.NonBacktracking.
+
+
+ RegexOptions.NonBacktracking is not supported in conjunction with RegexOptions.{0}.
+
+
+ RegexOptions.NonBacktracking is not supported in conjunction with expressions containing: '{0}'.
+
+
+ backreference (\ number)
+
+
+ captured group conditional (?( name ) yes-pattern | no-pattern ) or (?( number ) yes-pattern | no-pattern )
+
+
+ positive lookahead (?= pattern) or positive lookbehind (?<= pattern)
+
+
+ negative lookahead (?! pattern) or negative lookbehind (?<! pattern)
+
+
+ contiguous matches (\G)
+
+
+ atomic subexpressions (?> pattern)
+
+
+ test conditional (?( test-pattern ) yes-pattern | no-pattern )
+
+
+ balancing group (?<name1-name2>subexpression) or (?'name1-name2' subexpression)
+
diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index a8f8b7bd4f69f1..4da04ae480e510 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -5,9 +5,9 @@
enable
-
+
@@ -18,6 +18,7 @@
+
@@ -41,11 +42,52 @@
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -60,7 +102,7 @@
-
+
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs
index e5022d109f9557..b7f2f032e78a82 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs
@@ -121,7 +121,7 @@ public virtual string Result(string replacement)
}
// Gets the weakly cached replacement helper or creates one if there isn't one already.
- RegexReplacement repl = RegexReplacement.GetOrCreate(regex._replref!, replacement, regex.caps!, regex.capsize, regex.capnames!, regex.roptions);
+ RegexReplacement repl = RegexReplacement.GetOrCreate(regex.RegexReplacementWeakReference, replacement, regex.caps!, regex.capsize, regex.capnames!, regex.roptions);
SegmentStringBuilder segments = SegmentStringBuilder.Create();
repl.ReplacementImpl(ref segments, this);
return segments.ToString();
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Debug.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Debug.cs
new file mode 100644
index 00000000000000..865c5d096bad54
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Debug.cs
@@ -0,0 +1,68 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#if DEBUG
+using System.Diagnostics.CodeAnalysis;
+using System.IO;
+using System.Text.RegularExpressions.Symbolic;
+using System.Text.RegularExpressions.Symbolic.Unicode;
+
+namespace System.Text.RegularExpressions
+{
+ public partial class Regex
+ {
+ /// <summary>True if the regex has debugging enabled, i.e. RegexOptions.Debug is set in roptions.</summary>
+ [ExcludeFromCodeCoverage(Justification = "Debug only")]
+ internal bool IsDebug => (roptions & RegexOptions.Debug) != 0;
+
+ /// <summary>Unwind the regex and save the resulting state graph in DGML. Throws NotSupportedException when the factory is not a SymbolicRegexRunnerFactory.</summary>
+ /// <param name="bound">roughly the maximum number of states, 0 means no bound</param>
+ /// <param name="hideStateInfo">if true then hide state info</param>
+ /// <param name="addDotStar">if true then pretend that there is a .* at the beginning</param>
+ /// <param name="inReverse">if true then unwind the regex backwards (addDotStar is then ignored)</param>
+ /// <param name="onlyDFAinfo">if true then compute and save only general DFA info</param>
+ /// <param name="writer">dgml output is written here</param>
+ /// <param name="maxLabelLength">maximum length of labels in nodes; anything over that length is indicated with ..</param>
+ /// <param name="asNFA">if true creates NFA instead of DFA</param>
+ [ExcludeFromCodeCoverage(Justification = "Debug only")]
+ internal void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA)
+ {
+ if (factory is not SymbolicRegexRunnerFactory srmFactory) // only the symbolic factory exposes a matcher to dump
+ {
+ throw new NotSupportedException();
+ }
+
+ srmFactory._runner._matcher.SaveDGML(writer, bound, hideStateInfo, addDotStar, inReverse, onlyDFAinfo, maxLabelLength, asNFA);
+ }
+
+ /// <summary>
+ /// Generates two files IgnoreCaseRelation.cs and UnicodeCategoryRanges.cs for the namespace System.Text.RegularExpressions.Symbolic.Unicode
+ /// in the given directory path. Only available in DEBUG mode.
+ /// </summary>
+ [ExcludeFromCodeCoverage(Justification = "Debug only")]
+ internal static void GenerateUnicodeTables(string path)
+ {
+ IgnoreCaseRelationGenerator.Generate("System.Text.RegularExpressions.Symbolic.Unicode", "IgnoreCaseRelation", path);
+ UnicodeCategoryRangesGenerator.Generate("System.Text.RegularExpressions.Symbolic.Unicode", "UnicodeCategoryRanges", path);
+ }
+
+ /// <summary>
+ /// Generates up to k random strings matched by the regex. Throws NotSupportedException when the factory is not a SymbolicRegexRunnerFactory.
+ /// </summary>
+ /// <param name="k">upper bound on the number of generated strings</param>
+ /// <param name="randomseed">random seed for the generator, 0 means no random seed</param>
+ /// <param name="negative">if true then generate inputs that do not match</param>
+ /// <returns>the sequence of generated strings, as produced by the underlying symbolic matcher</returns>
+ [ExcludeFromCodeCoverage(Justification = "Debug only")]
+ internal Collections.Generic.IEnumerable<string> GenerateRandomMembers(int k, int randomseed, bool negative)
+ {
+ if (factory is not SymbolicRegexRunnerFactory srmFactory) // only the symbolic factory exposes a matcher to sample from
+ {
+ throw new NotSupportedException();
+ }
+
+ return srmFactory._runner._matcher.GenerateRandomMembers(k, randomseed, negative);
+ }
+ }
+}
+#endif
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs
index b3dfbcb4ac5749..1b048fc6875061 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs
@@ -77,7 +77,7 @@ public string Replace(string input, string replacement, int count, int startat)
// Gets the weakly cached replacement helper or creates one if there isn't one already,
// then uses it to perform the replace.
return
- RegexReplacement.GetOrCreate(_replref!, replacement, caps!, capsize, capnames!, roptions).
+ RegexReplacement.GetOrCreate(RegexReplacementWeakReference, replacement, caps!, capsize, capnames!, roptions).
Replace(this, input, count, startat);
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
index 69130ec6c03466..1fc73e49a2b5bc 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
@@ -9,6 +9,7 @@
using System.Reflection.Emit;
using System.Runtime.CompilerServices;
using System.Runtime.Serialization;
+using System.Text.RegularExpressions.Symbolic;
using System.Threading;
namespace System.Text.RegularExpressions
@@ -19,20 +20,19 @@ namespace System.Text.RegularExpressions
///
public partial class Regex : ISerializable
{
- internal const int MaxOptionShift = 10;
+ internal const int MaxOptionShift = 11;
protected internal string? pattern; // The string pattern provided
protected internal RegexOptions roptions; // the top-level options from the options string
- protected internal RegexRunnerFactory? factory;
+ protected internal RegexRunnerFactory? factory; // Factory used to create runner instances for executing the regex
protected internal Hashtable? caps; // if captures are sparse, this is the hashtable capnum->index
protected internal Hashtable? capnames; // if named captures are used, this maps names->index
protected internal string[]? capslist; // if captures are sparse or named captures are used, this is the sorted list of names
protected internal int capsize; // the size of the capture array
- internal WeakReference? _replref; // cached parsed replacement pattern
+ private WeakReference? _replref; // cached parsed replacement pattern
private volatile RegexRunner? _runner; // cached runner
private RegexCode? _code; // if interpreted, this is the code for RegexInterpreter
- private bool _refsInitialized;
protected Regex()
{
@@ -64,28 +64,40 @@ internal Regex(string pattern, CultureInfo? culture)
{
// Call Init directly rather than delegating to a Regex ctor that takes
// options to enable linking / tree shaking to remove the Regex compiler
- // if it may not be used.
+ // and NonBacktracking implementation if it's not used.
Init(pattern, RegexOptions.None, s_defaultMatchTimeout, culture);
}
internal Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo? culture)
{
+ culture ??= GetTargetCulture(options);
Init(pattern, options, matchTimeout, culture);
- // if the compile option is set, then compile the code
- if (RuntimeFeature.IsDynamicCodeCompiled && UseOptionC())
+ if ((options & RegexOptions.NonBacktracking) != 0)
{
- factory = Compile(pattern, _code!, options, matchTimeout != InfiniteMatchTimeout);
+ // If we're in non-backtracking mode, create the appropriate factory.
+ factory = SymbolicRegexRunner.CreateFactory(_code, options, matchTimeout, culture);
+ _code = null;
+ }
+ else if (RuntimeFeature.IsDynamicCodeCompiled && UseOptionC())
+ {
+ // If the compile option is set and compilation is supported, then compile the code.
+ factory = Compile(pattern, _code, options, matchTimeout != InfiniteMatchTimeout);
_code = null;
}
}
+ /// <summary>Gets the culture to use based on the specified options: CultureInfo.InvariantCulture when RegexOptions.CultureInvariant is set, otherwise CultureInfo.CurrentCulture.</summary>
+ private static CultureInfo GetTargetCulture(RegexOptions options) =>
+ (options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;
+
/// Initializes the instance.
///
/// This is separated out of the constructor so that an app only using 'new Regex(pattern)'
/// rather than 'new Regex(pattern, options)' can avoid statically referencing the Regex
/// compiler, such that a tree shaker / linker can trim it away if it's not otherwise used.
///
+ [MemberNotNull(nameof(_code))]
private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo? culture)
{
ValidatePattern(pattern);
@@ -93,8 +105,9 @@ private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, C
ValidateMatchTimeout(matchTimeout);
this.pattern = pattern;
- roptions = options;
internalMatchTimeout = matchTimeout;
+ roptions = options;
+ culture ??= GetTargetCulture(options);
#if DEBUG
if (IsDebug)
@@ -104,16 +117,27 @@ private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, C
#endif
// Parse the input
- RegexTree tree = RegexParser.Parse(pattern, roptions, culture ?? ((options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture));
+ RegexTree tree = RegexParser.Parse(pattern, roptions, culture);
- // Extract the relevant information
- capnames = tree.CapNames;
- capslist = tree.CapsList;
+ // Generate the RegexCode from the node tree. This is required for interpreting,
+ // and is used as input into RegexOptions.Compiled and RegexOptions.NonBacktracking.
_code = RegexWriter.Write(tree);
- caps = _code.Caps;
- capsize = _code.CapSize;
- InitializeReferences();
+ if ((options & RegexOptions.NonBacktracking) != 0)
+ {
+ // NonBacktracking doesn't support captures (other than the implicit top-level capture).
+ capnames = null;
+ capslist = null;
+ caps = null;
+ capsize = 1;
+ }
+ else
+ {
+ capnames = tree.CapNames;
+ capslist = tree.CapsList;
+ caps = _code.Caps;
+ capsize = _code.CapSize;
+ }
}
internal static void ValidatePattern(string pattern)
@@ -128,7 +152,7 @@ internal static void ValidateOptions(RegexOptions options)
{
if (((((uint)options) >> MaxOptionShift) != 0) ||
((options & RegexOptions.ECMAScript) != 0 &&
- (options & ~(RegexOptions.ECMAScript | RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled |
+ (options & ~(RegexOptions.ECMAScript | RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled | RegexOptions.NonBacktracking |
#if DEBUG
RegexOptions.Debug |
#endif
@@ -352,15 +376,17 @@ public int GroupNumberFromName(string name)
}
}
+ /// <summary>A weak reference to a regex replacement, lazily initialized on first access and stored in _replref.</summary>
+ internal WeakReference RegexReplacementWeakReference =>
+ _replref ?? // already created: return it
+ Interlocked.CompareExchange(ref _replref, new WeakReference(null), null) ?? // store a new one only if the field is still null; yields the previous value, which is non-null when another thread got there first
+ _replref; // previous value was null, so our instance was stored: read it back
+
protected void InitializeReferences()
{
- if (_refsInitialized)
- {
- ThrowHelper.ThrowNotSupportedException(ExceptionResource.OnlyAllowedOnce);
- }
-
- _replref = new WeakReference(null);
- _refsInitialized = true;
+ // This method no longer has anything to initialize. It continues to exist
+ // purely for API compat, as it was originally shipped as protected, with
+ // assemblies generated by Regex.CompileToAssembly calling it.
}
/// Internal worker called by the public APIs
@@ -375,7 +401,7 @@ protected void InitializeReferences()
ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.length, ExceptionResource.LengthNotNegative);
}
- RegexRunner runner = RentRunner();
+ RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner();
try
{
// Do the scan starting at the requested position
@@ -387,47 +413,33 @@ protected void InitializeReferences()
}
finally
{
- ReturnRunner(runner);
+ _runner = runner;
}
}
internal void Run(string input, int startat, ref TState state, MatchCallback callback, bool reuseMatchObject)
{
Debug.Assert((uint)startat <= (uint)input.Length);
- RegexRunner runner = RentRunner();
+ RegexRunner runner = Interlocked.Exchange(ref _runner, null) ?? CreateRunner();
try
{
- runner.Scan(this, input, startat, ref state, callback, reuseMatchObject, internalMatchTimeout);
+ runner.ScanInternal(this, input, startat, ref state, callback, reuseMatchObject, internalMatchTimeout);
}
finally
{
- ReturnRunner(runner);
+ _runner = runner;
}
}
- /// Gets a runner from the cache, or creates a new one.
- [MethodImpl(MethodImplOptions.AggressiveInlining)] // factored out to be used by only two call sites
- private RegexRunner RentRunner() =>
- Interlocked.Exchange(ref _runner, null) ?? // use a cached runner if there is one
- (factory != null ? factory.CreateInstance() : // use the compiled RegexRunner factory if there is one
- new RegexInterpreter(_code!, UseOptionInvariant() ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture));
-
- /// Release the runner back to the cache.
- internal void ReturnRunner(RegexRunner runner) => _runner = runner;
+ /// <summary>Creates a new runner instance: from the factory when one is set, otherwise a RegexInterpreter over _code.</summary>
+ private RegexRunner CreateRunner() =>
+ factory?.CreateInstance() ?? // use the factory's runner when a factory exists and yields one
+ new RegexInterpreter(_code!, GetTargetCulture(roptions)); // otherwise interpret _code with the culture selected by roptions
/// True if the option was set.
protected bool UseOptionC() => (roptions & RegexOptions.Compiled) != 0;
/// True if the option was set.
protected internal bool UseOptionR() => (roptions & RegexOptions.RightToLeft) != 0;
-
- /// True if the option was set.
- internal bool UseOptionInvariant() => (roptions & RegexOptions.CultureInvariant) != 0;
-
-#if DEBUG
- /// True if the regex has debugging enabled.
- [ExcludeFromCodeCoverage(Justification = "Debug only")]
- internal bool IsDebug => (roptions & RegexOptions.Debug) != 0;
-#endif
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index fb99b0b6300c0b..9e0afe80d01ac6 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -35,7 +35,7 @@ internal sealed partial class RegexCharClass
private const string NullCharString = "\0";
private const char NullChar = '\0';
- private const char LastChar = '\uFFFF';
+ internal const char LastChar = '\uFFFF';
private const short SpaceConst = 100;
private const short NotSpaceConst = -100;
@@ -1267,16 +1267,17 @@ private static RegexCharClass ParseRecursive(string charClass, int start)
///
/// Constructs the string representation of the class.
///
- public string ToStringClass()
+ public string ToStringClass(RegexOptions options = RegexOptions.None)
{
+ bool isNonBacktracking = (options & RegexOptions.NonBacktracking) != 0;
var vsb = new ValueStringBuilder(stackalloc char[256]);
- ToStringClass(ref vsb);
+ ToStringClass(isNonBacktracking, ref vsb);
return vsb.ToString();
}
- private void ToStringClass(ref ValueStringBuilder vsb)
+ private void ToStringClass(bool isNonBacktracking, ref ValueStringBuilder vsb)
{
- Canonicalize();
+ Canonicalize(isNonBacktracking);
int initialLength = vsb.Length;
int categoriesLength = _categories?.Length ?? 0;
@@ -1302,7 +1303,7 @@ private void ToStringClass(ref ValueStringBuilder vsb)
// Update the range length. The ValueStringBuilder may have already had some
// contents (if this is a subtactor), so we need to offset by the initial length.
- vsb[initialLength + SetLengthIndex] = (char)((vsb.Length - initialLength) - SetStartIndex);
+ vsb[initialLength + SetLengthIndex] = (char)(vsb.Length - initialLength - SetStartIndex);
// Append categories
if (categoriesLength != 0)
@@ -1314,13 +1315,13 @@ private void ToStringClass(ref ValueStringBuilder vsb)
}
// Append a subtractor if there is one.
- _subtractor?.ToStringClass(ref vsb);
+ _subtractor?.ToStringClass(isNonBacktracking, ref vsb);
}
///
/// Logic to reduce a character class to a unique, sorted form.
///
- private void Canonicalize()
+ private void Canonicalize(bool isNonBacktracking)
{
List? rangelist = _rangelist;
if (rangelist != null)
@@ -1376,7 +1377,10 @@ private void Canonicalize()
// If the class now represents a single negated character, but does so by including every
// other character, invert it to produce a normalized form recognized by IsSingletonInverse.
- if (!_negate && _subtractor is null && (_categories is null || _categories.Length == 0))
+ if (!isNonBacktracking && // do not produce the IsSingletonInverse transformation in NonBacktracking mode
+ !_negate &&
+ _subtractor is null &&
+ (_categories is null || _categories.Length == 0))
{
if (rangelist.Count == 2)
{
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index 4de1c21514b179..282483e8d90657 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -152,10 +152,7 @@ public RegexNode(int type, RegexOptions options, int m, int n)
N = n;
}
- public bool UseOptionR()
- {
- return (Options & RegexOptions.RightToLeft) != 0;
- }
+ public bool UseOptionR() => (Options & RegexOptions.RightToLeft) != 0;
public RegexNode ReverseLeft()
{
@@ -172,7 +169,7 @@ public RegexNode ReverseLeft()
///
private void MakeRep(int type, int min, int max)
{
- Type += (type - One);
+ Type += type - One;
M = min;
N = max;
}
@@ -384,6 +381,13 @@ private static void EliminateEndingBacktracking(RegexNode node, uint maxDepth)
return;
}
+ // RegexOptions.NonBacktracking doesn't support atomic groups, so when that option
+ // is set we don't want to create atomic groups where they weren't explicitly authored.
+ if ((node.Options & RegexOptions.NonBacktracking) != 0)
+ {
+ return;
+ }
+
// Walk the tree starting from the provided node.
while (true)
{
@@ -560,6 +564,13 @@ private RegexNode ReduceGroup()
///
private RegexNode ReduceAtomic()
{
+ // RegexOptions.NonBacktracking doesn't support atomic groups, so when that option
+ // is set we don't want to create atomic groups where they weren't explicitly authored.
+ if ((Options & RegexOptions.NonBacktracking) != 0)
+ {
+ return this;
+ }
+
Debug.Assert(Type == Atomic);
Debug.Assert(ChildCount() == 1);
@@ -967,7 +978,7 @@ void ReduceSingleLetterAndNestedAlternations()
}
prev.Type = Set;
- prev.Str = prevCharClass.ToStringClass();
+ prev.Str = prevCharClass.ToStringClass(Options);
}
else if (at.Type == Nothing)
{
@@ -1442,6 +1453,13 @@ static bool CanCombineCounts(int nodeMin, int nodeMax, int nextMin, int nextMax)
///
private void ReduceConcatenationWithAutoAtomic()
{
+ // RegexOptions.NonBacktracking doesn't support atomic groups, so when that option
+ // is set we don't want to create atomic groups where they weren't explicitly authored.
+ if ((Options & RegexOptions.NonBacktracking) != 0)
+ {
+ return;
+ }
+
Debug.Assert(Type == Concatenate);
Debug.Assert((Options & RegexOptions.RightToLeft) == 0);
Debug.Assert(Children is List);
@@ -1660,7 +1678,7 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, uint m
case End:
case EndZ when !RegexCharClass.CharInClass('\n', node.Str!):
case Eol when !RegexCharClass.CharInClass('\n', node.Str!):
- case Boundary when node.Str == RegexCharClass.WordClass || node.Str == RegexCharClass.DigitClass: // TODO: Expand these with a more inclusive overlap check that considers categories
+ case Boundary when node.Str == RegexCharClass.WordClass || node.Str == RegexCharClass.DigitClass:
case NonBoundary when node.Str == RegexCharClass.NotWordClass || node.Str == RegexCharClass.NotDigitClass:
case ECMABoundary when node.Str == RegexCharClass.ECMAWordClass || node.Str == RegexCharClass.ECMADigitClass:
case NonECMABoundary when node.Str == RegexCharClass.NotECMAWordClass || node.Str == RegexCharClass.NotDigitClass:
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs
index d39b2d557ab0e8..47b7eed807dc7b 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOptions.cs
@@ -11,20 +11,61 @@ namespace System.Text.RegularExpressions
#endif
enum RegexOptions
{
+ /// Use default behavior.
None = 0x0000,
+
+ /// Use case-insensitive matching.
IgnoreCase = 0x0001, // "i"
+
+ ///
+ /// Use multiline mode, where ^ and $ match the beginning and end of each line
+ /// (instead of the beginning and end of the input string).
+ ///
Multiline = 0x0002, // "m"
+
+ /// <summary>
+ /// Do not capture unnamed groups. The only valid captures are explicitly named
+ /// or numbered groups of the form (?&lt;name&gt; subexpression).
+ /// </summary>
ExplicitCapture = 0x0004, // "n"
+
+ /// Compile the regular expression to Microsoft intermediate language (MSIL).
Compiled = 0x0008, // "c"
+
+ ///
+ /// Use single-line mode, where the period (.) matches every character (instead of every character except \n).
+ ///
Singleline = 0x0010, // "s"
+
+ /// Exclude unescaped white space from the pattern, and enable comments after a number sign (#).
IgnorePatternWhitespace = 0x0020, // "x"
+
+ /// Change the search direction. Search moves from right to left instead of from left to right.
RightToLeft = 0x0040, // "r"
+
#if DEBUG
+ /// Enable Regex debugging.
Debug = 0x0080, // "d"
#endif
+
+ /// Enable ECMAScript-compliant behavior for the expression.
ECMAScript = 0x0100, // "e"
+
+ /// Ignore cultural differences in language.
CultureInvariant = 0x0200,
+ /// <summary>
+ /// Enable matching using an approach that avoids backtracking and guarantees linear-time processing
+ /// in the length of the input.
+ /// </summary>
+ /// <remarks>
+ /// Certain features aren't available when this option is set, including balancing groups,
+ /// backreferences, positive and negative lookaheads and lookbehinds, and atomic groups.
+ /// Capture groups are also ignored, such that the only capture available is that for
+ /// the top-level match.
+ /// </remarks>
+ NonBacktracking = 0x0400,
+
// RegexCompiler internally uses 0x80000000 for its own internal purposes.
// If such a value ever needs to be added publicly, RegexCompiler will need
// to be changed to avoid it.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
index 2e0ff1467eb65e..48f610caf303ba 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
@@ -62,17 +62,17 @@ private RegexParser(string pattern, RegexOptions options, CultureInfo culture, H
_capnames = capnames;
_optionsStack = new ValueListBuilder(optionSpan);
- _stack = default;
- _group = default;
- _alternation = default;
- _concatenation = default;
- _unit = default;
+ _stack = null;
+ _group = null;
+ _alternation = null;
+ _concatenation = null;
+ _unit = null;
_currentPos = 0;
- _autocap = default;
- _capcount = default;
- _captop = default;
- _capnumlist = default;
- _capnamelist = default;
+ _autocap = 0;
+ _capcount = 0;
+ _captop = 0;
+ _capnumlist = null;
+ _capnamelist = null;
_ignoreNextParen = false;
}
@@ -240,10 +240,7 @@ private void Reset(RegexOptions options)
_stack = null;
}
- public void Dispose()
- {
- _optionsStack.Dispose();
- }
+ public void Dispose() => _optionsStack.Dispose();
/*
* The main parsing function.
@@ -322,7 +319,7 @@ private RegexNode ScanRegex()
goto ContinueOuterScan;
case '[':
- AddUnitSet(ScanCharClass(UseOptionI(), scanOnly: false)!.ToStringClass());
+ AddUnitSet(ScanCharClass(UseOptionI(), scanOnly: false)!.ToStringClass(_options));
break;
case '(':
@@ -534,7 +531,19 @@ private RegexNode ScanReplacement()
{
if (RightCharMoveRight() == '$')
{
- AddUnitNode(ScanDollar());
+ RegexNode node = ScanDollar();
+
+ // NonBacktracking does not support capture groups, so any replacement patterns that refer to
+ // groups are unsupported. However, the replacement patterns that refer to the left/right portion
+ // or all of the input as well as referring to group 0 (i.e. the whole match) are supported.
+ if ((_options & RegexOptions.NonBacktracking) != 0 &&
+ node.Type == RegexNode.Ref &&
+ node.M is not (0 or RegexReplacement.LeftPortion or RegexReplacement.RightPortion or RegexReplacement.WholeString))
+ {
+ throw new NotSupportedException(SR.NotSupported_NonBacktrackingAndReplacementsWithSubstitutionsOfGroups);
+ }
+
+ AddUnitNode(node);
}
AddConcatenate();
@@ -1206,7 +1215,7 @@ private void ScanBlank()
cc.AddLowercase(_culture);
}
- return new RegexNode(RegexNode.Set, _options, cc.ToStringClass());
+ return new RegexNode(RegexNode.Set, _options, cc.ToStringClass(_options));
default:
return ScanBasicBackslash(scanOnly);
@@ -1390,6 +1399,7 @@ private RegexNode ScanDollar()
int capnum = -1;
int newcapnum = ch - '0';
MoveRight();
+ CheckUnsupportedNonBacktrackingNumericRef(newcapnum);
if (IsCaptureSlot(newcapnum))
{
capnum = newcapnum;
@@ -1407,6 +1417,7 @@ private RegexNode ScanDollar()
newcapnum = newcapnum * 10 + digit;
MoveRight();
+ CheckUnsupportedNonBacktrackingNumericRef(newcapnum);
if (IsCaptureSlot(newcapnum))
{
capnum = newcapnum;
@@ -1424,6 +1435,7 @@ private RegexNode ScanDollar()
int capnum = ScanDecimal();
if (!angled || CharsRight() > 0 && RightCharMoveRight() == '}')
{
+ CheckUnsupportedNonBacktrackingNumericRef(capnum);
if (IsCaptureSlot(capnum))
{
return new RegexNode(RegexNode.Ref, _options, capnum);
@@ -1434,9 +1446,19 @@ private RegexNode ScanDollar()
else if (angled && RegexCharClass.IsWordChar(ch))
{
string capname = ScanCapname();
- if (CharsRight() > 0 && RightCharMoveRight() == '}' && IsCaptureName(capname))
+ if (CharsRight() > 0 && RightCharMoveRight() == '}')
{
- return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname));
+ // Throw unconditionally for non-backtracking, even if not a valid capture name,
+ // as information to determine whether a name is valid or not isn't tracked.
+ if ((_options & RegexOptions.NonBacktracking) != 0)
+ {
+ throw new NotSupportedException(SR.NotSupported_NonBacktrackingAndReplacementsWithSubstitutionsOfGroups);
+ }
+
+ if (IsCaptureName(capname))
+ {
+ return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname));
+ }
}
}
else if (!angled)
@@ -1483,6 +1505,16 @@ private RegexNode ScanDollar()
return new RegexNode(RegexNode.One, _options, '$');
}
+ /// <summary>Throws <see cref="NotSupportedException"/> when a replacement pattern refers to a numbered group other than 0 while RegexOptions.NonBacktracking is set.</summary>
+ private void CheckUnsupportedNonBacktrackingNumericRef(int capnum)
+ {
+ // Group 0 (the whole match) is always allowed. Any other number is rejected here without consulting IsCaptureSlot, i.e. even if it is not a valid capture number.
+ if (capnum != 0 && (_options & RegexOptions.NonBacktracking) != 0)
+ {
+ throw new NotSupportedException(SR.NotSupported_NonBacktrackingAndReplacementsWithSubstitutionsOfGroups);
+ }
+ }
+
/*
* Scans a capture name: consumes word chars
*/
@@ -2271,22 +2303,13 @@ private void AddUnitNotone(char ch)
}
/// Sets the current unit to a single set node
- private void AddUnitSet(string cc)
- {
- _unit = new RegexNode(RegexNode.Set, _options, cc);
- }
+ private void AddUnitSet(string cc) => _unit = new RegexNode(RegexNode.Set, _options, cc);
/// Sets the current unit to a subtree
- private void AddUnitNode(RegexNode node)
- {
- _unit = node;
- }
+ private void AddUnitNode(RegexNode node) => _unit = node;
/// Sets the current unit to an assertion of the specified type
- private void AddUnitType(int type)
- {
- _unit = new RegexNode(type, _options);
- }
+ private void AddUnitType(int type) => _unit = new RegexNode(type, _options);
/// Finish the current group (in response to a ')' or end)
private void AddGroup()
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
index 3a27f5852df00a..814f05e5aeb119 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
@@ -106,12 +106,11 @@ public RegexReplacement(string rep, RegexNode concat, Hashtable _caps)
/// Either returns a weakly cached RegexReplacement helper or creates one and caches it.
///
///
- public static RegexReplacement GetOrCreate(WeakReference replRef, string replacement, Hashtable caps,
+ public static RegexReplacement GetOrCreate(WeakReference replRef, string replacement, Hashtable caps,
int capsize, Hashtable capnames, RegexOptions roptions)
{
- RegexReplacement? repl;
- if (!replRef.TryGetTarget(out repl) || !repl.Pattern.Equals(replacement))
+ if (!replRef.TryGetTarget(out RegexReplacement? repl) || !repl.Pattern.Equals(replacement))
{
repl = RegexParser.ParseReplacement(replacement, roptions, caps, capsize, capnames);
replRef.SetTarget(repl);
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
index 3006cbc26cf8d3..dbab82e7cfd4a2 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
@@ -57,6 +57,9 @@ public abstract class RegexRunner
protected internal Match? runmatch; // result object
protected internal Regex? runregex; // regex object
+ // TODO: Expose something as protected internal: https://github.com/dotnet/runtime/issues/59629
+ private protected bool quick; // false if match details matter, true if only the fact that match occurred matters
+
private int _timeout; // timeout in milliseconds (needed for actual)
private bool _ignoreTimeout;
private int _timeoutOccursAt;
@@ -68,7 +71,7 @@ public abstract class RegexRunner
private const int TimeoutCheckFrequency = 1000;
private int _timeoutChecksToSkip;
- protected internal RegexRunner() { }
+ protected RegexRunner() { }
///
/// Scans the string to find the first match. Uses the Match object
@@ -82,11 +85,13 @@ protected internal RegexRunner() { }
/// and we could use a separate method Skip() that will quickly scan past
/// any characters that we know can't match.
///
- protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) =>
+ protected Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) =>
Scan(regex, text, textbeg, textend, textstart, prevlen, quick, regex.MatchTimeout);
protected internal Match? Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout)
{
+ this.quick = quick;
+
// Handle timeout argument
_timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds
bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout;
@@ -218,8 +223,10 @@ protected internal RegexRunner() { }
/// This optionally repeatedly hands out the same Match instance, updated with new information.
/// should be set to false if the Match object is handed out to user code.
///
- internal void Scan(Regex regex, string text, int textstart, ref TState state, MatchCallback callback, bool reuseMatchObject, TimeSpan timeout)
+ internal void ScanInternal(Regex regex, string text, int textstart, ref TState state, MatchCallback callback, bool reuseMatchObject, TimeSpan timeout)
{
+ quick = false;
+
// Handle timeout argument
_timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds
bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerFactory.cs
index 5d3a84299c30ac..93860f20cdf477 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerFactory.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunnerFactory.cs
@@ -1,8 +1,6 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
-// This RegexRunnerFactory class is a base class for compiled regex code.
-
namespace System.Text.RegularExpressions
{
public abstract class RegexRunnerFactory
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs
new file mode 100644
index 00000000000000..60640e1741e8f5
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs
@@ -0,0 +1,501 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ ///
+ /// Represents nodes in a Binary Decision Diagram (BDD), which compactly represent sets of integers. All non-leaf
+ /// nodes have an Ordinal, which indicates the position of the bit the node relates to (0 for the least significant
+ /// bit), and two children, One and Zero, for the cases of the current bit being 1 or 0, respectively. An integer
+ /// belongs to the set represented by the BDD if the path from the root following the branches that correspond to
+ /// the bits of the integer leads to the True leaf. This class also supports multi-terminal BDDs (MTBDD), i.e. ones where
+ /// the leaves are something other than True or False, which are used for representing classifiers.
+ ///
+ internal sealed class BDD : IComparable
+ {
+ ///
+ /// The ordinal for the True special value.
+ ///
+ private const int TrueOrdinal = -2;
+
+ ///
+ /// The ordinal for the False special value.
+ ///
+ private const int FalseOrdinal = -1;
+
+ ///
+ /// The unique BDD leaf that represents the full set or true.
+ ///
+ public static readonly BDD True = new BDD(TrueOrdinal, null, null);
+
+ ///
+ /// The unique BDD leaf that represents the empty set or false.
+ ///
+ public static readonly BDD False = new BDD(FalseOrdinal, null, null);
+
+ ///
+ /// The encoding of the set for lower ordinals for the case when the current bit is 1.
+ /// The value is null iff IsLeaf is true.
+ ///
+ public readonly BDD? One;
+
+ ///
+ /// The encoding of the set for lower ordinals for the case when the current bit is 0.
+ /// The value is null iff IsLeaf is true.
+ ///
+ public readonly BDD? Zero;
+
+ ///
+ /// Ordinal of this bit if nonleaf else MTBDD terminal value when nonnegative
+ ///
+ public readonly int Ordinal;
+
+ ///
+ /// Preassigned hashcode value that respects equivalence: equivalent BDDs have equal hashcodes
+ ///
+ private readonly int _hashcode;
+
+ ///
+ /// Representation of False for serialization.
+ ///
+ private static readonly long[] s_falseRepresentation = new long[] { 0 };
+
+ ///
+ /// Representation of True for serialization.
+ ///
+ private static readonly long[] s_trueRepresentation = new long[] { 1 };
+
+ internal BDD(int ordinal, BDD? one, BDD? zero)
+ {
+ Ordinal = ordinal;
+ One = one;
+ Zero = zero;
+
+ // Cache the hash code once, at construction. It combines the ordinal with the hash codes
+ // of the two children, which were cached the same way, so it depends only on the
+ // structure of the BDD and not on the object identity of the BDD objects.
+ _hashcode = HashCode.Combine(Ordinal, One, Zero);
+ }
+
+ /// <summary>
+ /// True iff the node is a terminal, i.e. it has neither a One nor a Zero child.
+ /// True and False are terminals.
+ /// </summary>
+ [MemberNotNullWhen(false, nameof(One))]
+ [MemberNotNullWhen(false, nameof(Zero))]
+ public bool IsLeaf
+ {
+ get
+ {
+ if (One is not null)
+ {
+ Debug.Assert(Zero is not null);
+ return false;
+ }
+
+ Debug.Assert(Zero is null);
+ return true;
+ }
+ }
+
+ ///
+ /// True iff the BDD is True.
+ ///
+ public bool IsFull => this == True;
+
+ ///
+ /// True iff the BDD is False.
+ ///
+ public bool IsEmpty => this == False;
+
+ /// <summary>
+ /// Gets the lexicographically minimum bitvector in this BDD as a ulong.
+ /// The BDD must be nonempty.
+ /// </summary>
+ public ulong GetMin()
+ {
+ Debug.Assert(!IsEmpty);
+
+ if (IsFull)
+ return 0;
+
+ // Begin with every bit cleared; a bit is only turned on
+ // when the walk below has no other choice.
+ ulong min = 0;
+ BDD current = this;
+
+ // Descend until a leaf is reached, taking the zero branch whenever it is not False.
+ while (!current.IsLeaf)
+ {
+ if (!current.Zero.IsEmpty)
+ {
+ // Keeping this bit as 0 gives the smaller bitvector.
+ current = current.Zero;
+ continue;
+ }
+
+ // The zero branch is False, so this bit has to be 1. By the way BDDs are
+ // constructed, the one branch is not False in that case.
+ min |= 1UL << current.Ordinal;
+ current = current.One;
+ }
+
+ return min;
+ }
+
+ ///
+ /// O(1) operation that returns the precomputed hashcode.
+ ///
+ public override int GetHashCode() => _hashcode;
+
+ /// <summary>
+ /// Shallow equality: holds for the same object, or for two BDDs with equal ordinals whose One children are the same object and whose Zero children are the same object.
+ /// This equality is used in the _bddCache lookup.
+ /// </summary>
+ public override bool Equals(object? obj) =>
+ obj is BDD other &&
+ (ReferenceEquals(this, other) || (other.Ordinal == Ordinal && ReferenceEquals(other.One, One) && ReferenceEquals(other.Zero, Zero)));
+
+ /// <summary>
+ /// Returns a topologically sorted array of all the nodes (other than True or False) in this BDD
+ /// such that, all MTBDD leaves (other than True or False) appear first in the array
+ /// and all nonterminals with smaller ordinal appear before nodes with larger ordinal.
+ /// So this BDD itself (if different from True or False) appears last.
+ /// In the case of True or False returns the empty array.
+ /// </summary>
+ public BDD[] TopologicalSort()
+ {
+ if (IsFull || IsEmpty)
+ return Array.Empty<BDD>();
+
+ if (IsLeaf)
+ return new BDD[] { this };
+
+ // Order the nodes according to their ordinals into the nonterminals array
+ var nonterminals = new List<BDD>[Ordinal + 1];
+ var sorted = new List<BDD>();
+ var toVisit = new Stack<BDD>();
+ var visited = new HashSet<BDD>();
+
+ toVisit.Push(this);
+
+ while (toVisit.Count > 0)
+ {
+ BDD node = toVisit.Pop();
+ // True and False are not included in the result
+ if (node.IsFull || node.IsEmpty)
+ continue;
+
+ if (node.IsLeaf)
+ {
+ // MTBDD terminals can be directly added to the sorted nodes, since they have no children that
+ // would come first in the topological ordering.
+ sorted.Add(node);
+ }
+ else
+ {
+ // Non-terminals are grouped by their ordinal so that they can be sorted into a topological order.
+ (nonterminals[node.Ordinal] ??= new List<BDD>()).Add(node);
+
+ if (visited.Add(node.Zero))
+ toVisit.Push(node.Zero);
+
+ if (visited.Add(node.One))
+ toVisit.Push(node.One);
+ }
+ }
+
+ // Flush the grouped non-terminals into the sorted nodes from smallest to highest ordinal. The highest
+ // ordinal is guaranteed to have only one node, which places the root of the BDD at the end.
+ for (int i = 0; i < nonterminals.Length; i++)
+ {
+ if (nonterminals[i] is not null)
+ {
+ sorted.AddRange(nonterminals[i]);
+ }
+ }
+
+ return sorted.ToArray();
+ }
+
+ #region Serialization
+ /// <summary>
+ /// Serialize this BDD in a flat long array. The BDD may have at most 2^k ordinals and 2^n nodes, such that k+2n &lt; 64
+ /// BDD.False is represented by return value long[]{0}.
+ /// BDD.True is represented by return value long[]{1}.
+ /// Serializer uses more compacted representations when fewer bits are needed, which is reflected in the first
+ /// two numbers of the return value. MTBDD terminals are represented by their negated ordinal.
+ /// </summary>
+ public long[] Serialize()
+ {
+ if (IsEmpty)
+ return s_falseRepresentation;
+
+ if (IsFull)
+ return s_trueRepresentation;
+
+ if (IsLeaf)
+ return new long[] { 0, 0, -Ordinal };
+
+ BDD[] nodes = TopologicalSort();
+
+ Debug.Assert(nodes[nodes.Length - 1] == this);
+ Debug.Assert(nodes.Length <= (1 << 24));
+
+ // As few bits as possible are used for ordinals and node identifiers for compact serialization.
+ // Use at least a nibble (4 bits) to represent the ordinal and count how many are needed.
+ int ordinal_bits = 4;
+ while (Ordinal >= (1 << ordinal_bits))
+ {
+ ordinal_bits += 1;
+ }
+
+ // Use at least 2 bits to represent the node identifier and count how many are needed
+ int node_bits = 2;
+ while (nodes.Length >= (1 << node_bits))
+ {
+ node_bits += 1;
+ }
+
+ // Identifiers 0 and 1 denote False and True, which need no entry of their own, so slots 0 and 1 carry the two bit widths
+ long[] res = new long[nodes.Length + 2];
+ res[0] = ordinal_bits;
+ res[1] = node_bits;
+
+ //use the following bit layout
+ BitLayout(ordinal_bits, node_bits, out int zero_node_shift, out int one_node_shift, out int ordinal_shift);
+
+ // This BDD itself is not a leaf here (leaves returned above), but the sorted nodes may still
+ // contain MTBDD leaves. False and True get the fixed identifiers 0 and 1.
+ var idmap = new Dictionary<BDD, long>
+ {
+ [True] = 1,
+ [False] = 0
+ };
+
+ // Give all nodes ascending identifiers and produce their serializations into the result
+ for (int i = 0; i < nodes.Length; i++)
+ {
+ BDD node = nodes[i];
+ idmap[node] = i + 2;
+
+ if (node.IsLeaf)
+ {
+ // This is MTBDD leaf. Negating it should make it less than or equal to zero, as True and False are
+ // excluded here and MTBDD Ordinals are required to be non-negative.
+ res[i + 2] = -node.Ordinal;
+ }
+ else
+ {
+ // Combine ordinal and child identifiers according to the bit layout
+ long v = (((long)node.Ordinal) << ordinal_shift) | (idmap[node.One] << one_node_shift) | (idmap[node.Zero] << zero_node_shift);
+ Debug.Assert(v >= 0);
+ res[i + 2] = v; // children ids are well-defined due to the topological order of nodes
+ }
+ }
+ return res;
+ }
+
+ /// <summary>
+ /// Recreates a BDD from a long array that has been created using Serialize.
+ /// A single-element array yields False (for 0) or True (otherwise). Every other node is obtained through
+ /// algebra.GetOrCreateBDD, so the way nodes are created or shared is decided by that algebra.
+ /// NOTE(review): earlier documentation mentioned running under a lock on the algebra and accepting a null
+ /// algebra; this method neither locks nor null-checks the algebra, so confirm what callers rely on.
+ /// </summary>
+ public static BDD Deserialize(long[] arcs, BDDAlgebra algebra)
+ {
+ if (arcs.Length == 1)
+ {
+ return arcs[0] != 0 ? True : False;
+ }
+
+ // The first two values hold the number of bits used for ordinals and for node identifiers.
+ int ordinal_bits = (int)arcs[0];
+ int node_bits = (int)arcs[1];
+ int count = arcs.Length;
+
+ // Derive the field shifts and masks from those two widths.
+ BitLayout(ordinal_bits, node_bits, out int zero_node_shift, out int one_node_shift, out int ordinal_shift);
+ long ordinal_mask = (1 << ordinal_bits) - 1;
+ long node_mask = (1 << node_bits) - 1;
+
+ // Nodes are stored under their identifier; identifiers 0 and 1 are always False and True.
+ BDD[] nodes = new BDD[count];
+ nodes[0] = False;
+ nodes[1] = True;
+
+ for (int id = 2; id < count; id++)
+ {
+ long arc = arcs[id];
+ if (arc > 0)
+ {
+ // A positive value encodes a non-terminal: unpack its ordinal and child identifiers.
+ int ord = (int)((arc >> ordinal_shift) & ordinal_mask);
+ int oneId = (int)((arc >> one_node_shift) & node_mask);
+ int zeroId = (int)((arc >> zero_node_shift) & node_mask);
+
+ // Because the serialized nodes are in topological order, both children
+ // are guaranteed to have been created already.
+ nodes[id] = algebra.GetOrCreateBDD(ord, nodes[oneId], nodes[zeroId]);
+ continue;
+ }
+
+ // Anything else is an MTBDD leaf, whose ordinal was serialized negated.
+ nodes[id] = algebra.GetOrCreateBDD((int)-arc, null, null);
+ }
+
+ // The result is the final BDD in the nodes array.
+ return nodes[count - 1];
+ }
+
+ /// <summary>
+ /// Computes the shift of each field in a serialized non-terminal from the two bit widths.
+ /// </summary>
+ private static void BitLayout(int ordinal_bits, int node_bits, out int zero_node_shift, out int one_node_shift, out int ordinal_shift)
+ {
+ // Fields from least to most significant: ordinal, one-child id, zero-child id (this layout seems to work best).
+ ordinal_shift = 0;
+ one_node_shift = ordinal_bits;
+ zero_node_shift = ordinal_bits + node_bits;
+ }
+ #endregion
+
+ /// <summary>
+ /// Finds the terminal for the input in a Multi-Terminal-BDD. Bits of the input determine the path: at a node with ordinal i, bit i selects One when set and Zero when clear.
+ /// Returns -1 if False is reached and -2 if True is reached, else returns the MTBDD terminal number that is reached. If this is a nonterminal, Find does not care about input bits &gt; Ordinal.
+ /// </summary>
+ public int Find(int input)
+ {
+ BDD node = this;
+ while (!node.IsLeaf)
+ node = (input & (1 << node.Ordinal)) == 0 ? node.Zero : node.One;
+ return node.Ordinal;
+ }
+
+ /// <summary>
+ /// Finds the terminal for the input in a Multi-Terminal-BDD. Bits of the input determine the path: at a node with ordinal i, bit i selects One when set and Zero when clear.
+ /// Returns -1 if False is reached and -2 if True is reached, else returns the MTBDD terminal number that is reached. If this is a nonterminal, Find does not care about input bits &gt; Ordinal.
+ /// </summary>
+ public int Find(ulong input)
+ {
+ BDD node = this;
+ while (!node.IsLeaf)
+ node = (input & (1UL << node.Ordinal)) == 0 ? node.Zero : node.One;
+ return node.Ordinal;
+ }
+
+ /// <summary>
+ /// Assumes BDD is not MTBDD and returns true iff it contains the input.
+ /// (Otherwise use BDD.Find if this is in fact a MTBDD.)
+ /// </summary>
+ public bool Contains(int input) => Find(input) == TrueOrdinal; //-2 is the Ordinal of BDD.True
+
+ /// <summary>
+ /// Returns true if the only other terminal besides False is a MTBDD terminal that is different from True.
+ /// If this is the case, outputs that terminal.
+ /// </summary>
+ public bool IsEssentiallyBoolean([NotNullWhen(true)] out BDD? terminalActingAsTrue)
+ {
+ if (IsFull || IsEmpty)
+ {
+ terminalActingAsTrue = null;
+ return false;
+ }
+
+ if (IsLeaf)
+ {
+ terminalActingAsTrue = this;
+ return true;
+ }
+
+ var toVisit = new Stack<BDD>();
+ var visited = new HashSet<BDD>();
+
+ toVisit.Push(this);
+
+ // this will hold the unique MTBDD leaf
+ BDD? leaf = null;
+
+ while (toVisit.Count > 0)
+ {
+ BDD node = toVisit.Pop();
+ if (node.IsEmpty)
+ continue;
+
+ if (node.IsFull)
+ {
+ //contains the True leaf
+ terminalActingAsTrue = null;
+ return false;
+ }
+
+ if (node.IsLeaf)
+ {
+ if (leaf is null)
+ {
+ // remember the first MTBDD leaf seen
+ leaf = node;
+ }
+ else if (leaf != node)
+ {
+ // found two different MTBDD leaves
+ terminalActingAsTrue = null;
+ return false;
+ }
+ }
+ else
+ {
+ if (visited.Add(node.Zero))
+ toVisit.Push(node.Zero);
+
+ if (visited.Add(node.One))
+ toVisit.Push(node.One);
+ }
+ }
+
+ Debug.Assert(leaf is not null, "this should never happen because there must exist another leaf besides False");
+ // found an MTBDD leaf and didn't find any other (non-False) leaves
+ terminalActingAsTrue = leaf;
+ return true;
+ }
+
+ /// <summary>
+ /// All terminals precede all nonterminals. Terminals are compared by their Ordinals.
+ /// Non-terminals are compared by their minimal elements (see GetMin) and, when those
+ /// are the same, by their Ordinals.
+ /// This provides a total order for terminals.
+ /// </summary>
+ public int CompareTo(object? obj)
+ {
+ // Anything that is not a BDD (including null) is reported as following this instance.
+ if (obj is not BDD other)
+ {
+ return -1;
+ }
+
+ // A terminal precedes every non-terminal; two terminals are ordered by ordinal.
+ if (IsLeaf)
+ {
+ return other.IsLeaf ? Ordinal.CompareTo(other.Ordinal) : -1;
+ }
+
+ // This is a non-terminal, so it follows any terminal.
+ if (other.IsLeaf)
+ {
+ return 1;
+ }
+
+ // Two non-terminals: order by minimal element first and
+ // fall back to the ordinals when the minimal elements tie.
+ int byMin = GetMin().CompareTo(other.GetMin());
+
+ return byMin != 0 ? byMin : Ordinal.CompareTo(other.Ordinal);
+ }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDDAlgebra.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDDAlgebra.cs
new file mode 100644
index 00000000000000..5096efe35ce6df
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDDAlgebra.cs
@@ -0,0 +1,511 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Diagnostics;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// <summary>
+ /// Boolean operations over BDDs.
+ /// </summary>
+ internal enum BoolOp
+ {
+ // Union of two BDDs.
+ Or,
+ // Intersection of two BDDs.
+ And,
+ // Exclusive or of two BDDs.
+ Xor,
+ // Complement; the only operation here that takes a single operand.
+ Not
+ }
+
+ ///
+ /// Boolean algebra for Binary Decision Diagrams. Boolean operations on BDDs are cached for efficiency. The
+ /// IBooleanAlgebra interface implemented by this class is thread safe.
+ /// TBD: policy for clearing/reducing the caches when they grow too large.
+ /// Ultimately, the caches are crucial for efficiency, not for correctness.
+ ///
+ internal abstract class BDDAlgebra : IBooleanAlgebra
+ {
+ ///
+ /// Operation cache for Boolean operations over BDDs.
+ ///
+ private readonly ConcurrentDictionary<(BoolOp op, BDD a, BDD? b), BDD> _opCache = new();
+
+ ///
+ /// Internalize the creation of BDDs so that two BDDs with same ordinal and identical children are the same object.
+ /// The algorithms do not rely on 100% internalization
+ /// (they could but this would make it difficult (or near impossible) to clear caches.
+ /// Allowing distinct but equivalent BDDs is also a tradeoff between efficiency and flexibility.
+ ///
+ private readonly ConcurrentDictionary<(int ordinal, BDD? one, BDD? zero), BDD> _bddCache = new();
+
+        /// <summary>
+        /// Generator for minterms.
+        /// </summary>
+        private readonly MintermGenerator<BDD> _mintermGen;
+
+        /// <summary>
+        /// Construct a solver for BDDs. The minterm generator is bound to this algebra instance.
+        /// </summary>
+        public BDDAlgebra() => _mintermGen = new MintermGenerator<BDD>(this);
+
+        /// <summary>
+        /// Returns the BDD node with the given ordinal and children, taking it from the node cache
+        /// when an identical node was requested before and creating and caching it otherwise.
+        /// </summary>
+        /// <param name="ordinal">ordinal of the node</param>
+        /// <param name="one">the one child (null when a leaf is requested)</param>
+        /// <param name="zero">the zero child (null when a leaf is requested)</param>
+        public BDD GetOrCreateBDD(int ordinal, BDD? one, BDD? zero)
+        {
+            (int ordinal, BDD? one, BDD? zero) cacheKey = (ordinal, one, zero);
+            return _bddCache.GetOrAdd(cacheKey, static k => new BDD(k.ordinal, k.one, k.zero));
+        }
+
+ #region IBooleanAlgebra members
+
+        /// <summary>
+        /// Computes the union of a and b.
+        /// </summary>
+        public BDD Or(BDD a, BDD b)
+        {
+            return ApplyBinaryOp(BoolOp.Or, a, b);
+        }
+
+        /// <summary>
+        /// Computes the intersection of a and b.
+        /// </summary>
+        public BDD And(BDD a, BDD b)
+        {
+            return ApplyBinaryOp(BoolOp.And, a, b);
+        }
+
+        /// <summary>
+        /// Computes the complement of a. The complements of True and False are returned directly;
+        /// every other result is memoized in the operation cache.
+        /// </summary>
+        public BDD Not(BDD a)
+        {
+            if (a == False)
+            {
+                return True;
+            }
+
+            if (a == True)
+            {
+                return False;
+            }
+
+            return _opCache.GetOrAdd((BoolOp.Not, a, null), static (key, algebra) =>
+            {
+                BDD operand = key.a;
+                if (operand.IsLeaf)
+                {
+                    // multi-terminal case: the derived algebra decides what the negated terminal is
+                    int terminal = algebra.CombineTerminals(BoolOp.Not, operand.Ordinal, 0);
+                    return algebra.GetOrCreateBDD(terminal, null, null);
+                }
+
+                // complement both children and keep the ordinal
+                return algebra.GetOrCreateBDD(operand.Ordinal, algebra.Not(operand.One), algebra.Not(operand.Zero));
+            }, this);
+        }
+
+        /// <summary>
+        /// Applies the binary Boolean operation op and constructs the BDD recursively from a and b.
+        /// Trivial cases are answered directly; all other results are memoized in the operation cache.
+        /// </summary>
+        /// <param name="op">given binary Boolean operation (Or, And or Xor)</param>
+        /// <param name="a">first BDD</param>
+        /// <param name="b">second BDD</param>
+        /// <returns>the BDD representing (a op b)</returns>
+        private BDD ApplyBinaryOp(BoolOp op, BDD a, BDD b)
+        {
+            // Base cases: one of a or b is True or False, or a and b are the same object
+            switch (op)
+            {
+                case BoolOp.Or:
+                    if (a == False)
+                        return b;
+                    if (b == False)
+                        return a;
+                    if (a == True || b == True)
+                        return True;
+                    if (a == b)
+                        return a;
+                    break;
+
+                case BoolOp.And:
+                    if (a == True)
+                        return b;
+                    if (b == True)
+                        return a;
+                    if (a == False || b == False)
+                        return False;
+                    if (a == b)
+                        return a;
+                    break;
+
+                case BoolOp.Xor:
+                    if (a == False)
+                        return b;
+                    if (b == False)
+                        return a;
+                    if (a == b)
+                        return False;
+                    if (a == True)
+                        return Not(b);
+                    if (b == True)
+                        return Not(a);
+                    break;
+
+                default:
+                    Debug.Fail("Unhandled binary BoolOp case");
+                    break;
+            }
+
+            // Order operands by hash code to increase cache hits (all three operations are commutative)
+            if (a.GetHashCode() > b.GetHashCode())
+            {
+                (a, b) = (b, a);
+            }
+
+            return _opCache.GetOrAdd((op, a, b), static (key, algebra) =>
+            {
+                Debug.Assert(key.b is not null, "Validated it was non-null prior to calling GetOrAdd");
+
+                BDD left = key.a;
+                BDD right = key.b;
+
+                if (left.IsLeaf && right.IsLeaf)
+                {
+                    // Multi-terminal case, we know here that neither operand is True or False
+                    int ord = algebra.CombineTerminals(key.op, left.Ordinal, right.Ordinal);
+                    return algebra.GetOrCreateBDD(ord, null, null);
+                }
+
+                // Recurse on the right operand only when it is a real decision node. A leaf carries a
+                // terminal value in Ordinal, which must not be compared against a variable ordinal:
+                // doing so could select this branch for a leaf and dereference its null children.
+                if (left.IsLeaf || (!right.IsLeaf && right.Ordinal > left.Ordinal))
+                {
+                    Debug.Assert(!right.IsLeaf);
+                    BDD t = algebra.ApplyBinaryOp(key.op, left, right.One);
+                    BDD f = algebra.ApplyBinaryOp(key.op, left, right.Zero);
+                    return t == f ? t : algebra.GetOrCreateBDD(right.Ordinal, t, f);
+                }
+
+                if (right.IsLeaf || left.Ordinal > right.Ordinal)
+                {
+                    Debug.Assert(!left.IsLeaf);
+                    BDD t = algebra.ApplyBinaryOp(key.op, left.One, right);
+                    BDD f = algebra.ApplyBinaryOp(key.op, left.Zero, right);
+                    return t == f ? t : algebra.GetOrCreateBDD(left.Ordinal, t, f);
+                }
+
+                {
+                    // both operands are decision nodes for the same ordinal
+                    Debug.Assert(!left.IsLeaf);
+                    Debug.Assert(!right.IsLeaf);
+                    BDD t = algebra.ApplyBinaryOp(key.op, left.One, right.One);
+                    BDD f = algebra.ApplyBinaryOp(key.op, left.Zero, right.Zero);
+                    return t == f ? t : algebra.GetOrCreateBDD(left.Ordinal, t, f);
+                }
+            }, this);
+        }
+
+        /// <summary>
+        /// Intersect all sets in the enumeration. An empty enumeration yields True.
+        /// </summary>
+        /// <param name="sets">the BDDs to intersect</param>
+        public BDD And(IEnumerable<BDD> sets)
+        {
+            BDD res = True;
+            foreach (BDD bdd in sets)
+            {
+                res = And(res, bdd);
+            }
+            return res;
+        }
+
+        /// <summary>
+        /// Take the union of all sets in the enumeration. An empty enumeration yields False.
+        /// </summary>
+        /// <param name="sets">the BDDs to union</param>
+        public BDD Or(IEnumerable<BDD> sets)
+        {
+            BDD res = False;
+            foreach (BDD bdd in sets)
+            {
+                res = Or(res, bdd);
+            }
+            return res;
+        }
+
+ /// <summary>
+ /// Gets the full set.
+ /// </summary>
+ public BDD True => BDD.True;
+
+ /// <summary>
+ /// Gets the empty set.
+ /// </summary>
+ public BDD False => BDD.False;
+
+ /// <summary>
+ /// Returns true if the set is nonempty, i.e. it is not the False BDD.
+ /// </summary>
+ public bool IsSatisfiable(BDD set) => set != False;
+
+ /// <summary>
+ /// Returns true if a and b represent equivalent BDDs, decided by checking that their XOR is the False BDD.
+ /// </summary>
+ public bool AreEquivalent(BDD a, BDD b) => Xor(a, b) == False;
+
+ #endregion
+
+        /// <summary>
+        /// Computes the exclusive or of a and b.
+        /// </summary>
+        internal BDD Xor(BDD a, BDD b)
+        {
+            return ApplyBinaryOp(BoolOp.Xor, a, b);
+        }
+
+ #region bit-shift operations
+
+        /// <summary>
+        /// Shift all elements k bits to the right.
+        /// For example if k=1 and set denotes {*0000,*1110,*1111} then
+        /// ShiftRight(set) denotes {*000,*111} where * denotes any prefix of 0's or 1's.
+        /// Leaves are returned unchanged.
+        /// </summary>
+        public BDD ShiftRight(BDD set, int k)
+        {
+            Debug.Assert(k >= 0);
+
+            if (set.IsLeaf)
+            {
+                return set;
+            }
+
+            // a right shift is a left shift by the negated amount
+            var shiftCache = new Dictionary<(BDD set, int k), BDD>();
+            return ShiftLeftImpl(shiftCache, set, 0 - k);
+        }
+
+        /// <summary>
+        /// Shift all elements k bits to the left.
+        /// For example if k=1 and set denotes {*0000,*1111} then
+        /// ShiftLeft(set) denotes {*00000,*00001,*11110,*11111} where * denotes any prefix of 0's or 1's.
+        /// Leaves are returned unchanged.
+        /// </summary>
+        public BDD ShiftLeft(BDD set, int k)
+        {
+            Debug.Assert(k >= 0);
+
+            if (set.IsLeaf)
+            {
+                return set;
+            }
+
+            var shiftCache = new Dictionary<(BDD set, int k), BDD>();
+            return ShiftLeftImpl(shiftCache, set, k);
+        }
+
+        /// <summary>
+        /// Recursive worker for the shift operations; a negative k shifts to the right.
+        /// Uses shiftCache to avoid recomputations in shared BDDs (which are DAGs).
+        /// </summary>
+        private BDD ShiftLeftImpl(Dictionary<(BDD set, int k), BDD> shiftCache, BDD set, int k)
+        {
+            if (set.IsLeaf || k == 0)
+            {
+                return set;
+            }
+
+            int ordinal = set.Ordinal + k;
+            if (ordinal < 0)
+            {
+                // this arises if k is negative: the node's bit is shifted out entirely
+                return True;
+            }
+
+            (BDD set, int k) cacheKey = (set, k);
+            if (shiftCache.TryGetValue(cacheKey, out BDD? cached))
+            {
+                return cached;
+            }
+
+            BDD zero = ShiftLeftImpl(shiftCache, set.Zero, k);
+            BDD one = ShiftLeftImpl(shiftCache, set.One, k);
+
+            // a node whose children coincide is redundant and collapses to that child
+            BDD shifted = zero == one ? zero : GetOrCreateBDD((ushort)ordinal, one, zero);
+            shiftCache[cacheKey] = shifted;
+            return shifted;
+        }
+
+ #endregion
+
+        /// <summary>
+        /// Generate all non-overlapping Boolean combinations of a set of BDDs.
+        /// </summary>
+        /// <param name="sets">the BDDs to create the minterms for</param>
+        /// <returns>BDDs for the minterm</returns>
+        public List<BDD> GenerateMinterms(params BDD[] sets) => _mintermGen.GenerateMinterms(sets);
+
+        /// <summary>
+        /// Make a set containing all integers whose bits up to maxBit equal n.
+        /// </summary>
+        /// <param name="n">the given integer</param>
+        /// <param name="maxBit">bits above maxBit are unspecified</param>
+        /// <returns>the BDD for the single-value range [n, n]</returns>
+        public BDD CreateSetFrom(uint n, int maxBit)
+        {
+            return CreateSetFromRange(n, n, maxBit);
+        }
+
+        /// <summary>
+        /// Make the set containing all values greater than or equal to lower and less than or equal to upper
+        /// when considering bits between 0 and maxBit. Returns False when upper is less than lower.
+        /// </summary>
+        /// <param name="lower">lower bound</param>
+        /// <param name="upper">upper bound</param>
+        /// <param name="maxBit">bits above maxBit are unspecified</param>
+        public BDD CreateSetFromRange(uint lower, uint upper, int maxBit)
+        {
+            Debug.Assert(0 <= maxBit && maxBit <= 31, "maxBit must be between 0 and 31");
+
+            if (upper < lower)
+            {
+                return False;
+            }
+
+            // Filter out bits greater than maxBit; with maxBit == 31 every bit is kept
+            uint filter = maxBit < 31 ? (1u << (maxBit + 1)) - 1 : uint.MaxValue;
+
+            return CreateSetFromRangeImpl(lower & filter, upper & filter, maxBit);
+        }
+
+        /// <summary>
+        /// Recursively builds the BDD for the range [lower, upper], deciding one bit per level starting
+        /// at maxBit and going down to bit 0. Both bounds must already be restricted to bits 0..maxBit.
+        /// </summary>
+        private BDD CreateSetFromRangeImpl(uint lower, uint upper, int maxBit)
+        {
+            // Mask with 1 at position of maxBit
+            uint mask = 1u << maxBit;
+
+            if (mask == 1) // Base case for least significant bit
+            {
+                return
+                    upper == 0 ? GetOrCreateBDD(maxBit, False, True) : // lower must also be 0
+                    lower == 1 ? GetOrCreateBDD(maxBit, True, False) : // upper must also be 1
+                    True; // Otherwise both 0 and 1 are included
+            }
+
+            // Check if range includes all numbers up to bit
+            if (lower == 0 && upper == ((mask << 1) - 1))
+            {
+                return True;
+            }
+
+            // Mask out the highest bit for the first and last elements in the range
+            uint lowerMasked = lower & mask;
+            uint upperMasked = upper & mask;
+
+            if (upperMasked == 0)
+            {
+                // Highest value in range doesn't have maxBit set, so the one branch is empty
+                BDD zero = CreateSetFromRangeImpl(lower, upper, maxBit - 1);
+                return GetOrCreateBDD(maxBit, False, zero);
+            }
+            else if (lowerMasked == mask)
+            {
+                // Lowest value in range has maxBit set, so the zero branch is empty
+                BDD one = CreateSetFromRangeImpl(lower & ~mask, upper & ~mask, maxBit - 1);
+                return GetOrCreateBDD(maxBit, one, False);
+            }
+            else // Otherwise the range straddles (1<<maxBit) and thus both cases need to be considered
+            {
+                // NOTE(review): this branch was reconstructed after the original text was lost; verify.
+                // With maxBit clear, the lower bits run from the lower bound up to the maximum value
+                BDD zero = CreateSetFromRangeImpl(lower, mask - 1, maxBit - 1);
+                // With maxBit set, the lower bits run from zero up to the upper bound without maxBit
+                BDD one = CreateSetFromRangeImpl(0, upper & ~mask, maxBit - 1);
+                return GetOrCreateBDD(maxBit, one, zero);
+            }
+        }
+
+        /// <summary>
+ /// Convert the set into an equivalent array of uint ranges.
+ /// Bits above maxBit are ignored.
+ /// The ranges are nonoverlapping and ordered.
+ ///
+ public static (uint, uint)[] ToRanges(BDD set, int maxBit) => BDDRangeConverter.ToRanges(set, maxBit);
+
+ #region domain size and min computation
+
+        /// <summary>
+        /// Calculate the number of elements in the set. Returns 0 when set is full and maxBit is 63.
+        /// </summary>
+        /// <param name="set">the given set</param>
+        /// <param name="maxBit">bits above maxBit are ignored</param>
+        /// <returns>the cardinality of the set</returns>
+        /// <exception cref="ArgumentOutOfRangeException">maxBit is smaller than the ordinal of the set</exception>
+        /// <exception cref="NotSupportedException">the set is, or contains, a multi-terminal leaf</exception>
+        public virtual ulong ComputeDomainSize(BDD set, int maxBit)
+        {
+            if (maxBit < set.Ordinal)
+                throw new ArgumentOutOfRangeException(nameof(maxBit));
+
+            if (set == False)
+                return 0UL;
+
+            if (set == True)
+                return 1UL << maxBit << 1; // e.g. if maxBit is 15 then the return value is 1 << 16, i.e., 2^16
+
+            if (set.IsLeaf)
+                throw new NotSupportedException(); // multi-terminal case is not supported
+
+            ulong res = ComputeDomainSizeImpl(new Dictionary<BDD, ulong>(), set);
+            if (maxBit > set.Ordinal)
+            {
+                // every bit above the root ordinal is unconstrained and doubles the count
+                res = (1UL << (maxBit - set.Ordinal)) * res;
+            }
+
+            return res;
+        }
+
+        /// <summary>
+        /// Caches previously calculated values in sizeCache so that computations are not repeated inside a BDD for the same sub-BDD.
+        /// Thus the number of internal calls is proportional to the number of nodes of the BDD, that could otherwise be exponential in the worst case.
+        /// All arithmetic is done in 64 bits so that ordinals of 32 and above are counted correctly.
+        /// </summary>
+        /// <param name="sizeCache">previously computed sizes</param>
+        /// <param name="set">given set to compute size of</param>
+        /// <returns>the number of elements of the set over bits 0..set.Ordinal</returns>
+        private ulong ComputeDomainSizeImpl(Dictionary<BDD, ulong> sizeCache, BDD set)
+        {
+            if (!sizeCache.TryGetValue(set, out ulong size))
+            {
+                if (set.IsLeaf)
+                    throw new NotSupportedException(); //multi-terminal case is not supported
+
+                ulong sizeL;
+                ulong sizeR;
+                if (set.Zero.IsEmpty)
+                {
+                    sizeL = 0;
+                    sizeR = set.One.IsFull ?
+                        1UL << set.Ordinal :
+                        (1UL << (set.Ordinal - 1 - set.One.Ordinal)) * ComputeDomainSizeImpl(sizeCache, set.One);
+                }
+                else if (set.Zero.IsFull)
+                {
+                    sizeL = 1UL << set.Ordinal;
+                    sizeR = set.One.IsEmpty ?
+                        0UL :
+                        (1UL << (set.Ordinal - 1 - set.One.Ordinal)) * ComputeDomainSizeImpl(sizeCache, set.One);
+                }
+                else
+                {
+                    // bits skipped between this node and a child are unconstrained, hence the scaling factor
+                    sizeL = (1UL << (set.Ordinal - 1 - set.Zero.Ordinal)) * ComputeDomainSizeImpl(sizeCache, set.Zero);
+                    sizeR =
+                        set.One == False ? 0UL :
+                        set.One == True ? 1UL << set.Ordinal :
+                        (1UL << (set.Ordinal - 1 - set.One.Ordinal)) * ComputeDomainSizeImpl(sizeCache, set.One);
+                }
+
+                size = sizeL + sizeR;
+                sizeCache[set] = size;
+            }
+            return size;
+        }
+
+        /// <summary>
+        /// Get the lexicographically minimum bitvector in the set as a ulong.
+        /// Assumes that the set is nonempty and that the ordinal of the BDD is at most 63.
+        /// </summary>
+        /// <param name="set">the given nonempty set</param>
+        /// <returns>the lexicographically smallest bitvector in the set</returns>
+        public ulong GetMin(BDD set)
+        {
+            return set.GetMin();
+        }
+
+ #endregion
+
+ /// <summary>
+ /// Any two BDDs that are equivalent are isomorphic and have the same hashcode.
+ /// </summary>
+ public bool HashCodesRespectEquivalence => true;
+
+ /// <summary>
+ /// Two equivalent BDDs need not be identical
+ /// </summary>
+ public bool IsExtensional => false;
+
+ /// <summary>
+ /// Combines two terminals of a multi-terminal BDD under the given operation.
+ /// The returned integer must be nonnegative
+ /// and will act as the combined terminal in a multi-terminal BDD.
+ /// May throw NotSupportedException.
+ /// </summary>
+ public abstract int CombineTerminals(BoolOp op, int terminal1, int terminal2);
+
+        /// <summary>
+        /// Replace the True node in the BDD by a non-Boolean terminal.
+        /// Observe that the Ordinal of False is -1 and the Ordinal of True is -2.
+        /// </summary>
+        /// <param name="bdd">the BDD whose True leaf is to be replaced</param>
+        /// <param name="terminal">nonnegative ordinal of the replacement leaf</param>
+        /// <returns>a BDD shaped like the input in which True is replaced by the terminal leaf</returns>
+        public BDD ReplaceTrue(BDD bdd, int terminal)
+        {
+            Debug.Assert(terminal >= 0);
+
+            BDD leaf = GetOrCreateBDD(terminal, null, null);
+            return ReplaceTrueImpl(bdd, leaf, new Dictionary<BDD, BDD>());
+        }
+
+        /// <summary>
+        /// Recursive worker for <see cref="ReplaceTrue"/>. The cache maps already rewritten nodes to their
+        /// replacements so that shared sub-BDDs are processed once.
+        /// </summary>
+        private BDD ReplaceTrueImpl(BDD bdd, BDD leaf, Dictionary<BDD, BDD> cache)
+        {
+            if (bdd == True)
+                return leaf;
+
+            // False and any other terminal are left as they are
+            if (bdd.IsLeaf)
+                return bdd;
+
+            if (!cache.TryGetValue(bdd, out BDD? res))
+            {
+                BDD one = ReplaceTrueImpl(bdd.One, leaf, cache);
+                BDD zero = ReplaceTrueImpl(bdd.Zero, leaf, cache);
+                res = GetOrCreateBDD(bdd.Ordinal, one, zero);
+                cache[bdd] = res;
+            }
+            return res;
+        }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDDRangeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDDRangeConverter.cs
new file mode 100644
index 00000000000000..d6ff1d2b6aa30c
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDDRangeConverter.cs
@@ -0,0 +1,230 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Diagnostics;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ ///
+ /// Converts sets of integers expressed as BDDs into non-overlapping and ordered arrays of ranges.
+ ///
+ internal sealed class BDDRangeConverter
+ {
+        /// <summary>
+        /// Cache for all range conversions. Having this cache is required to avoid exponential running time.
+        /// </summary>
+        private readonly Dictionary<BDD, (uint, uint)[]> _rangeCache = new Dictionary<BDD, (uint, uint)[]>();
+
+        // Instances are created only by the static ToRanges method, one per conversion.
+        private BDDRangeConverter() { }
+
+        /// <summary>
+        /// Convert the set into an equivalent array of ranges over bits 0..maxBit.
+        /// The ranges are nonoverlapping and ordered.
+        /// </summary>
+        public static (uint, uint)[] ToRanges(BDD set, int maxBit)
+        {
+            Debug.Assert(0 <= maxBit && maxBit <= 31, "maxBit must be between 0 and 31");
+
+            if (set.IsEmpty)
+            {
+                return Array.Empty<(uint, uint)>();
+            }
+
+            if (set.IsFull)
+            {
+                // note: maxBit could be 31, hence the shift in two steps
+                uint upperBound = ((uint)1 << maxBit << 1) - 1;
+                return new[] { (0u, upperBound) };
+            }
+
+            // convert up to the root ordinal first, then extend to the bits above it
+            var converter = new BDDRangeConverter();
+            (uint, uint)[] ordinalRanges = converter.ToRangesFromOrdinal(set);
+            return converter.LiftRanges(maxBit + 1, maxBit - set.Ordinal, ordinalRanges);
+        }
+
+        /// <summary>
+        /// Extends a set of ranges to include more significant bits. The new bits are allowed to be anything and all
+        /// combinations of the new bits are included.
+        /// e.g. if toBits = 6 and newBits = 2 and ranges = (in binary form) {[0000 1010, 0000 1110]} i.e. [x0A,x0E]
+        /// then res = {[0000 1010, 0000 1110], [0001 1010, 0001 1110],
+        ///             [0010 1010, 0010 1110], [0011 1010, 0011 1110]},
+        /// </summary>
+        private (uint, uint)[] LiftRanges(int toBits, int newBits, (uint, uint)[] ranges)
+        {
+            // nothing happens if no new bits are added
+            if (newBits == 0)
+            {
+                return ranges;
+            }
+
+            int fromBits = toBits - newBits;
+            int prefixCount = 1 << newBits;
+
+            // One copy of every range for each combination of the new bits
+            var lifted = new (uint, uint)[prefixCount * ranges.Length];
+            int next = 0;
+            for (uint p = 0; p < prefixCount; p++)
+            {
+                // Shift the prefix to be past the existing range of bits
+                uint prefix = p << fromBits;
+                for (int r = 0; r < ranges.Length; r++)
+                {
+                    lifted[next++] = (ranges[r].Item1 | prefix, ranges[r].Item2 | prefix);
+                }
+            }
+
+            // lifted ranges can wrap around like this [0...][...2^fromBits-1][2^fromBits...][...2^(fromBits+1)-1]
+            uint maximal = ((uint)1 << fromBits) - 1;
+            bool wrapsAround = ranges[0].Item1 == 0 && ranges[ranges.Length - 1].Item2 == maximal;
+            if (!wrapsAround)
+            {
+                return lifted;
+            }
+
+            // merge consecutive ranges, we know that lifted has at least two elements here
+            var merged = new List<(uint, uint)>();
+            (uint start, uint end) = lifted[0];
+            for (int i = 1; i < lifted.Length; i++)
+            {
+                (uint nextStart, uint nextEnd) = lifted[i];
+                if (end == nextStart - 1)
+                {
+                    // extend the current range instead of starting a new one
+                    end = nextEnd;
+                }
+                else
+                {
+                    merged.Add((start, end));
+                    start = nextStart;
+                    end = nextEnd;
+                }
+            }
+            merged.Add((start, end));
+
+            return merged.ToArray();
+        }
+
+ /// <summary>
+ /// Converts a non-leaf BDD into ordered ranges over bits 0..set.Ordinal. Results are memoized in the range cache.
+ /// Ranges of the zero child are used as they are, ranges of the one child get the bit of this node's ordinal set,
+ /// and a range ending right below that bit is merged with a one-side range starting at 0.
+ /// </summary>
+ private (uint, uint)[] ToRangesFromOrdinal(BDD set)
+ {
+ if (!_rangeCache.TryGetValue(set, out (uint, uint)[]? ranges))
+ {
+ Debug.Assert(!set.IsLeaf);
+
+ int b = set.Ordinal;
+ // mask has only the bit of this node's ordinal set
+ uint mask = (uint)1 << b;
+ if (set.Zero.IsEmpty)
+ {
+ #region 0-case is empty
+ if (set.One.IsFull)
+ {
+ ranges = new[] { (mask, (mask << 1) - 1) };
+ }
+ else //1-case is neither full nor empty
+ {
+ // lift the one child's ranges over the bits skipped between the child and this node
+ (uint, uint)[] ranges1 = LiftRanges(b, b - set.One.Ordinal - 1, ToRangesFromOrdinal(set.One));
+ ranges = new (uint, uint)[ranges1.Length];
+ for (int i = 0; i < ranges1.Length; i++)
+ {
+ ranges[i] = (ranges1[i].Item1 | mask, ranges1[i].Item2 | mask);
+ }
+ }
+ #endregion
+ }
+ else if (set.Zero.IsFull)
+ {
+ #region 0-case is full
+ if (set.One.IsEmpty)
+ {
+ ranges = new[] { (0u, mask - 1) };
+ }
+ else
+ {
+ (uint, uint)[] rangesR = LiftRanges(b, b - set.One.Ordinal - 1, ToRangesFromOrdinal(set.One));
+ (uint, uint) range = rangesR[0];
+ if (range.Item1 == 0)
+ {
+ // the first one-side range starts at 0, so it continues the full zero side [0, mask-1]
+ ranges = new (uint, uint)[rangesR.Length];
+ ranges[0] = (0, range.Item2 | mask);
+ for (int i = 1; i < rangesR.Length; i++)
+ {
+ ranges[i] = (rangesR[i].Item1 | mask, rangesR[i].Item2 | mask);
+ }
+ }
+ else
+ {
+ ranges = new (uint, uint)[rangesR.Length + 1];
+ ranges[0] = (0, mask - 1);
+ for (int i = 0; i < rangesR.Length; i++)
+ {
+ ranges[i + 1] = (rangesR[i].Item1 | mask, rangesR[i].Item2 | mask);
+ }
+ }
+ }
+ #endregion
+ }
+ else
+ {
+ #region 0-case is neither full nor empty
+ (uint, uint)[] rangesL = LiftRanges(b, b - set.Zero.Ordinal - 1, ToRangesFromOrdinal(set.Zero));
+ (uint, uint) last = rangesL[rangesL.Length - 1];
+
+ if (set.One.IsEmpty)
+ {
+ ranges = rangesL;
+ }
+
+ else if (set.One.IsFull)
+ {
+ var ranges1 = new List<(uint, uint)>();
+ for (int i = 0; i < rangesL.Length - 1; i++)
+ {
+ ranges1.Add(rangesL[i]);
+ }
+
+ // extend the last zero-side range over the full one side when the two are adjacent
+ if (last.Item2 == (mask - 1))
+ {
+ ranges1.Add((last.Item1, (mask << 1) - 1));
+ }
+ else
+ {
+ ranges1.Add(last);
+ ranges1.Add((mask, (mask << 1) - 1));
+ }
+ ranges = ranges1.ToArray();
+ }
+ else //general case: neither 0-case, not 1-case is full or empty
+ {
+ (uint, uint)[] rangesR0 = ToRangesFromOrdinal(set.One);
+
+ (uint, uint)[] rangesR = LiftRanges(b, b - set.One.Ordinal - 1, rangesR0);
+
+ (uint, uint) first = rangesR[0];
+
+ if (last.Item2 == (mask - 1) && first.Item1 == 0) //merge together the last and first ranges
+ {
+ ranges = new (uint, uint)[rangesL.Length + rangesR.Length - 1];
+ for (int i = 0; i < rangesL.Length - 1; i++)
+ {
+ ranges[i] = rangesL[i];
+ }
+
+ ranges[rangesL.Length - 1] = (last.Item1, first.Item2 | mask);
+ for (int i = 1; i < rangesR.Length; i++)
+ {
+ ranges[rangesL.Length - 1 + i] = (rangesR[i].Item1 | mask, rangesR[i].Item2 | mask);
+ }
+ }
+ else
+ {
+ ranges = new (uint, uint)[rangesL.Length + rangesR.Length];
+ for (int i = 0; i < rangesL.Length; i++)
+ {
+ ranges[i] = rangesL[i];
+ }
+
+ for (int i = 0; i < rangesR.Length; i++)
+ {
+ ranges[rangesL.Length + i] = (rangesR[i].Item1 | mask, rangesR[i].Item2 | mask);
+ }
+ }
+
+ }
+ #endregion
+ }
+ _rangeCache[set] = ranges;
+ }
+
+ return ranges;
+ }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BV.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BV.cs
new file mode 100644
index 00000000000000..586bf1a9981473
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BV.cs
@@ -0,0 +1,206 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+    /// <summary>
+    /// Represents a bitvector of arbitrary length (i.e. number of bits).
+    /// </summary>
+    internal sealed class BV : IComparable
+    {
+        /// <summary>
+        /// Stores the bits in an array of 64-bit integers. If Length is not evenly divisible by 64 then the remaining
+        /// bits are in the least significant bits of the last element.
+        /// </summary>
+        private readonly ulong[] _blocks;
+
+        /// <summary>
+        /// Number of bits.
+        /// </summary>
+        internal readonly int Length;
+
+        /// <summary>
+        /// Cache for the lazily computed hash code.
+        /// </summary>
+        private int? _hashcode;
+
+        /// <summary>
+        /// Returns true iff the i'th bit is 1
+        /// </summary>
+        internal bool this[int i]
+        {
+            get
+            {
+                Debug.Assert(i >= 0 && i < Length);
+                int k = i / 64;
+                int j = i % 64;
+                return (_blocks[k] & (1ul << j)) != 0;
+            }
+            private set
+            {
+                Debug.Assert(i >= 0 && i < Length);
+                int k = i / 64;
+                int j = i % 64;
+                if (value)
+                {
+                    //set the j'th bit of the k'th block to 1
+                    _blocks[k] |= 1ul << j;
+                }
+                else
+                {
+                    //set the j'th bit of the k'th block to 0
+                    _blocks[k] &= ~(1ul << j);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Creates a bitvector of K bits with all bits 0.
+        /// </summary>
+        private BV(int K)
+        {
+            Length = K;
+            _blocks = new ulong[((K - 1) / 64) + 1];
+        }
+
+        /// <summary>
+        /// Creates a bitvector of K bits over the given blocks; the array is used as is, not copied.
+        /// </summary>
+        private BV(int K, ulong[] blocks)
+        {
+            Length = K;
+            _blocks = blocks;
+        }
+
+        /// <summary>
+        /// Constructs a bitvector of K bits initially all 0.
+        /// </summary>
+        public static BV CreateFalse(int K) => new(K);
+
+        /// <summary>
+        /// Constructs a bitvector of K bits initially all 1.
+        /// </summary>
+        public static BV CreateTrue(int K) => ~CreateFalse(K);
+
+        /// <summary>
+        /// Returns the bitvector of length K with its i'th bit set to 1 all other bits are 0.
+        /// </summary>
+        public static BV CreateSingleBit(int K, int i)
+        {
+            BV bv = new BV(K);
+            bv[i] = true;
+            return bv;
+        }
+
+        /// <summary>
+        /// Bitwise AND
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static BV operator &(BV x, BV y)
+        {
+            Debug.Assert(x.Length == y.Length);
+
+            var blocks = new ulong[x._blocks.Length];
+            // produce new blocks as the bitwise AND of the arguments' blocks
+            for (int i = 0; i < blocks.Length; i++)
+            {
+                blocks[i] = x._blocks[i] & y._blocks[i];
+            }
+            return new BV(x.Length, blocks);
+        }
+
+        /// <summary>
+        /// Bitwise OR
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static BV operator |(BV x, BV y)
+        {
+            Debug.Assert(x.Length == y.Length);
+
+            var blocks = new ulong[x._blocks.Length];
+            // produce new blocks as the bitwise OR of the arguments' blocks
+            for (int i = 0; i < blocks.Length; i++)
+            {
+                blocks[i] = x._blocks[i] | y._blocks[i];
+            }
+            return new BV(x.Length, blocks);
+        }
+
+        /// <summary>
+        /// Bitwise XOR
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static BV operator ^(BV x, BV y)
+        {
+            Debug.Assert(x.Length == y.Length);
+
+            var blocks = new ulong[x._blocks.Length];
+            // produce new blocks as the bitwise XOR of the arguments' blocks
+            for (int i = 0; i < blocks.Length; i++)
+            {
+                blocks[i] = x._blocks[i] ^ y._blocks[i];
+            }
+            return new BV(x.Length, blocks);
+        }
+
+        /// <summary>
+        /// Bitwise NOT
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static BV operator ~(BV x)
+        {
+            var blocks = new ulong[x._blocks.Length];
+            for (int i = 0; i < blocks.Length; i++)
+            {
+                blocks[i] = ~x._blocks[i];
+            }
+
+            int j = x.Length % 64;
+            if (j > 0)
+            {
+                // the number of bits is not a precise multiple of 64
+                // reset the extra bits in the last block to 0
+                int last = (x.Length - 1) / 64;
+                blocks[last] &= (1ul << j) - 1;
+            }
+
+            return new BV(x.Length, blocks);
+        }
+
+        /// <summary>
+        /// Two bitvectors are equal when they have the same length and the same bits.
+        /// </summary>
+        public override bool Equals([NotNullWhen(true)] object? obj) => CompareTo(obj) == 0;
+
+        /// <summary>
+        /// Returns a hash code combining the length and all blocks. The value is computed once and cached.
+        /// </summary>
+        public override int GetHashCode()
+        {
+            if (_hashcode is int cached)
+            {
+                return cached;
+            }
+
+            // Combine into a local and store only the finished value, so that a concurrent caller
+            // never observes a partially combined hash code through the cache field.
+            int hash = Length.GetHashCode();
+            for (int i = 0; i < _blocks.Length; i++)
+            {
+                hash = HashCode.Combine(hash, _blocks[i].GetHashCode());
+            }
+
+            _hashcode = hash;
+            return hash;
+        }
+
+        /// <summary>
+        /// Orders bitvectors by length first and then by their blocks, most significant block first.
+        /// Any bitvector orders after null and after objects that are not bitvectors.
+        /// </summary>
+        public int CompareTo(object? obj)
+        {
+            if (obj is not BV that)
+                return 1;
+
+            if (Length != that.Length)
+                return Length.CompareTo(that.Length);
+
+            // compare all blocks starting from the last one (i.e. most significant)
+            for (int i = _blocks.Length - 1; i >= 0; i--)
+            {
+                if (_blocks[i] < that._blocks[i])
+                    return -1;
+
+                if (_blocks[i] > that._blocks[i])
+                    return 1;
+            }
+
+            //all blocks were equal
+            return 0;
+        }
+    }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BV64Algebra.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BV64Algebra.cs
new file mode 100644
index 00000000000000..269d5f765977f1
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BV64Algebra.cs
@@ -0,0 +1,171 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+    /// <summary>
+    /// Bit vector algebra for up to 64 bits that uses an ulong directly as the term representation, unlike the more
+    /// general BVAlgebra that uses an array of them. This simplifies the operations making the algebra more efficient.
+    /// Bit i of a predicate stands for the i'th minterm of the partition.
+    /// </summary>
+    internal sealed class BV64Algebra : BVAlgebraBase, ICharAlgebra<ulong>
+    {
+        private readonly MintermGenerator<ulong> _mintermGenerator;
+        private readonly ulong _false;
+        private readonly ulong _true;
+
+        /// <summary>
+        /// Return the number of characters belonging to the minterms in the given set.
+        /// </summary>
+        public ulong ComputeDomainSize(ulong set)
+        {
+            ulong size = 0;
+            for (int i = 0; i < _bits; i++)
+            {
+                // if the bit is set then include the corresponding minterm's cardinality
+                if (IsSatisfiable(set & ((ulong)1 << i)))
+                {
+                    size += _cardinalities[i];
+                }
+            }
+
+            return size;
+        }
+
+        /// <summary>
+        /// Constructs the algebra over the given minterms, of which there may be at most 64.
+        /// </summary>
+        public BV64Algebra(CharSetSolver solver, BDD[] minterms) :
+            base(new MintermClassifier(solver, minterms), solver.ComputeDomainSizes(minterms), minterms)
+        {
+            Debug.Assert(minterms.Length <= 64);
+            _mintermGenerator = new MintermGenerator<ulong>(this);
+            _false = 0;
+            // one bit per minterm; the unused high bits of the full set stay 0
+            _true = _bits == 64 ? ulong.MaxValue : ulong.MaxValue >> (64 - _bits);
+        }
+
+        public bool IsExtensional => true;
+        public bool HashCodesRespectEquivalence => true;
+
+        public CharSetSolver CharSetProvider => throw new NotSupportedException();
+
+        ulong IBooleanAlgebra<ulong>.False => _false;
+
+        ulong IBooleanAlgebra<ulong>.True => _true;
+
+        public bool AreEquivalent(ulong predicate1, ulong predicate2) => predicate1 == predicate2;
+
+        public List<ulong> GenerateMinterms(params ulong[] constraints) => _mintermGenerator.GenerateMinterms(constraints);
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public bool IsSatisfiable(ulong predicate) => predicate != _false;
+
+        public ulong And(IEnumerable<ulong> predicates) => throw new NotSupportedException();
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public ulong And(ulong predicate1, ulong predicate2) => predicate1 & predicate2;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public ulong Not(ulong predicate) => _true & ~predicate; //NOTE: must filter off unused bits
+
+        /// <summary>
+        /// Takes the union of all the given predicates.
+        /// </summary>
+        public ulong Or(IEnumerable<ulong> predicates)
+        {
+            ulong res = _false;
+            foreach (ulong p in predicates)
+            {
+                res |= p;
+
+                // short circuit the evaluation on true, since 1|x=1
+                if (res == _true)
+                {
+                    return _true;
+                }
+            }
+
+            return res;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public ulong Or(ulong predicate1, ulong predicate2) => predicate1 | predicate2;
+
+        public ulong RangeConstraint(char lower, char upper, bool caseInsensitive = false, string? culture = null) => throw new NotSupportedException();
+
+        /// <summary>
+        /// Returns the predicate with only the bit of the minterm containing c set.
+        /// Case insensitivity is not supported here.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public ulong CharConstraint(char c, bool caseInsensitive = false, string? culture = null)
+        {
+            Debug.Assert(!caseInsensitive);
+            return ((ulong)1) << _classifier.GetMintermID(c);
+        }
+
+        /// <summary>
+        /// Assumes that set is a union of some minterms (or empty).
+        /// If null then 0 is returned.
+        /// </summary>
+        public ulong ConvertFromCharSet(BDDAlgebra alg, BDD? set)
+        {
+            ulong res = _false;
+
+            if (set is not null)
+            {
+                for (int i = 0; i < _bits; i++)
+                {
+                    Debug.Assert(_partition is not null);
+
+                    // set the i'th bit if the i'th minterm is in the set
+                    if (alg.IsSatisfiable(alg.And(_partition[i], set)))
+                    {
+                        res |= (ulong)1 << i;
+                    }
+                }
+            }
+
+            return res;
+        }
+
+        /// <summary>
+        /// Converts the predicate into the BDD that is the union of the minterms whose bits are set.
+        /// </summary>
+        public BDD ConvertToCharSet(ICharAlgebra<BDD> solver, ulong pred)
+        {
+            Debug.Assert(_partition is not null);
+
+            // the result will be the union of all minterms in the set
+            BDD res = BDD.False;
+            if (pred != _false)
+            {
+                for (int i = 0; i < _bits; i++)
+                {
+                    // include the i'th minterm in the union if the i'th bit is set
+                    if ((pred & ((ulong)1 << i)) != _false)
+                    {
+                        res = solver.Or(res, _partition[i]);
+                    }
+                }
+            }
+
+            return res;
+        }
+
+        /// <summary>
+        /// Return an array of bitvectors representing each of the minterms.
+        /// </summary>
+        public ulong[] GetMinterms()
+        {
+            ulong[] minterms = new ulong[_bits];
+            for (int i = 0; i < _bits; i++)
+            {
+                minterms[i] = (ulong)1 << i;
+            }
+
+            return minterms;
+        }
+
+        public IEnumerable<char> GenerateAllCharacters(ulong set) => throw new NotSupportedException();
+
+        /// <summary>Pretty print the bitvector bv as the character set it represents.</summary>
+        public string PrettyPrint(ulong bv)
+        {
+            ICharAlgebra<BDD> bddalgebra = SymbolicRegexRunner.s_unicode._solver;
+            Debug.Assert(_partition is not null && bddalgebra is not null);
+
+            return bddalgebra.PrettyPrint(ConvertToCharSet(bddalgebra, bv));
+        }
+    }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BVAlgebra.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BVAlgebra.cs
new file mode 100644
index 00000000000000..af33f6a8a6b942
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BVAlgebra.cs
@@ -0,0 +1,177 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// <summary>
+ /// Base class for bitvector algebras, which represent sets as bitvectors indexed by the elements. An element is in
+ /// the set if the corresponding bit is set.
+ /// </summary>
+ /// <remarks>
+ /// These bitvector algebras are used to represent sets of minterms, and thus represent sets of characters
+ /// indirectly. However, the bitvector algebras are aware of this indirection in that the cardinalities of sets
+ /// count the characters rather than the minterms. For example, the cardinality of a bitvector "110" where the bits
+ /// correspond to minterms [a-c], [0-9] and [^a-c0-9] is 13 rather than 2.
+ /// </remarks>
+ internal abstract class BVAlgebraBase
+ {
+     /// <summary>Classifier supplied at construction; BVAlgebra.CharConstraint uses it to look up a character's minterm ID.</summary>
+     internal readonly MintermClassifier _classifier;
+
+     /// <summary>Per-minterm cardinalities, indexed by bit position.</summary>
+     protected readonly ulong[] _cardinalities;
+
+     /// <summary>Number of bits in a bitvector; equal to the length of the cardinalities array.</summary>
+     protected readonly int _bits;
+
+     /// <summary>The BDD for each minterm, indexed by bit position; may be null.</summary>
+     protected readonly BDD[]? _partition;
+
+     internal BVAlgebraBase(MintermClassifier classifier, ulong[] cardinalities, BDD[]? partition)
+     {
+         (_classifier, _cardinalities, _partition) = (classifier, cardinalities, partition);
+
+         // One bit per cardinality entry.
+         _bits = cardinalities.Length;
+     }
+ }
+
+ /// <summary>
+ /// Bit vector algebra: predicates are BV values carrying one bit per minterm.
+ /// </summary>
+ internal sealed class BVAlgebra : BVAlgebraBase, ICharAlgebra
+ {
+     private readonly MintermGenerator _mintermGenerator;
+
+     /// <summary>Single-bit vectors, one per minterm; element i is the vector created for bit i.</summary>
+     internal BV[] _minterms;
+
+     /// <summary>Sums the cardinalities of the minterms whose bits are set in the given bitvector.</summary>
+     public ulong ComputeDomainSize(BV set)
+     {
+         ulong total = 0;
+         for (int index = 0; index < _bits; index++)
+         {
+             // Only minterms whose bit is set contribute their size.
+             if (!set[index])
+             {
+                 continue;
+             }
+
+             total += _cardinalities[index];
+         }
+
+         return total;
+     }
+
+     /// <summary>
+     /// Builds the algebra for the given minterms: the base class receives a classifier, the minterm
+     /// cardinalities computed by the solver, and the minterm BDDs themselves.
+     /// </summary>
+     public BVAlgebra(CharSetSolver solver, BDD[] minterms) :
+         base(new MintermClassifier(solver, minterms), solver.ComputeDomainSizes(minterms), minterms)
+     {
+         _mintermGenerator = new MintermGenerator(this);
+         False = BV.CreateFalse(_bits);
+         True = BV.CreateTrue(_bits);
+
+         // One single-bit vector per minterm.
+         _minterms = new BV[_bits];
+         for (int bit = 0; bit < _minterms.Length; bit++)
+         {
+             _minterms[bit] = BV.CreateSingleBit(_bits, bit);
+         }
+     }
+
+     public BV False { get; }
+     public BV True { get; }
+
+     public bool IsExtensional => true;
+     public bool HashCodesRespectEquivalence => true;
+     public CharSetSolver CharSetProvider => throw new NotSupportedException();
+     public bool AreEquivalent(BV predicate1, BV predicate2) => predicate1.Equals(predicate2);
+     public List GenerateMinterms(params BV[] constraints) => _mintermGenerator.GenerateMinterms(constraints);
+
+     /// <summary>A predicate is satisfiable unless it equals <see cref="False"/>.</summary>
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public bool IsSatisfiable(BV predicate) => !predicate.Equals(False);
+
+     public BV And(IEnumerable predicates) => throw new NotSupportedException();
+
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public BV And(BV predicate1, BV predicate2) => predicate1 & predicate2;
+
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public BV Not(BV predicate) => ~predicate;
+
+     /// <summary>Disjunction of all the given predicates; <see cref="False"/> when there are none.</summary>
+     public BV Or(IEnumerable predicates)
+     {
+         BV union = False;
+         foreach (BV predicate in predicates)
+         {
+             union = union | predicate;
+         }
+
+         return union;
+     }
+
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public BV Or(BV predicate1, BV predicate2) => predicate1 | predicate2;
+
+     public BV RangeConstraint(char lower, char upper, bool caseInsensitive = false, string? culture = null) => throw new NotSupportedException();
+
+     /// <summary>
+     /// Returns the single-bit vector of the minterm the classifier assigns to the character.
+     /// Case-insensitive requests are not expected here (asserted in debug builds).
+     /// </summary>
+     public BV CharConstraint(char c, bool caseInsensitive = false, string? culture = null)
+     {
+         Debug.Assert(!caseInsensitive);
+         return _minterms[_classifier.GetMintermID(c)];
+     }
+
+     /// <summary>
+     /// Assumes that set is a union of some minterms (or empty).
+     /// If null then null is returned.
+     /// </summary>
+     [return: NotNullIfNotNull("set")]
+     public BV? ConvertFromCharSet(BDDAlgebra alg, BDD set)
+     {
+         if (set == null)
+         {
+             return null;
+         }
+
+         Debug.Assert(_partition is not null);
+
+         BV result = False;
+         for (int index = 0; index < _bits; index++)
+         {
+             // Include the minterm's bit when the minterm overlaps the given set.
+             if (alg.IsSatisfiable(alg.And(_partition[index], set)))
+             {
+                 result = result | _minterms[index];
+             }
+         }
+
+         return result;
+     }
+
+     /// <summary>Converts a bitvector into the BDD that is the union of the minterms whose bits are set.</summary>
+     public BDD ConvertToCharSet(ICharAlgebra solver, BV pred)
+     {
+         Debug.Assert(_partition is not null);
+
+         BDD union = solver.False;
+
+         // The empty bitvector selects no minterms.
+         if (pred.Equals(False))
+         {
+             return union;
+         }
+
+         for (int index = 0; index < _bits; index++)
+         {
+             if (!pred[index])
+             {
+                 continue;
+             }
+
+             union = solver.Or(union, _partition[index]);
+         }
+
+         return union;
+     }
+
+     public BV[] GetMinterms() => _minterms;
+     public IEnumerable GenerateAllCharacters(BV set) => throw new NotSupportedException();
+
+     /// <summary>Pretty print the bitvector bv as the character set it represents.</summary>
+     public string PrettyPrint(BV bv)
+     {
+         // Uses the shared BDD solver to render the equivalent BDD.
+         ICharAlgebra bddSolver = SymbolicRegexRunner.s_unicode._solver;
+         Debug.Assert(_partition is not null && bddSolver is not null);
+
+         BDD charSet = ConvertToCharSet(bddSolver, bv);
+         return bddSolver.PrettyPrint(charSet);
+     }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/CharSetSolver.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/CharSetSolver.cs
new file mode 100644
index 00000000000000..344ba84680d9d6
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/CharSetSolver.cs
@@ -0,0 +1,414 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ ///
+ /// Provides functionality to build character sets, to perform boolean operations over character sets,
+ /// and to construct an SFA over character sets from a regex.
+ /// Character sets are represented by bitvector sets.
+ ///
+ internal sealed class CharSetSolver : BDDAlgebra, ICharAlgebra
+ {
+ /// BDDs for all characters for fast lookup.
+ private readonly BDD[] _charPredTable = new BDD[char.MaxValue + 1];
+ private readonly Unicode.IgnoreCaseTransformer _ignoreCase;
+ internal readonly BDD _nonAscii;
+
+ /// <summary>
+ /// Construct the solver.
+ /// </summary>
+ public CharSetSolver()
+ {
+     // The non-ASCII set spans 0x80 through char.MaxValue (0xFFFF). It is built first,
+     // then the ignore-case transformer is created with this solver.
+     const char FirstNonAscii = (char)0x80;
+     _nonAscii = CreateCharSetFromRange(FirstNonAscii, char.MaxValue);
+     _ignoreCase = new Unicode.IgnoreCaseTransformer(this);
+ }
+
+ /// <summary>Applies the solver's ignore-case transformer to the set, passing the culture through unchanged.</summary>
+ public BDD ApplyIgnoreCase(BDD set, string? culture = null) => _ignoreCase.Apply(set, culture);
+
+ /// <summary>
+ /// Make a character predicate for the given character c.
+ /// </summary>
+ /// <param name="c">the given character</param>
+ /// <param name="ignoreCase">if true the result comes from the ignore-case transformer instead of the per-character table</param>
+ /// <param name="culture">culture handed to the ignore-case transformer; unused when ignoreCase is false</param>
+ public BDD CharConstraint(char c, bool ignoreCase = false, string? culture = null)
+ {
+     if (!ignoreCase)
+     {
+         // Individual character BDDs are always fixed, so each one is created on first use and cached.
+         return _charPredTable[c] ??= CreateSetFrom(c, 15);
+     }
+
+     return _ignoreCase.Apply(c, culture);
+ }
+
+ /// <summary>
+ /// Make a CharSet from all the characters in the range from m to n.
+ /// A single-character range is served by <see cref="CharConstraint"/>; any other range is delegated
+ /// to CreateSetFromRange (originally documented as returning the empty set if n is less than m).
+ /// </summary>
+ public BDD CreateCharSetFromRange(char m, char n)
+ {
+     if (m == n)
+     {
+         return CharConstraint(m);
+     }
+
+     return CreateSetFromRange(m, n, 15);
+ }
+
+ /// <summary>
+ /// Make a character set that is the union of the character sets of the given ranges.
+ /// An empty sequence of ranges yields False.
+ /// </summary>
+ public BDD CreateCharSetFromRanges(IEnumerable<(uint, uint)> ranges)
+ {
+     BDD union = False;
+     foreach ((uint lower, uint upper) in ranges)
+     {
+         BDD rangeSet = CreateSetFromRange(lower, upper, 15);
+         union = Or(union, rangeSet);
+     }
+
+     return union;
+ }
+
+ /// <summary>
+ /// Make a character set of all the characters in the interval from c to d.
+ /// If ignoreCase is true the resulting set is additionally passed through the ignore-case transformer.
+ /// </summary>
+ public BDD RangeConstraint(char c, char d, bool ignoreCase = false, string? culture = null)
+ {
+     // A one-character interval is just a character constraint.
+     if (c == d)
+     {
+         return CharConstraint(c, ignoreCase, culture);
+     }
+
+     BDD range = CreateSetFromRange(c, d, 15);
+     return ignoreCase ? _ignoreCase.Apply(range, culture) : range;
+ }
+
+ /// <summary>
+ /// Make a BDD that is the union of the sets for all the given integer ranges.
+ /// Each range is an array whose elements 0 and 1 are the two bounds handed to CreateSetFromRange;
+ /// the same constant (15) used by the character-range methods is passed as its last argument.
+ /// </summary>
+ internal BDD CreateBddForIntRanges(IEnumerable ranges)
+ {
+     BDD union = False;
+     foreach (int[] range in ranges)
+     {
+         uint lower = (uint)range[0];
+         uint upper = (uint)range[1];
+         union = Or(union, CreateSetFromRange(lower, upper, 15));
+     }
+
+     return union;
+ }
+
+ /// <summary>
+ /// Identity function, returns s. The algebra argument is ignored.
+ /// </summary>
+ public BDD ConvertFromCharSet(BDDAlgebra _, BDD s) => s;
+
+ /// <summary>
+ /// Returns this character set solver.
+ /// </summary>
+ public CharSetSolver CharSetProvider => this;
+
+ /// <summary>
+ /// Lazily enumerates the characters of the set, in reverse order when inReverseOrder is true.
+ /// </summary>
+ public IEnumerable GenerateAllCharacters(BDD bvSet, bool inReverseOrder = false)
+ {
+     // Elements are produced as uint values; each one is narrowed back to char.
+     foreach (uint element in GenerateAllElements(bvSet, inReverseOrder))
+     {
+         char ch = (char)element;
+         yield return ch;
+     }
+ }
+
+ /// <summary>Enumerates the characters of the set in forward order.</summary>
+ public IEnumerable GenerateAllCharacters(BDD set) => GenerateAllCharacters(set, inReverseOrder: false);
+
+ /// <summary>Calculate the number of elements in the set.</summary>
+ /// <param name="set">the given set</param>
+ /// <returns>the cardinality of the set</returns>
+ public ulong ComputeDomainSize(BDD set) => ComputeDomainSize(set, 15);
+
+ /// <summary>Calculate the number of elements in multiple sets.</summary>
+ /// <param name="sets">The sets</param>
+ /// <returns>An array of the cardinality of the sets, in the same order as the input.</returns>
+ public ulong[] ComputeDomainSizes(BDD[] sets)
+ {
+     int count = sets.Length;
+     var sizes = new ulong[count];
+
+     int index = 0;
+     while (index < count)
+     {
+         sizes[index] = ComputeDomainSize(sets[index]);
+         index++;
+     }
+
+     return sizes;
+ }
+
+ /// <summary>
+ /// Returns true iff the set contains exactly one element.
+ /// </summary>
+ /// <param name="set">the given set</param>
+ /// <returns>true iff the set is a singleton</returns>
+ public bool IsSingleton(BDD set) => ComputeDomainSize(set, 15) == 1;
+
+ /// <summary>
+ /// Convert the set into an equivalent array of ranges. The ranges are nonoverlapping and ordered.
+ /// </summary>
+ public (uint, uint)[] ToRanges(BDD set) => ToRanges(set, 15);
+
+ /// <summary>
+ /// Enumerates the members of the set by walking its ranges in the order returned by ToRanges,
+ /// each range from its first element up to and including its last.
+ /// </summary>
+ private IEnumerable GenerateAllCharactersInOrder(BDD set)
+ {
+     foreach ((uint first, uint last) in ToRanges(set))
+     {
+         uint current = first;
+         while (current <= last)
+         {
+             yield return current;
+             current++;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Enumerates the members of the set by walking its ranges from the last range to the first,
+ /// each range from its last element down to and including its first.
+ /// </summary>
+ private IEnumerable GenerateAllCharactersInReverseOrder(BDD set)
+ {
+     (uint, uint)[] ranges = ToRanges(set);
+     for (int j = ranges.Length - 1; j >= 0; j--)
+     {
+         // Count down with a signed 64-bit index. With an unsigned index the condition
+         // 'i >= Item1' can never become false when Item1 is 0 (decrementing 0 wraps around
+         // to uint.MaxValue), so a set containing '\0' would be enumerated forever.
+         for (long i = ranges[j].Item2; i >= ranges[j].Item1; i--)
+         {
+             yield return (char)i;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Generate all characters that are members of the set in alphabetical order, smallest first, provided that inReverseOrder is false.
+ /// </summary>
+ /// <param name="set">the given set</param>
+ /// <param name="inReverseOrder">if true the members are generated in reverse alphabetical order with the largest first, otherwise in alphabetical order</param>
+ /// <returns>enumeration of all characters in the set, the enumeration is empty if the set is empty</returns>
+ private IEnumerable GenerateAllElements(BDD set, bool inReverseOrder)
+ {
+     // The empty set needs no enumerator at all.
+     if (set == False)
+     {
+         return Array.Empty();
+     }
+
+     if (inReverseOrder)
+     {
+         return GenerateAllCharactersInReverseOrder(set);
+     }
+
+     return GenerateAllCharactersInOrder(set);
+ }
+
+ /// <summary>Identity function, returns pred. The algebra argument is ignored.</summary>
+ public BDD ConvertToCharSet(ICharAlgebra _, BDD pred) => pred;
+
+ /// <summary>This solver does not hold a minterm partition; always returns null.</summary>
+ public BDD[]? GetMinterms() => null;
+
+ /// <summary>
+ /// Pretty print the character predicate as a regex-like character class.
+ /// Empty and full sets, exact matches of \w \s \d and their complements, and single characters are
+ /// handled first. Predicates with more than 10 ranges are then expressed relative to \s, \d and \w
+ /// where they fit inside a combination of those classes. Otherwise the shorter of the plain range
+ /// list and the complemented range list is returned.
+ /// </summary>
+ public string PrettyPrint(BDD pred)
+ {
+ if (pred.IsEmpty)
+ return "[]";
+
+ //check if pred is full, show this case with a dot
+ if (pred.IsFull)
+ return ".";
+
+ // try to optimize representation involving common direct use of \d \w and \s to avoid blowup of ranges
+ BDD digit = SymbolicRegexRunner.s_unicode.CategoryCondition(8);
+ if (pred == SymbolicRegexRunner.s_unicode.WordLetterCondition)
+ return @"\w";
+ if (pred == SymbolicRegexRunner.s_unicode.WhiteSpaceCondition)
+ return @"\s";
+ if (pred == digit)
+ return @"\d";
+ if (pred == Not(SymbolicRegexRunner.s_unicode.WordLetterCondition))
+ return @"\W";
+ if (pred == Not(SymbolicRegexRunner.s_unicode.WhiteSpaceCondition))
+ return @"\S";
+ if (pred == Not(digit))
+ return @"\D";
+
+ (uint, uint)[] ranges = ToRanges(pred);
+
+ if (IsSingletonRange(ranges))
+ return Escape((char)ranges[0].Item1);
+
+ #region if too many ranges try to optimize the representation using \d \w etc.
+ // NOTE(review): s_unicode has already been dereferenced above, so the null check below looks redundant.
+ if (SymbolicRegexRunner.s_unicode != null && ranges.Length > 10)
+ {
+ BDD w = SymbolicRegexRunner.s_unicode.WordLetterCondition;
+ BDD W = Not(w);
+ BDD d = SymbolicRegexRunner.s_unicode.CategoryCondition(8);
+ BDD D = Not(d);
+ BDD asciiDigit = CreateCharSetFromRange('0', '9');
+ BDD nonasciiDigit = And(d, Not(asciiDigit));
+ BDD s = SymbolicRegexRunner.s_unicode.WhiteSpaceCondition;
+ BDD S = Not(s);
+ BDD wD = And(w, D);
+ BDD SW = And(S, W);
+ //s, d, wD, SW are the 4 main large minterms
+ //note: s|SW = W, d|wD = w
+ //
+ // Venn Diagram: s and w do not overlap, and d is contained in w
+ // ------------------------------------------------
+ // | |
+ // | SW ------------(w)-------- |
+ // | -------- | | |
+ // | | | | ---------- | |
+ // | | s | | wD | | | |
+ // | | | | | d | | |
+ // | -------- | | | | |
+ // | | ---------- | |
+ // | ----------------------- |
+ // ------------------------------------------------
+ //
+ //-------------------------------------------------------------------
+ // singletons: pred is contained in exactly one of the four minterms
+ // (each test Or(X, pred) == X checks that pred is a subset of X)
+ //---
+ if (Or(s, pred) == s)
+ return $"[^\\S{RepresentSet(And(s, Not(pred)))}]";
+ //---
+ if (Or(d, pred) == d)
+ return $"[^\\D{RepresentSet(And(d, Not(pred)))}]";
+ //---
+ if (Or(wD, pred) == wD)
+ return $"[\\w-[\\d{RepresentSet(And(wD, Not(pred)))}]]";
+ //---
+ if (Or(SW, pred) == SW)
+ return $"[^\\s\\w{RepresentSet(And(SW, Not(pred)))}]";
+ //-------------------------------------------------------------------
+ // unions of two
+ // s|SW
+ if (Or(W, pred) == W)
+ {
+ string? repr1 = null;
+ if (And(s, pred) == s)
+ {
+ //pred contains all of \s and is contained in \W
+ repr1 = $"[\\s{RepresentSet(And(S, pred))}]";
+ }
+
+ //the more common case is that pred is not \w and not some specific non-word character such as ':'
+ string repr2 = $"[^\\w{RepresentSet(And(W, Not(pred)))}]";
+ // prefer the shorter of the two representations when both are available
+ return repr1 != null && repr1.Length < repr2.Length ? repr1 : repr2;
+ }
+ //---
+ // s|d
+ BDD s_or_d = Or(s, d);
+ if (pred == s_or_d)
+ return "[\\s\\d]";
+
+ if (Or(s_or_d, pred) == s_or_d)
+ {
+ //check first if this is purely ascii range for digits
+ return And(pred, s).Equals(s) && And(pred, nonasciiDigit).IsEmpty ?
+ $"[\\s{RepresentRanges(ToRanges(And(pred, asciiDigit)), checkSingletonComlement: false)}]" :
+ $"[\\s\\d-[{RepresentSet(And(s_or_d, Not(pred)))}]]";
+ }
+ //---
+ // s|wD
+ BDD s_or_wD = Or(s, wD);
+ if (Or(s_or_wD, pred) == s_or_wD)
+ return $"[\\s\\w-[\\d{RepresentSet(And(s_or_wD, Not(pred)))}]]";
+ //---
+ // d|wD
+ if (Or(w, pred) == w)
+ return $"[\\w-[{RepresentSet(And(w, Not(pred)))}]]";
+ //---
+ // d|SW
+ BDD d_or_SW = Or(d, SW);
+ if (pred == d_or_SW)
+ return "\\d|[^\\s\\w]";
+ if (Or(d_or_SW, pred) == d_or_SW)
+ return $"[\\d-[{RepresentSet(And(d, Not(pred)))}]]|[^\\s\\w{RepresentSet(And(SW, Not(pred)))}]";
+ // wD|SW = S&D
+ BDD SD = Or(wD, SW);
+ if (Or(SD, pred) == SD)
+ return $"[^\\s\\d{RepresentSet(And(SD, Not(pred)))}]";
+ //-------------------------------------------------------------------
+ //unions of three
+ // s|SW|wD = D
+ if (Or(D, pred) == D)
+ return $"[^\\d{RepresentSet(And(D, Not(pred)))}]";
+ // SW|wD|d = S
+ if (Or(S, pred) == S)
+ return $"[^\\s{RepresentSet(And(S, Not(pred)))}]";
+ // s|SW|d = complement of wD = W|d
+ BDD W_or_d = Not(wD);
+ if (Or(W_or_d, pred) == W_or_d)
+ return $"[\\W\\d-[{RepresentSet(And(W_or_d, Not(pred)))}]]";
+ // s|wD|d = s|w
+ BDD s_or_w = Or(s, w);
+ if (Or(s_or_w, pred) == s_or_w)
+ return $"[\\s\\w-[{RepresentSet(And(s_or_w, Not(pred)))}]]";
+ //-------------------------------------------------------------------
+ //touches all four minterms, typically happens as the fallback arc in .* extension
+ }
+ #endregion
+
+ // Represent either the ranges or its complement, if the complement representation is more compact.
+ string ranges_repr = $"[{RepresentRanges(ranges, checkSingletonComlement: false)}]";
+ string ranges_compl_repr = $"[^{RepresentRanges(ToRanges(Not(pred)), checkSingletonComlement: false)}]";
+ return ranges_repr.Length <= ranges_compl_repr.Length ? ranges_repr : ranges_compl_repr;
+ }
+
+ /// <summary>Renders the set via RepresentRanges; the empty set renders as the empty string.</summary>
+ private string RepresentSet(BDD set)
+ {
+     if (set.IsEmpty)
+     {
+         return "";
+     }
+
+     return RepresentRanges(ToRanges(set));
+ }
+
+ /// <summary>
+ /// Renders the ranges as the inside of a character class: a one-element range is a single escaped
+ /// character, a two-element range is its two escaped characters, and a longer range is "first-last".
+ /// </summary>
+ /// <param name="ranges">the ranges to render</param>
+ /// <param name="checkSingletonComlement">
+ /// if true and the ranges cover everything from 0 to 0xFFFF except one character c, "^c" is returned instead
+ /// </param>
+ private static string RepresentRanges((uint, uint)[] ranges, bool checkSingletonComlement = true)
+ {
+     // Two ranges [0, c-1] and [c+1, 0xFFFF] describe the complement of the single character c.
+     bool isSingletonComplement =
+         checkSingletonComlement &&
+         ranges.Length == 2 &&
+         ranges[0].Item1 == 0 &&
+         ranges[1].Item2 == 0xFFFF &&
+         ranges[0].Item2 + 2 == ranges[1].Item1;
+     if (isSingletonComplement)
+     {
+         return "^" + Escape((char)(ranges[0].Item2 + 1));
+     }
+
+     var builder = new StringBuilder();
+     foreach ((uint lower, uint upper) in ranges)
+     {
+         // Every range starts with its lower bound.
+         builder.Append(Escape((char)lower));
+         if (lower == upper)
+         {
+             continue;
+         }
+
+         // Adjacent bounds are written back to back; anything wider gets a '-' separator.
+         if (upper != lower + 1)
+         {
+             builder.Append('-');
+         }
+
+         builder.Append(Escape((char)upper));
+     }
+
+     return builder.ToString();
+ }
+
+ /// <summary>Make an escaped string from a character.</summary>
+ /// <param name="c">The character to escape.</param>
+ /// <returns>
+ /// A backslash escape for the listed punctuation and control characters, the character itself for
+ /// other characters in the range 0x20..0x7E, \xHH for remaining characters up to 0xFF, and \uHHHH otherwise.
+ /// </returns>
+ private static string Escape(char c) =>
+     c switch
+     {
+         // control characters with a dedicated escape
+         '\0' => @"\0",
+         '\t' => @"\t",
+         '\n' => @"\n",
+         '\v' => @"\v",
+         '\f' => @"\f",
+         '\r' => @"\r",
+
+         // punctuation that is escaped with a backslash
+         '\"' => "\\\"",
+         '$' => @"\$",
+         '(' => @"\(",
+         ')' => @"\)",
+         '*' => @"\*",
+         '+' => @"\+",
+         '-' => @"\-",
+         '.' => @"\.",
+         ':' => @"\:",
+         '?' => @"\?",
+         '[' => @"\[",
+         '\\' => @"\\",
+         ']' => @"\]",
+         '^' => @"\^",
+         '{' => @"\{",
+         '|' => @"\|",
+         '}' => @"\}",
+
+         // remaining characters from ' ' (0x20) to '~' (0x7E) are shown as-is
+         >= ' ' and <= '~' => c.ToString(),
+
+         // everything else is shown as a hexadecimal escape
+         <= '\u00FF' => $"\\x{(uint)c:X2}",
+         _ => $"\\u{(uint)c:X4}",
+     };
+
+ /// <summary>Returns true iff there is exactly one range and its two bounds are equal.</summary>
+ private static bool IsSingletonRange((uint, uint)[] ranges)
+ {
+     if (ranges.Length != 1)
+     {
+         return false;
+     }
+
+     (uint lower, uint upper) = ranges[0];
+     return lower == upper;
+ }
+
+ /// <summary>Not supported by this solver; always throws <see cref="NotSupportedException"/>.</summary>
+ public override int CombineTerminals(BoolOp op, int terminal1, int terminal2) => throw new NotSupportedException();
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/IBooleanAlgebra.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/IBooleanAlgebra.cs
new file mode 100644
index 00000000000000..336871a5aa2ec3
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/IBooleanAlgebra.cs
@@ -0,0 +1,85 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// <summary>
+ /// Generic Boolean Algebra solver.
+ /// Provides operations for conjunction, disjunction, and negation.
+ /// Allows deciding whether a predicate is satisfiable and whether two predicates are equivalent.
+ /// </summary>
+ /// <typeparam name="T">predicates</typeparam>
+ internal interface IBooleanAlgebra
+ {
+ /// <summary>
+ /// Top element of the Boolean algebra, corresponds to the value true.
+ /// </summary>
+ T True { get; }
+
+ /// <summary>
+ /// Bottom element of the Boolean algebra, corresponds to the value false.
+ /// </summary>
+ T False { get; }
+
+ /// <summary>
+ /// Make a conjunction of predicate1 and predicate2.
+ /// </summary>
+ T And(T predicate1, T predicate2);
+
+ /// <summary>
+ /// Make a conjunction of all the predicates in the enumeration.
+ /// Returns True if the enumeration is empty.
+ /// </summary>
+ T And(IEnumerable predicates);
+
+ /// <summary>
+ /// Make a disjunction of predicate1 and predicate2.
+ /// </summary>
+ T Or(T predicate1, T predicate2);
+
+ /// <summary>
+ /// Make a disjunction of all the predicates in the enumeration.
+ /// Must return False if the enumeration is empty.
+ /// </summary>
+ T Or(IEnumerable predicates);
+
+ /// <summary>
+ /// Negate the predicate.
+ /// </summary>
+ T Not(T predicate);
+
+ /// <summary>
+ /// Returns true iff the predicate is satisfiable.
+ /// </summary>
+ bool IsSatisfiable(T predicate);
+
+ /// <summary>
+ /// Returns true iff predicate1 is equivalent to predicate2.
+ /// </summary>
+ bool AreEquivalent(T predicate1, T predicate2);
+
+ /// <summary>
+ /// True means that if two predicates are equivalent then their hashcodes are equal.
+ /// This is a weak form of extensionality.
+ /// </summary>
+ bool HashCodesRespectEquivalence { get; }
+
+ /// <summary>
+ /// True means that if two predicates are equivalent then they are identical.
+ /// </summary>
+ bool IsExtensional { get; }
+
+ /// <summary>
+ /// Given an array of constraints {c_1, c_2, ..., c_n} where n>=0,
+ /// enumerate all satisfiable Boolean combinations Tuple({b_1, b_2, ..., b_n}, c)
+ /// where c is satisfiable and equivalent to c'_1 &amp; c'_2 &amp; ... &amp; c'_n,
+ /// where c'_i = c_i if b_i = true and c'_i is Not(c_i) otherwise.
+ /// If n=0 return Tuple({},True).
+ /// NOTE(review): the declared return type is a list, not a sequence of tuples; confirm whether the
+ /// valuations b_i are meant to be part of the result or only the combined constraints c.
+ /// </summary>
+ /// <param name="constraints">array of constraints</param>
+ /// <returns>constraints that are satisfiable</returns>
+ List GenerateMinterms(params T[] constraints);
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/ICharAlgebra.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/ICharAlgebra.cs
new file mode 100644
index 00000000000000..a0f6a87e35d75a
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/ICharAlgebra.cs
@@ -0,0 +1,66 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// <summary>
+ /// Extends IBooleanAlgebra with character predicate solving and predicate pretty printing.
+ /// </summary>
+ /// <typeparam name="T">predicates</typeparam>
+ internal interface ICharAlgebra : IBooleanAlgebra
+ {
+ /// <summary>
+ /// Make a constraint describing the set of all characters between lower (inclusive) and upper (inclusive).
+ /// Add both uppercase and lowercase elements if caseInsensitive is true using the given culture
+ /// or the current culture when the given culture is null.
+ /// </summary>
+ T RangeConstraint(char lower, char upper, bool caseInsensitive = false, string? culture = null);
+
+ /// <summary>
+ /// Make a constraint describing a singleton set containing the character c, or
+ /// a set containing also the upper and lowercase versions of c if caseInsensitive is true.
+ /// </summary>
+ /// <param name="c">the given character</param>
+ /// <param name="caseInsensitive">if true include both the uppercase and the lowercase versions of the given character</param>
+ /// <param name="culture">given culture, if null then the current culture is assumed</param>
+ T CharConstraint(char c, bool caseInsensitive = false, string? culture = null);
+
+ /// <summary>
+ /// Make a term that encodes the given character set.
+ /// </summary>
+ T ConvertFromCharSet(BDDAlgebra bddAlg, BDD set);
+
+ /// <summary>
+ /// Compute the number of elements in the set
+ /// </summary>
+ ulong ComputeDomainSize(T set);
+
+ /// <summary>
+ /// Enumerate all characters in the set
+ /// </summary>
+ /// <param name="set">given set</param>
+ IEnumerable GenerateAllCharacters(T set);
+
+ /// <summary>
+ /// Convert a predicate into a set of characters.
+ /// </summary>
+ BDD ConvertToCharSet(ICharAlgebra bddalg, T pred);
+
+ /// <summary>
+ /// Gets the underlying character set solver.
+ /// </summary>
+ CharSetSolver CharSetProvider { get; }
+
+ /// <summary>
+ /// Returns the minterms (a partition of the full domain), or null when the algebra has none.
+ /// </summary>
+ T[]? GetMinterms();
+
+ /// <summary>
+ /// Pretty print the character predicate
+ /// </summary>
+ string PrettyPrint(T pred);
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/MintermGenerator.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/MintermGenerator.cs
new file mode 100644
index 00000000000000..67fdcf10e864f5
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/MintermGenerator.cs
@@ -0,0 +1,249 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// <summary>Provides a generic implementation for minterm generation over a given Boolean Algebra.</summary>
+ /// <typeparam name="TPredicate">type of predicates</typeparam>
+ /// <remarks>
+ /// The minterms for a set of predicates are all their non-overlapping, satisfiable Boolean combinations. For example,
+ /// if the predicates are [0-9] and [0-4], then there are three minterms: [0-4], [5-9] and [^0-9]. Notably, there is no
+ /// minterm corresponding to "[0-4] and not [0-9]", since that is unsatisfiable.
+ /// </remarks>
+ internal sealed class MintermGenerator where TPredicate : notnull
+ {
+ private readonly IBooleanAlgebra _algebra;
+
+ /// <summary>Constructs a minterm generator for a given Boolean Algebra.</summary>
+ /// <param name="algebra">given Boolean Algebra</param>
+ public MintermGenerator(IBooleanAlgebra algebra)
+ {
+ // check that we can rely on equivalent predicates having the same hashcode, which EquivalenceClass assumes
+ Debug.Assert(algebra.HashCodesRespectEquivalence);
+
+ _algebra = algebra;
+ }
+
+ /// <summary>
+ /// Given an array of predicates {p_1, p_2, ..., p_n} where n>=0,
+ /// compute the minterms: every satisfiable predicate p equivalent to p'_1 &amp; p'_2 &amp; ... &amp; p'_n,
+ /// where each p'_i is either p_i or Not(p_i).
+ /// If n=0 the result is the single predicate True.
+ /// Only the combined predicates are returned, not the choices made for each p_i.
+ /// </summary>
+ /// <param name="preds">array of predicates</param>
+ /// <returns>all minterms of the given predicate sequence</returns>
+ public List GenerateMinterms(params TPredicate[] preds)
+ {
+ if (preds.Length == 0)
+ {
+ return new List { _algebra.True };
+ }
+
+ // The minterms will be solved using non-equivalent predicates, i.e., the equivalence classes of preds.
+ // The set below records which equivalence classes have already been seen, so that only the first
+ // predicate of each class is pushed into the partition tree.
+
+ var tree = new PartitionTree(_algebra);
+
+ var seen = new HashSet();
+ for (int i = 0; i < preds.Length; i++)
+ {
+ // Use a wrapper that overloads Equals to be logical equivalence as the key
+ if (seen.Add(new EquivalenceClass(_algebra, preds[i])))
+ {
+ // Push each equivalence class into the partition tree
+ tree.Refine(preds[i]);
+ }
+ }
+
+ // Return all minterms as the leaves of the partition tree
+ return tree.GetLeafPredicates();
+ }
+
+ /// <summary>Wraps a predicate as an equivalence class object whose Equals method is equivalence checking.</summary>
+ private readonly struct EquivalenceClass
+ {
+ private readonly TPredicate _set;
+ private readonly IBooleanAlgebra _algebra;
+
+ internal EquivalenceClass(IBooleanAlgebra algebra, TPredicate set)
+ {
+ _set = set;
+ _algebra = algebra;
+ }
+
+ // The hash code is the wrapped predicate's own; this is only consistent with Equals below because
+ // the algebra's hash codes respect equivalence (asserted in the MintermGenerator constructor).
+ public override int GetHashCode() => _set.GetHashCode();
+
+ public override bool Equals([NotNullWhen(true)] object? obj) => obj is EquivalenceClass ec && _algebra.AreEquivalent(_set, ec._set);
+ }
+
+ /// <summary>A partition tree for efficiently solving minterms.</summary>
+ /// <remarks>
+ /// Predicates are pushed into the tree with Refine(), which creates leaves in the tree for all satisfiable
+ /// and non-overlapping combinations with any previously pushed predicates. At the end of the process the
+ /// minterms can be read from the paths to the leaves of the tree.
+ ///
+ /// The valuations of the predicates are represented as follows. Given a path a^-1, a^0, a^1, ..., a^n, predicate
+ /// p^i is true in the corresponding minterm if and only if a^i is the left child of a^i-1.
+ ///
+ /// This class assumes that all predicates passed to Refine() are non-equivalent.
+ /// </remarks>
+ private sealed class PartitionTree
+ {
+ internal readonly TPredicate _pred;
+ private readonly IBooleanAlgebra _solver;
+ private PartitionTree? _left;
+ private PartitionTree? _right; // complement
+
+ /// <summary>Create the root of the partition tree.</summary>
+ /// <remarks>Nodes below this will be indexed starting from 0. The initial predicate is true.</remarks>
+ internal PartitionTree(IBooleanAlgebra solver) : this(solver, solver.True, null, null) { }
+
+ private PartitionTree(IBooleanAlgebra solver, TPredicate pred, PartitionTree? left, PartitionTree? right)
+ {
+ _solver = solver;
+ _pred = pred;
+ _left = left;
+ _right = right;
+ }
+
+ /// <summary>
+ /// Push the predicate into the subtree rooted at this node, so that every leaf below is extended with
+ /// a left child (conjunction with other), a right child (conjunction with its negation), or both.
+ /// </summary>
+ internal void Refine(TPredicate other)
+ {
+ if (_left is null && _right is null)
+ {
+ // If this is a leaf node create left and/or right children for the new predicate
+ TPredicate thisAndOther = _solver.And(_pred, other);
+ if (_solver.IsSatisfiable(thisAndOther))
+ {
+ // The predicates overlap, now check if this is contained in other
+ TPredicate thisMinusOther = _solver.And(_pred, _solver.Not(other));
+ if (_solver.IsSatisfiable(thisMinusOther))
+ {
+ // This is not contained in other, both children are needed
+ _left = new PartitionTree(_solver, thisAndOther, null, null);
+
+ // The right child corresponds to a conjunction with a negation, which matches thisMinusOther
+ _right = new PartitionTree(_solver, thisMinusOther, null, null);
+ }
+ else // [[this]] subset of [[other]]
+ {
+ // This implies other, so only the left child is needed and it keeps this node's predicate
+ _left = new PartitionTree(_solver, _pred, null, null);
+ }
+ }
+ else // [[this]] subset of [[not(other)]]
+ {
+ // This implies the negation of other, so only the right child is needed and it keeps this node's predicate
+ _right = new PartitionTree(_solver, _pred, null, null); // other is unsatisfiable within this
+ }
+ }
+ else if (_left is null)
+ {
+ // No choice has to be made here, refine the single child that exists
+ _right!.Refine(other);
+ }
+ else if (_right is null)
+ {
+ // No choice has to be made here, refine the single child that exists
+ _left!.Refine(other);
+ }
+ else
+ {
+ TPredicate thisAndOther = _solver.And(_pred, other);
+ if (_solver.IsSatisfiable(thisAndOther))
+ {
+ // Other is satisfiable in this subtree
+ TPredicate thisMinusOther = _solver.And(_pred, _solver.Not(other));
+ if (_solver.IsSatisfiable(thisMinusOther))
+ {
+ // But this subtree is not contained in other, refine both children
+ _left.Refine(other);
+ _right.Refine(other);
+ }
+ else // [[this]] subset of [[other]]
+ {
+ // The whole subtree is contained in other, include other in all minterms under here
+ _left.ExtendLeft();
+ _right.ExtendLeft();
+ }
+ }
+ else // [[this]] subset of [[not(other)]]
+ {
+ // Other is not satisfiable in this subtree, include its negation in all minterms under here
+ _left.ExtendRight();
+ _right.ExtendRight();
+ }
+ }
+ }
+
+ /// <summary>
+ /// Include the next predicate in all minterms under this node by giving every leaf a left child with
+ /// the leaf's own predicate. Assumes the predicate of this node implies the next predicate.
+ /// </summary>
+ private void ExtendLeft()
+ {
+ if (_left is null && _right is null)
+ {
+ _left = new PartitionTree(_solver, _pred, null, null);
+ }
+ else
+ {
+ _left?.ExtendLeft();
+ _right?.ExtendLeft();
+ }
+ }
+
+ /// <summary>
+ /// Include the negation of next predicate in all minterms under this node by giving every leaf a right
+ /// child with the leaf's own predicate. Assumes the predicate of this node implies the negation of the
+ /// next predicate.
+ /// </summary>
+ private void ExtendRight()
+ {
+ if (_left is null && _right is null)
+ {
+ _right = new PartitionTree(_solver, _pred, null, null);
+ }
+ else
+ {
+ _left?.ExtendRight();
+ _right?.ExtendRight();
+ }
+ }
+
+ /// <summary>Get the predicates from all of the leaves in the tree.</summary>
+ internal List GetLeafPredicates()
+ {
+ var leaves = new List();
+
+ // Iterative traversal with an explicit stack instead of recursion
+ var stack = new Stack();
+ stack.Push(this);
+ while (stack.TryPop(out PartitionTree? node))
+ {
+ if (node._left is null && node._right is null)
+ {
+ leaves.Add(node._pred);
+ }
+ else
+ {
+ if (node._left is not null)
+ {
+ stack.Push(node._left);
+ }
+
+ if (node._right is not null)
+ {
+ stack.Push(node._right);
+ }
+ }
+ }
+
+ return leaves;
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BooleanClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BooleanClassifier.cs
new file mode 100644
index 00000000000000..b10fdd174efb72
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BooleanClassifier.cs
@@ -0,0 +1,62 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Runtime.CompilerServices;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// <summary>Classifies characters as true or false based on a supplied <see cref="BDD"/>.</summary>
+ /// <remarks>
+ /// The classification is determined entirely by the BDD used to construct the classifier, and in fact
+ /// simply calling Contains on the BDD instead of using the classifier would suffice from a correctness
+ /// perspective. The classifier as a wrapper for the BDD is valuable in order to optimize for ASCII, as
+ /// it precomputes the results for ASCII inputs and stores them in a separate table, only falling back
+ /// to using the BDD for non-ASCII.
+ /// </remarks>
+ internal sealed class BooleanClassifier
+ {
+ /// <summary>Lookup table used for ASCII characters.</summary>
+ private readonly bool[] _ascii;
+ /// <summary>BDD used for non-ASCII characters.</summary>
+ private readonly BDD _nonAscii;
+
+ /// <summary>Create a Boolean classifier.</summary>
+ /// <param name="solver">Character algebra (the algebra is not stored in the classifier)</param>
+ /// <param name="bdd">Elements that map to true.</param>
+ public BooleanClassifier(CharSetSolver solver, BDD bdd)
+ {
+ // We want to optimize for ASCII, so query the BDD for each ASCII character in
+ // order to precompute a lookup table we'll use at match time.
+ var ascii = new bool[128];
+ for (int i = 0; i < ascii.Length; i++)
+ {
+ ascii[i] = bdd.Contains(i);
+ }
+
+ // At this point, we'll never consult the BDD for ASCII characters, so as an
+ // optimization we can remove them from the BDD in hopes of simplifying it and making
+ // it faster to query for the non-ASCII characters we will use it for. However, while
+ // this is typically an optimization, it isn't always: the act of removing some
+ // characters from the BDD can actually make the branching more complicated. The
+ // extreme case of this is when the BDD is True, meaning everything maps to True, which
+ // is as simple a BDD as you can get. In such a case, even though it's rare, this would
+ // definitively be a deoptimization, so we avoid doing so. Other trivial cases are handled
+ // by And itself, e.g. if the BDD == False, then And will just return False.
+ if (!bdd.IsFull)
+ {
+ bdd = solver.And(solver._nonAscii, bdd);
+ }
+
+ _ascii = ascii;
+ _nonAscii = bdd;
+ }
+
+ /// <summary>Gets whether the specified character is classified as true.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public bool IsTrue(char c)
+ {
+ bool[] ascii = _ascii;
+ return c < ascii.Length ? ascii[c] : _nonAscii.Contains(c);
+ }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs
new file mode 100644
index 00000000000000..39f95920da31b3
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs
@@ -0,0 +1,44 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ internal static class CharKind
+ {
+ /// <summary>Number of kinds of chars.</summary>
+ internal const int CharKindCount = 5;
+
+ /// <summary>All characters other than those in the four other kinds.</summary>
+ internal const uint General = 0;
+
+ /// <summary>Start or Stop of input (bit 0 is 1)</summary>
+ internal const uint StartStop = 1;
+
+ /// <summary>New line character (\n) (bit 1 is 1)</summary>
+ internal const uint Newline = 2;
+
+ /// <summary>Last \n or first \n in reverse mode (both Newline and StartStop bits are 1)</summary>
+ internal const uint NewLineS = 3;
+
+ /// <summary>Word letter (bit 2 is 1)</summary>
+ internal const uint WordLetter = 4;
+
+ /// <summary>Gets the previous character kind from a context (the low three bits)</summary>
+ internal static uint Prev(uint context) => context & 0x7;
+
+ /// <summary>Gets the next character kind from a context (the bits above the low three)</summary>
+ internal static uint Next(uint context) => context >> 3;
+
+ /// <summary>Creates the context of the previous and the next character kinds.</summary>
+ internal static uint Context(uint prevKind, uint nextKind) => (nextKind << 3) | prevKind;
+
+ internal static string DescribePrev(uint i) => i switch
+ {
+ StartStop => @"\A",
+ Newline => @"\n",
+ NewLineS => @"\A\n",
+ WordLetter => @"\w",
+ _ => string.Empty,
+ };
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs
new file mode 100644
index 00000000000000..254a7fd94407d2
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs
@@ -0,0 +1,138 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Net;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// <summary>Captures a state of a DFA explored during matching.</summary>
+ internal sealed class DfaMatchingState where T : notnull
+ {
+ internal DfaMatchingState(SymbolicRegexNode node, uint prevCharKind)
+ {
+ Node = node;
+ PrevCharKind = prevCharKind;
+ }
+
+ internal SymbolicRegexNode Node { get; }
+ internal uint PrevCharKind { get; }
+
+ internal int Id { get; set; }
+ internal bool IsInitialState { get; set; }
+
+ /// <summary>State is lazy</summary>
+ internal bool IsLazy => Node._info.IsLazy;
+
+ /// <summary>This is a deadend state</summary>
+ internal bool IsDeadend => Node.IsNothing;
+
+ /// <summary>The node must be nullable here. Yields -1 when the node is neither a WatchDog nor an Or node.</summary>
+ internal int WatchDog
+ {
+ get
+ {
+ if (Node._kind == SymbolicRegexKind.WatchDog)
+ {
+ return Node._lower;
+ }
+
+ if (Node._kind == SymbolicRegexKind.Or)
+ {
+ Debug.Assert(Node._alts is not null);
+ return Node._alts._watchdog;
+ }
+
+ return -1;
+ }
+ }
+
+ /// <summary>If true then the state is a dead-end, rejects all inputs.</summary>
+ internal bool IsNothing => Node.IsNothing;
+
+ /// <summary>If true then state starts with a ^ or $ or \A or \z or \Z</summary>
+ internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor;
+
+ /// <summary>
+ /// Compute the target state for the given input minterm.
+ /// If <paramref name="minterm"/> is False this means that this is \n and it is the last character of the input.
+ /// </summary>
+ /// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
+ internal DfaMatchingState Next(T minterm)
+ {
+ ICharAlgebra alg = Node._builder._solver;
+ T wordLetterPredicate = Node._builder._wordLetterPredicateForAnchors;
+ T newLinePredicate = Node._builder._newLinePredicate;
+
+ // minterm == solver.False is used to represent the very last \n
+ uint nextCharKind = 0;
+ if (alg.False.Equals(minterm))
+ {
+ nextCharKind = CharKind.NewLineS;
+ minterm = newLinePredicate;
+ }
+ else if (newLinePredicate.Equals(minterm))
+ {
+ // If the previous state was the start state, mark this as the very FIRST \n.
+ // Essentially, this looks the same as the very last \n and is used to nullify
+ // rev(\Z) in the context of a reversed automaton.
+ nextCharKind = PrevCharKind == CharKind.StartStop ?
+ CharKind.NewLineS :
+ CharKind.Newline;
+ }
+ else if (alg.IsSatisfiable(alg.And(wordLetterPredicate, minterm)))
+ {
+ nextCharKind = CharKind.WordLetter;
+ }
+
+ // Combined character context
+ uint context = CharKind.Context(PrevCharKind, nextCharKind);
+
+ // Compute the derivative of the node for the given context
+ SymbolicRegexNode derivative = Node.MkDerivative(minterm, context);
+
+ // nextCharKind will be the PrevCharKind of the target state
+ // use an existing state instead if one exists already
+ // otherwise create a new id for it
+ return Node._builder.MkState(derivative, nextCharKind);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal bool IsNullable(uint nextCharKind)
+ {
+ Debug.Assert(nextCharKind is 0 or CharKind.StartStop or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS);
+ uint context = CharKind.Context(PrevCharKind, nextCharKind);
+ return Node.IsNullableFor(context);
+ }
+
+ public override bool Equals(object? obj) =>
+ obj is DfaMatchingState s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node);
+
+ public override int GetHashCode() => (PrevCharKind, Node).GetHashCode();
+
+ public override string ToString() =>
+ PrevCharKind == 0 ? Node.ToString() :
+ $"({CharKind.DescribePrev(PrevCharKind)},{Node})";
+
+ internal string DgmlView
+ {
+ get
+ {
+ string info = CharKind.DescribePrev(PrevCharKind);
+ if (info != string.Empty)
+ {
+ info = $"Previous: {info}
";
+ }
+
+ string deriv = WebUtility.HtmlEncode(Node.ToString());
+ if (deriv == string.Empty)
+ {
+ deriv = "()";
+ }
+
+ return $"{info}{deriv}";
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/DgmlWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/DgmlWriter.cs
new file mode 100644
index 00000000000000..cbd134be453635
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/DgmlWriter.cs
@@ -0,0 +1,241 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#if DEBUG
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+
+namespace System.Text.RegularExpressions.Symbolic.DGML
+{
+ internal sealed class DgmlWriter
+ {
+ private readonly int _maxDgmlTransitionLabelLength;
+ private readonly TextWriter _tw;
+ private readonly bool _hideStateInfo;
+ private readonly bool _onlyDFAinfo;
+
+ internal DgmlWriter(TextWriter tw, bool hideStateInfo, int maxDgmlTransitionLabelLength = -1, bool onlyDFAinfo = false)
+ {
+ _maxDgmlTransitionLabelLength = maxDgmlTransitionLabelLength;
+ _tw = tw;
+ _hideStateInfo = hideStateInfo;
+ _onlyDFAinfo = onlyDFAinfo;
+ }
+
+ /// <summary>
+ /// Write the automaton in dgml format into the textwriter.
+ /// </summary>
+ public void Write(IAutomaton fa)
+ {
+ var nonEpsilonMoves = new Dictionary<(int, int), List>();
+ var epsilonmoves = new List>();
+
+ var nonEpsilonStates = new HashSet();
+
+ foreach (Move move in fa.GetMoves())
+ {
+ if (move.IsEpsilon)
+ {
+ epsilonmoves.Add(move);
+ }
+ else
+ {
+ nonEpsilonStates.Add(move.SourceState);
+ var p = (move.SourceState, move.TargetState);
+ if (!nonEpsilonMoves.TryGetValue(p, out List? rules))
+ {
+ rules = new List();
+ nonEpsilonMoves[p] = rules;
+ }
+
+ Debug.Assert(move.Label is not null);
+ rules.Add(move.Label);
+ }
+ }
+
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("", GetDFAInfo(fa));
+ _tw.WriteLine("", GetDFAInfo(fa));
+ if (_onlyDFAinfo)
+ {
+ _tw.WriteLine("");
+ }
+ else
+ {
+ foreach (int state in fa.GetStates())
+ {
+ _tw.WriteLine("", state, _hideStateInfo ? "Collapsed" : "Expanded", GetStateInfo(fa, state));
+ if (state == fa.InitialState)
+ {
+ _tw.WriteLine("");
+ }
+ if (fa.IsFinalState(state))
+ {
+ _tw.WriteLine("");
+ }
+ _tw.WriteLine("");
+ _tw.WriteLine("", state, GetStateInfo(fa, state));
+ }
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("", fa.InitialState, fa.DescribeStartLabel());
+ _tw.WriteLine("");
+
+ foreach (Move move in epsilonmoves)
+ {
+ _tw.WriteLine("", move.SourceState, move.TargetState);
+ }
+
+ foreach (KeyValuePair<(int, int), List> move in nonEpsilonMoves)
+ {
+ _tw.WriteLine(GetNonFinalRuleInfo(fa, move.Key.Item1, move.Key.Item2, move.Value));
+ }
+
+ foreach (int state in fa.GetStates())
+ {
+ _tw.WriteLine("", state);
+ }
+
+ _tw.WriteLine("");
+ WriteCategoriesAndStyles();
+ }
+ _tw.WriteLine("");
+ }
+
+ private string GetDFAInfo(IAutomaton fa)
+ {
+ StringBuilder sb = new();
+ sb.Append("|Q|=");
+ sb.Append(fa.StateCount);
+ sb.Append("
");
+ sb.Append('|');
+ sb.Append(DeltaCapital);
+ sb.Append("|=");
+ sb.Append(fa.TransitionCount);
+ sb.Append("
");
+ sb.Append('|');
+ sb.Append(SigmalCapital);
+ sb.Append("|=");
+ sb.Append(fa.Alphabet.Length);
+ sb.Append("
");
+ sb.Append(SigmalCapital);
+ sb.Append('=');
+ for (int i = 0; i < fa.Alphabet.Length; i++)
+ {
+ if (i > 0)
+ sb.Append(',');
+ sb.Append(fa.DescribeLabel(fa.Alphabet[i]));
+ }
+ return sb.ToString();
+ }
+
+ private const string DeltaCapital = "Δ";
+ private const string SigmalCapital = "Σ";
+
+ private static string GetStateInfo(IAutomaton fa, int state)
+ {
+ StringBuilder sb = new();
+ sb.Append(fa.DescribeState(state));
+ return sb.ToString();
+ }
+
+ private string GetNonFinalRuleInfo(IAutomaton aut, int source, int target, List rules)
+ {
+ string lab = "";
+ string info = "";
+ for (int i = 0; i < rules.Count; i++)
+ {
+ lab += (lab == "" ? "" : ",\n ") + aut.DescribeLabel(rules[i]);
+ }
+
+ int lab_length = lab.Length;
+ if (_maxDgmlTransitionLabelLength >= 0 && lab_length > _maxDgmlTransitionLabelLength)
+ {
+ info += $" FullLabel = \"{lab}\"";
+ lab = string.Concat(lab.AsSpan(0, _maxDgmlTransitionLabelLength), "..");
+ }
+
+ return $"";
+ }
+
+ private void WriteCategoriesAndStyles()
+ {
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ //_tw.WriteLine("");
+ //_tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ _tw.WriteLine("");
+ }
+ }
+}
+#endif
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/IAutomaton.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/IAutomaton.cs
new file mode 100644
index 00000000000000..f6237967905f49
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/IAutomaton.cs
@@ -0,0 +1,66 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#if DEBUG
+using System.Collections.Generic;
+
+namespace System.Text.RegularExpressions.Symbolic.DGML
+{
+ /// <summary>
+ /// For accessing the key components of an automaton.
+ /// </summary>
+ /// <typeparam name="TLabel">type of labels in moves</typeparam>
+ internal interface IAutomaton
+ {
+ /// <summary>
+ /// Enumerates all moves of the automaton.
+ /// </summary>
+ IEnumerable> GetMoves();
+
+ /// <summary>
+ /// Enumerates all states of the automaton.
+ /// </summary>
+ IEnumerable GetStates();
+
+ /// <summary>
+ /// Returns the minterm partition of the alphabet.
+ /// </summary>
+ TLabel[] Alphabet { get; }
+
+ /// <summary>
+ /// Provides a description of the state for visualization purposes.
+ /// </summary>
+ string DescribeState(int state);
+
+ /// <summary>
+ /// Provides a description of the label for visualization purposes.
+ /// </summary>
+ string DescribeLabel(TLabel lab);
+
+ /// <summary>
+ /// Provides a description of the start label for visualization purposes.
+ /// </summary>
+ string DescribeStartLabel();
+
+ /// <summary>
+ /// The initial state of the automaton.
+ /// </summary>
+ int InitialState { get; }
+
+ /// <summary>
+ /// The number of states of the automaton.
+ /// </summary>
+ int StateCount { get; }
+
+ /// <summary>
+ /// The number of transitions of the automaton.
+ /// </summary>
+ int TransitionCount { get; }
+
+ /// <summary>
+ /// Returns true iff the state is a final state.
+ /// </summary>
+ bool IsFinalState(int state);
+ }
+}
+#endif
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/Move.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/Move.cs
new file mode 100644
index 00000000000000..e25e3b720a9017
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/Move.cs
@@ -0,0 +1,77 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#if DEBUG
+using System.Diagnostics.CodeAnalysis;
+
+namespace System.Text.RegularExpressions.Symbolic.DGML
+{
+ /// <summary>
+ /// Represents a move of a symbolic finite automaton.
+ /// The value default(TLabel) is reserved to represent the label of an epsilon move.
+ /// Thus if TLabel is a reference type the label of an epsilon move is null.
+ /// </summary>
+ /// <typeparam name="TLabel">the type of the labels on moves</typeparam>
+ internal sealed class Move
+ {
+ /// <summary>
+ /// Source state of the move
+ /// </summary>
+ public readonly int SourceState;
+ /// <summary>
+ /// Target state of the move
+ /// </summary>
+ public readonly int TargetState;
+ /// <summary>
+ /// Label of the move
+ /// </summary>
+ public readonly TLabel? Label;
+
+ /// <summary>
+ /// Transition of an automaton.
+ /// </summary>
+ /// <param name="sourceState">source state of the transition</param>
+ /// <param name="targetState">target state of the transition</param>
+ /// <param name="lab">label of the transition</param>
+ public Move(int sourceState, int targetState, TLabel? lab)
+ {
+ SourceState = sourceState;
+ TargetState = targetState;
+ Label = lab;
+ }
+
+ /// <summary>
+ /// Creates a move. Creates an epsilon move if label is default(TLabel).
+ /// </summary>
+ public static Move Create(int sourceState, int targetState, TLabel condition) => new Move(sourceState, targetState, condition);
+
+ /// <summary>
+ /// Creates an epsilon move. Same as Create(sourceState, targetState, default(TLabel)).
+ /// </summary>
+ public static Move Epsilon(int sourceState, int targetState) => new Move(sourceState, targetState, default);
+
+ /// <summary>
+ /// Returns true if label equals default(TLabel).
+ /// </summary>
+ public bool IsEpsilon => Equals(Label, default(TLabel));
+
+ /// <summary>
+ /// Returns true if the source state and the target state are identical
+ /// </summary>
+ public bool IsSelfLoop => SourceState == TargetState;
+
+ /// <summary>
+ /// Returns true if obj is a move with the same source state, target state, and label; a true result implies obj is not null.
+ /// </summary>
+ public override bool Equals([NotNullWhen(true)] object? obj) =>
+ obj is Move t &&
+ t.SourceState == SourceState &&
+ t.TargetState == TargetState &&
+ (t.Label is null ? Label is null : t.Label.Equals(Label));
+
+ public override int GetHashCode() => (SourceState, Label, TargetState).GetHashCode();
+
+ public override string ToString() => $"({SourceState},{(Equals(Label, default(TLabel)) ? "" : Label + ",")}{TargetState})";
+ }
+}
+#endif
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/RegexAutomaton.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/RegexAutomaton.cs
new file mode 100644
index 00000000000000..56273393331812
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/RegexAutomaton.cs
@@ -0,0 +1,140 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#if DEBUG
+using System.Collections.Generic;
+using System.Diagnostics;
+
+namespace System.Text.RegularExpressions.Symbolic.DGML
+{
+ /// <summary>
+ /// Used by DgmlWriter to unwind a regex into a DFA up to a bound that limits the number of states
+ /// </summary>
+ internal sealed class RegexAutomaton : IAutomaton<(SymbolicRegexNode?, T)> where T : notnull
+ {
+ private readonly DfaMatchingState _q0;
+ private readonly List _states = new();
+ private readonly HashSet _stateSet = new();
+ private readonly List?, T)>> _moves = new();
+ private readonly SymbolicRegexBuilder _builder;
+ private SymbolicNFA? _nfa;
+
+ internal RegexAutomaton(SymbolicRegexMatcher srm, int bound, bool addDotStar, bool inReverse, bool asNFA)
+ {
+ _builder = srm._builder;
+ uint startId = inReverse ?
+ (srm._reversePattern._info.StartsWithLineAnchor ? CharKind.StartStop : 0) :
+ (srm._pattern._info.StartsWithLineAnchor ? CharKind.StartStop : 0);
+
+ //inReverse only matters if Ar contains some line anchor
+ _q0 = _builder.MkState(inReverse ? srm._reversePattern : (addDotStar ? srm._dotStarredPattern : srm._pattern), startId);
+
+ if (asNFA)
+ {
+ _nfa = _q0.Node.Explore(bound);
+ for (int q = 0; q < _nfa.StateCount; q++)
+ {
+ _states.Add(q);
+ foreach ((T, SymbolicRegexNode?, int) branch in _nfa.EnumeratePaths(q))
+ _moves.Add(Move<(SymbolicRegexNode?, T)>.Create(q, branch.Item3, (branch.Item2, branch.Item1)));
+ }
+ }
+ else
+ {
+ Dictionary<(int, int), T> normalizedmoves = new();
+ Stack> stack = new();
+ stack.Push(_q0);
+ _states.Add(_q0.Id);
+ _stateSet.Add(_q0.Id);
+
+ T[]? partition = _builder._solver.GetMinterms();
+ Debug.Assert(partition is not null);
+ //unwind until the stack is empty or the bound has been reached (a bound <= 0 means no limit)
+ while (stack.Count > 0 && (bound <= 0 || _states.Count < bound))
+ {
+ DfaMatchingState q = stack.Pop();
+ foreach (T c in partition)
+ {
+ DfaMatchingState p = q.Next(c);
+
+ // check that p is not a dead-end
+ if (!p.IsNothing)
+ {
+ if (_stateSet.Add(p.Id))
+ {
+ stack.Push(p);
+ _states.Add(p.Id);
+ }
+
+ var qp = (q.Id, p.Id);
+ normalizedmoves[qp] = normalizedmoves.TryGetValue(qp, out T? existing) ?
+ _builder._solver.Or(existing, c) :
+ c;
+ }
+ }
+ }
+
+ foreach (KeyValuePair<(int, int), T> entry in normalizedmoves)
+ _moves.Add(Move<(SymbolicRegexNode?, T)>.Create(entry.Key.Item1, entry.Key.Item2, (null, entry.Value)));
+ }
+ }
+
+ public (SymbolicRegexNode?, T)[] Alphabet
+ {
+ get
+ {
+ T[]? alphabet = _builder._solver.GetMinterms();
+ Debug.Assert(alphabet is not null);
+ var results = new (SymbolicRegexNode?, T)[alphabet.Length];
+ for (int i = 0; i < alphabet.Length; i++)
+ {
+ results[i] = (null, alphabet[i]);
+ }
+ return results;
+ }
+ }
+
+ public int InitialState => _nfa is not null ? 0 : _q0.Id;
+
+ public int StateCount => _states.Count;
+
+ public int TransitionCount => _moves.Count;
+
+ public string DescribeLabel((SymbolicRegexNode?, T) lab) =>
+ lab.Item1 is null ? Net.WebUtility.HtmlEncode(_builder._solver.PrettyPrint(lab.Item2)) :
+ // Conditional nullability based on anchors
+ Net.WebUtility.HtmlEncode($"{lab.Item1}/{_builder._solver.PrettyPrint(lab.Item2)}");
+
+ public string DescribeStartLabel() => "";
+
+ public string DescribeState(int state)
+ {
+ if (_nfa is not null)
+ {
+ Debug.Assert(state < _nfa.StateCount);
+ var str = Net.WebUtility.HtmlEncode(_nfa.GetNode(state).ToString());
+ return _nfa.IsUnexplored(state) ? $"Unexplored:{str}" : str;
+ }
+
+ Debug.Assert(_builder._statearray is not null);
+ return _builder._statearray[state].DgmlView;
+ }
+
+ public IEnumerable GetStates() => _states;
+
+ public bool IsFinalState(int state)
+ {
+ if (_nfa is not null)
+ {
+ Debug.Assert(state < _nfa.StateCount);
+ return _nfa.CanBeNullable(state);
+ }
+
+ Debug.Assert(_builder._statearray is not null && state < _builder._statearray.Length);
+ return _builder._statearray[state].Node.CanBeNullable;
+ }
+
+ public IEnumerable?, T)>> GetMoves() => _moves;
+ }
+}
+#endif
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
new file mode 100644
index 00000000000000..e42576421e3ce2
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -0,0 +1,100 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// <summary>Classifies characters as their corresponding minterm IDs.</summary>
+ /// <remarks>
+ /// Minterms are a mechanism for compressing the input character space, or "alphabet",
+ /// by creating an equivalence class for all characters treated the same. For example,
+ /// in the expression "[0-9]*", the 10 digits 0 through 9 are all treated the same as each
+ /// other, and every other of the 65,526 characters are treated the same as each other,
+ /// so there are two minterms, one for the digits, and one for everything else. Minterms
+ /// are computed in such a way that every character maps to one and only one minterm.
+ /// While in the limit there could be one minterm per character, in practice the number
+ /// of minterms for any reasonable expression is way less, and in fact is typically
+ /// less than 64.
+ /// </remarks>
+ internal sealed class MintermClassifier
+ {
+ /// <summary>An array used when there's a single minterm, in order to map every ASCII character to it trivially.</summary>
+ private static readonly int[] AllAsciiIsZeroMintermArray = new int[128];
+
+ /// <summary>Array providing fast mapping from an ASCII character (the array index) to its corresponding minterm ID.</summary>
+ private readonly int[] _ascii;
+ /// <summary>A multi-terminal BDD for mapping any non-ASCII character to its associated minterm ID.</summary>
+ /// <remarks>
+ /// The use of a multi-terminal BDD here is an implementation detail. Should we decide it's important to optimize non-ASCII inputs further,
+ /// or to consolidate the mechanism with the other engines, an alternative lookup algorithm / data structure could be employed.
+ /// </remarks>
+ private readonly BDD _nonAscii;
+
+ /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
+ /// <param name="solver">Character algebra</param>
+ /// <param name="minterms">The minterm BDDs; the array index of each BDD is used as the ID of its minterm. Must not be empty.</param>
+ public MintermClassifier(CharSetSolver solver, BDD[] minterms)
+ {
+ Debug.Assert(minterms.Length > 0, "Requires at least one minterm");
+
+ if (minterms.Length == 1)
+ {
+ // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
+ // For ASCII, use an array containing all zeros. For non-ASCII, use a BDD that maps everything to 0.
+ _ascii = AllAsciiIsZeroMintermArray;
+ _nonAscii = solver.ReplaceTrue(BDD.True, 0);
+ return;
+ }
+
+ // Create a multi-terminal BDD for mapping any character to its associated minterm.
+ BDD anyCharacterToMintermId = BDD.False;
+ for (int i = 0; i < minterms.Length; i++)
+ {
+ // Each supplied minterm BDD decides whether a given character maps to it or not.
+ // We need to combine all of those into a multi-terminal BDD that decides which
+ // minterm a character maps to. To do that, we take each minterm BDD and replace
+ // its True result with the ID of the minterm, such that a character that would
+ // have returned True for that BDD now returns the minterm ID.
+ BDD charToTargetMintermId = solver.ReplaceTrue(minterms[i], i);
+
+ // Now union this BDD with the multi-terminal BDD we've built up thus far. Unioning
+ // is valid because every character belongs to exactly one minterm and thus will
+ // only map to an ID instead of False in exactly one of the input BDDs.
+ anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId);
+ }
+
+ // Now that we have our mapping that supports any input character, we want to optimize for
+ // ASCII inputs. Rather than forcing every input ASCII character to consult the BDD at match
+ // time, we precompute a lookup table, where each ASCII character can be used to index into the
+ // array to determine the ID for its corresponding minterm.
+ var ascii = new int[128];
+ for (int i = 0; i < ascii.Length; i++)
+ {
+ ascii[i] = anyCharacterToMintermId.Find(i);
+ }
+ _ascii = ascii;
+
+ // We can also further optimize the BDD in two ways:
+ // 1. We can now remove the ASCII characters from it, as we'll always consult the lookup table first
+ // for ASCII inputs and thus will never use the BDD for them. While optional (skipping this step will not
+ // affect correctness), removing the ASCII values from the BDD reduces the size of the multi-terminal BDD.
+ // 2. We can check if every character now maps to the same minterm ID (the same terminal in the
+ // multi-terminal BDD). This can be relatively common after (1) above is applied, as many
+ // patterns don't distinguish between any non-ASCII characters (e.g. "[0-9]*"). If every character
+ // in the BDD now maps to the same minterm, we can replace the BDD with a much simpler/faster/smaller one.
+ BDD nonAsciiBDD = solver.And(anyCharacterToMintermId, solver._nonAscii);
+ nonAsciiBDD = nonAsciiBDD.IsEssentiallyBoolean(out BDD? singleTerminalBDD) ? singleTerminalBDD : nonAsciiBDD;
+ _nonAscii = nonAsciiBDD;
+ }
+
+ /// <summary>Gets the ID of the minterm associated with the specified character.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public int GetMintermID(int c)
+ {
+ int[] ascii = _ascii;
+ return (uint)c < ascii.Length ? ascii[c] : _nonAscii.Find(c);
+ }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs
new file mode 100644
index 00000000000000..7b88e04f29c969
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs
@@ -0,0 +1,510 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Globalization;
+using System.Runtime.CompilerServices;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// Provides functionality to convert s to corresponding s.
+ internal sealed class RegexNodeToSymbolicConverter
+ {
+ internal readonly Unicode.UnicodeCategoryTheory _categorizer;
+ internal readonly SymbolicRegexBuilder _builder;
+ private readonly CultureInfo _culture;
+ private readonly Dictionary<(bool, string), BDD> _createConditionFromSet_Cache = new();
+
+ /// Constructs a regex to symbolic finite automata converter that uses the categorizer's solver and a new builder created over that solver.
+ public RegexNodeToSymbolicConverter(Unicode.UnicodeCategoryTheory categorizer, CultureInfo culture)
+ {
+ _categorizer = categorizer;
+ _culture = culture; // only its Name is passed on to the solver when character constraints are built
+ Solver = categorizer._solver;
+ _builder = new SymbolicRegexBuilder(Solver);
+ }
+
+ /// The character solver associated with the regex converter
+ public ICharAlgebra Solver { get; }
+
+ private BDD CreateConditionFromSet(bool ignoreCase, string set)
+ {
+ (bool ignoreCase, string set) key = (ignoreCase, set);
+ if (!_createConditionFromSet_Cache.TryGetValue(key, out BDD? result))
+ {
+ _createConditionFromSet_Cache[key] = result = Compute(ignoreCase, set);
+ }
+
+ return result;
+
+ BDD Compute(bool ignoreCase, string set)
+ {
+ // Char at position 0 is 1 iff the set is negated
+ bool negate = RegexCharClass.IsNegated(set);
+
+ // The set is divided into three pieces: ranges, conditions, subtraction
+
+ // Handle ranges:
+ // Following are conditions over characters in the set.
+ // These will become disjuncts of a single disjunction
+ // or conjuncts of a conjunction in case negate is true.
+ // Negation is pushed in when the conditions are created.
+ List conditions = new List();
+ foreach ((char first, char last) in ComputeRanges(set))
+ {
+ BDD cond = Solver.RangeConstraint(first, last, ignoreCase, _culture.Name);
+ conditions.Add(negate ? Solver.Not(cond) : cond);
+ }
+
+ // Handle categories:
+ int setLength = set[RegexCharClass.SetLengthIndex];
+ int catLength = set[RegexCharClass.CategoryLengthIndex];
+ int catStart = setLength + RegexCharClass.SetStartIndex;
+ int j = catStart;
+ while (j < catStart + catLength)
+ {
+ // Singleton categories are stored as unicode characters whose code is 1 + the
+ // unicode category code as a short. Thus -1 is applied to extract the actual
+ // code of the category. The category itself may be negated, e.g. \D instead of \d.
+ short catCode = (short)set[j++];
+ if (catCode != 0)
+ {
+ // Note that double negation cancels out the negation of the category.
+ BDD cond = MapCategoryCodeToCondition(Math.Abs(catCode) - 1);
+ conditions.Add(catCode < 0 ^ negate ? Solver.Not(cond) : cond);
+ }
+ else
+ {
+ // Special case for a whole group G of categories surrounded by 0's.
+ // Essentially 0 C1 C2 ... Cn 0 ==> G = (C1 | C2 | ... | Cn)
+ catCode = (short)set[j++];
+ if (catCode == 0)
+ {
+ continue; //empty set of categories
+ }
+
+ // Collect individual category codes into this set
+ var catCodes = new HashSet();
+
+ // If the first catCode is negated, the group as a whole is negated
+ bool negGroup = catCode < 0;
+
+ while (catCode != 0)
+ {
+ catCodes.Add(Math.Abs(catCode) - 1);
+ catCode = (short)set[j++];
+ }
+
+ // C1 | C2 | ... | Cn
+ BDD catCondDisj = MapCategoryCodeSetToCondition(catCodes);
+
+ BDD catGroupCond = negate ^ negGroup ? Solver.Not(catCondDisj) : catCondDisj;
+ conditions.Add(catGroupCond);
+ }
+ }
+
+ // Handle subtraction
+ BDD? subtractorCond = null;
+ if (set.Length > j)
+ {
+ // The set has a subtractor-set at the end.
+ // All characters in the subtractor-set are excluded from the set.
+ // Note that the subtractor sets may be nested, e.g. in r=[a-z-[b-g-[cd]]]
+ // the subtractor set [b-g-[cd]] has itself a subtractor set [cd].
+ // Thus r is the set of characters between a..z except b,e,f,g
+ subtractorCond = CreateConditionFromSet(ignoreCase, set.Substring(j));
+ }
+
+ // If there are no ranges and no groups then there are no conditions.
+ // This situation arises for the Singleline regex option and '.',
+ // and means that all characters are accepted.
+ BDD moveCond = conditions.Count == 0 ?
+ (negate ? Solver.False : Solver.True) :
+ (negate ? Solver.And(conditions) : Solver.Or(conditions));
+
+ // Subtlety of regex semantics:
+ // The subtractor is not within the scope of the negation (if there is a negation).
+ // Thus the negated subtractor is conjoined with moveCond after the negation has been
+ // performed above.
+ if (subtractorCond is not null)
+ {
+ moveCond = Solver.And(moveCond, Solver.Not(subtractorCond));
+ }
+
+ return moveCond;
+
+ static List<(char First, char Last)> ComputeRanges(string set) // decodes the range section of the set string into (first, last) pairs
+ {
+ int setLength = set[RegexCharClass.SetLengthIndex]; // number of chars making up the range section
+
+ var ranges = new List<(char, char)>(setLength);
+ int i = RegexCharClass.SetStartIndex;
+ int end = i + setLength;
+ while (i < end)
+ {
+ char first = set[i]; // first char of the range
+ i++;
+
+ char last = i < end ?
+ (char)(set[i] - 1) : // the range's last char is one below the stored value
+ RegexCharClass.LastChar; // no second value stored for this pair: the range runs to LastChar
+ i++;
+
+ ranges.Add((first, last));
+ }
+
+ return ranges;
+ }
+
+ BDD MapCategoryCodeSetToCondition(HashSet catCodes) // builds the disjunction of the given category codes; catCodes is mutated and must be non-empty
+ {
+ // TBD: perhaps other common cases should be specialized similarly.
+ // Check first whether all word character categories are covered, which is
+ // the most common case; if so, use the combined predicate \w rather than
+ // a disjunction of the component category predicates.
+ // The word character class \w covers categories 0,1,2,3,4,8,18.
+ BDD? catCond = null;
+ if (catCodes.Contains(0) && catCodes.Contains(1) && catCodes.Contains(2) && catCodes.Contains(3) &&
+ catCodes.Contains(4) && catCodes.Contains(8) && catCodes.Contains(18))
+ {
+ catCodes.Remove(0); // the word categories are removed so the loop below only adds the rest
+ catCodes.Remove(1);
+ catCodes.Remove(2);
+ catCodes.Remove(3);
+ catCodes.Remove(4);
+ catCodes.Remove(8);
+ catCodes.Remove(18);
+ catCond = _categorizer.WordLetterCondition;
+ }
+
+ foreach (int cat in catCodes)
+ {
+ BDD cond = MapCategoryCodeToCondition(cat);
+ catCond = catCond is null ? cond : Solver.Or(catCond, cond);
+ }
+
+ Debug.Assert(catCond is not null); // catCodes may legitimately be empty here once the word categories were removed, so check the result instead
+ return catCond;
+ }
+
+ BDD MapCategoryCodeToCondition(int code) =>
+ code switch
+ {
+ 99 => _categorizer.WhiteSpaceCondition, // whitespace has special code 99
+ < 0 or > 29 => throw new ArgumentOutOfRangeException(nameof(code), "Must be in the range 0..29 or equal to 99"), // TODO-NONBACKTRACKING: Remove message or put it into the .resx
+ _ => _categorizer.CategoryCondition(code)
+ };
+ }
+ }
+
+ public SymbolicRegexNode Convert(RegexNode node, bool topLevel)
+ {
+ // Guard against stack overflow due to deep recursion
+ if (!RuntimeHelpers.TryEnsureSufficientExecutionStack())
+ {
+ RegexNode localNode = node;
+ bool localTopLevel = topLevel;
+ return StackHelper.CallOnEmptyStack(() => Convert(localNode, localTopLevel));
+ }
+
+ switch (node.Type)
+ {
+ case RegexNode.Alternate:
+ {
+ var nested = new SymbolicRegexNode[node.ChildCount()];
+ for (int i = 0; i < nested.Length; i++)
+ {
+ nested[i] = Convert(node.Child(i), topLevel);
+ }
+ return _builder.MkOr(nested);
+ }
+
+ case RegexNode.Beginning:
+ return _builder._startAnchor;
+
+ case RegexNode.Bol:
+ EnsureNewlinePredicateInitialized();
+ return _builder._bolAnchor;
+
+ case RegexNode.Capture when node.N == -1:
+ return Convert(node.Child(0), topLevel); // treat as non-capturing group (...)
+
+ case RegexNode.Concatenate:
+ {
+ List nested = FlattenNestedConcatenations(node);
+ var converted = new SymbolicRegexNode[nested.Count];
+ for (int i = 0; i < converted.Length; i++)
+ {
+ converted[i] = Convert(nested[i], topLevel: false);
+ }
+ return _builder.MkConcat(converted, topLevel);
+ }
+
+ case RegexNode.Empty:
+ case RegexNode.UpdateBumpalong: // optional directive that behaves the same as Empty
+ return _builder._epsilon;
+
+ case RegexNode.End: // \z anchor
+ return _builder._endAnchor;
+
+ case RegexNode.EndZ: // \Z anchor
+ EnsureNewlinePredicateInitialized();
+ return _builder._endAnchorZ;
+
+ case RegexNode.Eol:
+ EnsureNewlinePredicateInitialized();
+ return _builder._eolAnchor;
+
+ case RegexNode.Loop:
+ return _builder.MkLoop(Convert(node.Child(0), topLevel: false), isLazy: false, node.M, node.N);
+
+ case RegexNode.Lazyloop:
+ return _builder.MkLoop(Convert(node.Child(0), topLevel: false), isLazy: true, node.M, node.N);
+
+ case RegexNode.Multi:
+ return ConvertMulti(node, topLevel);
+
+ case RegexNode.Notone:
+ return _builder.MkSingleton(Solver.Not(Solver.CharConstraint(node.Ch, (node.Options & RegexOptions.IgnoreCase) != 0, _culture.Name)));
+
+ case RegexNode.Notoneloop:
+ case RegexNode.Notonelazy:
+ return ConvertNotoneloop(node, node.Type == RegexNode.Notonelazy);
+
+ case RegexNode.One:
+ return _builder.MkSingleton(Solver.CharConstraint(node.Ch, (node.Options & RegexOptions.IgnoreCase) != 0, _culture.Name));
+
+ case RegexNode.Oneloop:
+ case RegexNode.Onelazy:
+ return ConvertOneloop(node, node.Type == RegexNode.Onelazy);
+
+ case RegexNode.Set:
+ return ConvertSet(node);
+
+ case RegexNode.Setloop:
+ case RegexNode.Setlazy:
+ return ConvertSetloop(node, node.Type == RegexNode.Setlazy);
+
+ // TBD: ECMA case intersect predicate with ascii range ?
+ case RegexNode.Boundary:
+ case RegexNode.ECMABoundary:
+ EnsureWordLetterPredicateInitialized();
+ return _builder._wbAnchor;
+
+ // TBD: ECMA case intersect predicate with ascii range ?
+ case RegexNode.NonBoundary:
+ case RegexNode.NonECMABoundary:
+ EnsureWordLetterPredicateInitialized();
+ return _builder._nwbAnchor;
+
+ case RegexNode.Nothing:
+ return _builder._nothing;
+
+#if DEBUG
+ case RegexNode.Testgroup:
+ // Try to extract the special case representing complement or intersection
+ if (IsComplementedNode(node))
+ {
+ return _builder.MkNot(Convert(node.Child(0), topLevel: false));
+ }
+
+ if (TryGetIntersection(node, out List? conjuncts))
+ {
+ var nested = new SymbolicRegexNode[conjuncts.Count];
+ for (int i = 0; i < nested.Length; i++)
+ {
+ nested[i] = Convert(conjuncts[i], topLevel: false);
+ }
+ return _builder.MkAnd(nested);
+ }
+
+ goto default;
+#endif
+
+ default:
+ throw new NotSupportedException(SR.Format(SR.NotSupported_NonBacktrackingConflictingExpression, node.Type switch
+ {
+ RegexNode.Capture => SR.ExpressionDescription_BalancingGroup,
+ RegexNode.Testgroup => SR.ExpressionDescription_IfThenElse,
+ RegexNode.Ref => SR.ExpressionDescription_Backreference,
+ RegexNode.Testref => SR.ExpressionDescription_Conditional,
+ RegexNode.Require => SR.ExpressionDescription_PositiveLookaround,
+ RegexNode.Prevent => SR.ExpressionDescription_NegativeLookaround,
+ RegexNode.Start => SR.ExpressionDescription_ContiguousMatches,
+ RegexNode.Atomic or
+ RegexNode.Setloopatomic or
+ RegexNode.Oneloopatomic or
+ RegexNode.Notoneloopatomic => SR.ExpressionDescription_AtomicSubexpressions,
+ _ => UnexpectedNodeType(node)
+ }));
+
+ static string UnexpectedNodeType(RegexNode node)
+ {
+ // The default should never arise, since other node types are either supported
+ // or have been removed (e.g. Group) from the final parse tree.
+ string description = $"Unexpected node type ({nameof(RegexNode)}:{node.Type})";
+ Debug.Fail(description);
+ return description;
+ }
+ }
+
+ void EnsureNewlinePredicateInitialized()
+ {
+ // Update the \n predicate in the builder if it has not been updated already
+ if (_builder._newLinePredicate.Equals(_builder._solver.False))
+ {
+ _builder._newLinePredicate = _builder._solver.CharConstraint('\n');
+ }
+ }
+
+ void EnsureWordLetterPredicateInitialized()
+ {
+ // Update the word letter predicate based on the Unicode definition of it if it was not updated already
+ if (_builder._wordLetterPredicateForAnchors.Equals(_builder._solver.False))
+ {
+ // Use the predicate including joiner and non joiner
+ _builder._wordLetterPredicateForAnchors = _categorizer.WordLetterConditionForAnchors;
+ }
+ }
+
+ List FlattenNestedConcatenations(RegexNode concat) // flattens nested Concatenate nodes and unwraps captures with N == -1 into one left-to-right list
+ {
+ var results = new List();
+
+ var todo = new Stack(); // explicit work stack, so nesting depth does not consume call stack
+ todo.Push(concat);
+
+ while (todo.TryPop(out RegexNode? node))
+ {
+ if (node.Type == RegexNode.Concatenate)
+ {
+ // Flatten nested concatenations; children are pushed last-to-first so that they pop first-to-last
+ for (int i = node.ChildCount() - 1; i >= 0; i--)
+ {
+ todo.Push(node.Child(i));
+ }
+ }
+ else if (node.Type == RegexNode.Capture)
+ {
+ if (node.N == -1)
+ {
+ // Unwrap nonbalancing capture groups: the child is processed in the capture's place
+ todo.Push(node.Child(0));
+ }
+ else
+ {
+ // Balancing groups are not supported
+ throw new NotSupportedException(SR.Format(SR.NotSupported_NonBacktrackingConflictingExpression, SR.ExpressionDescription_BalancingGroup));
+ }
+ }
+ else
+ {
+ results.Add(node); // any other node type is kept as an element of the flattened sequence
+ }
+ }
+
+ return results;
+ }
+
+ SymbolicRegexNode ConvertMulti(RegexNode node, bool topLevel)
+ {
+ Debug.Assert(node.Type == RegexNode.Multi);
+
+ string? sequence = node.Str;
+ Debug.Assert(sequence is not null);
+
+ bool ignoreCase = (node.Options & RegexOptions.IgnoreCase) != 0;
+
+ var conds = new BDD[sequence.Length];
+ for (int i = 0; i < conds.Length; i++)
+ {
+ conds[i] = Solver.CharConstraint(sequence[i], ignoreCase, _culture.Name);
+ }
+
+ return _builder.MkSequence(conds, topLevel);
+ }
+
+ SymbolicRegexNode ConvertOneloop(RegexNode node, bool isLazy)
+ {
+ Debug.Assert(node.Type is RegexNode.Oneloop or RegexNode.Onelazy);
+
+ bool ignoreCase = (node.Options & RegexOptions.IgnoreCase) != 0;
+ BDD cond = Solver.CharConstraint(node.Ch, ignoreCase, _culture.Name);
+
+ SymbolicRegexNode body = _builder.MkSingleton(cond);
+ SymbolicRegexNode loop = _builder.MkLoop(body, isLazy, node.M, node.N);
+ return loop;
+ }
+
+ SymbolicRegexNode ConvertNotoneloop(RegexNode node, bool isLazy)
+ {
+ Debug.Assert(node.Type is RegexNode.Notoneloop or RegexNode.Notonelazy);
+
+ bool ignoreCase = (node.Options & RegexOptions.IgnoreCase) != 0;
+ BDD cond = Solver.Not(Solver.CharConstraint(node.Ch, ignoreCase, _culture.Name));
+
+ SymbolicRegexNode body = _builder.MkSingleton(cond);
+ SymbolicRegexNode loop = _builder.MkLoop(body, isLazy, node.M, node.N);
+ return loop;
+ }
+
+ SymbolicRegexNode ConvertSet(RegexNode node)
+ {
+ Debug.Assert(node.Type == RegexNode.Set);
+
+ string? set = node.Str;
+ Debug.Assert(set is not null);
+
+ BDD moveCond = CreateConditionFromSet((node.Options & RegexOptions.IgnoreCase) != 0, set);
+
+ return _builder.MkSingleton(moveCond);
+ }
+
+ SymbolicRegexNode ConvertSetloop(RegexNode node, bool isLazy)
+ {
+ Debug.Assert(node.Type is RegexNode.Setloop or RegexNode.Setlazy);
+
+ string? set = node.Str;
+ Debug.Assert(set is not null);
+
+ BDD moveCond = CreateConditionFromSet((node.Options & RegexOptions.IgnoreCase) != 0, set);
+
+ SymbolicRegexNode body = _builder.MkSingleton(moveCond);
+ return _builder.MkLoop(body, isLazy, node.M, node.N);
+ }
+
+#if DEBUG
+ // TODO-NONBACKTRACKING: recognizing strictly only [] (RegexNode.Nothing), for example [0-[0]] would not be recognized
+ bool IsNothing(RegexNode node) => node.Type == RegexNode.Nothing || (node.Type == RegexNode.Set && ConvertSet(node).IsNothing);
+
+ bool IsDotStar(RegexNode node) => node.Type == RegexNode.Setloop && Convert(node, topLevel: false).IsAnyStar;
+
+ bool IsIntersect(RegexNode node) => node.Type == RegexNode.Testgroup && node.ChildCount() > 2 && IsNothing(node.Child(2));
+
+ bool TryGetIntersection(RegexNode node, [Diagnostics.CodeAnalysis.NotNullWhen(true)] out List? conjuncts)
+ {
+ if (!IsIntersect(node))
+ {
+ conjuncts = null;
+ return false;
+ }
+
+ conjuncts = new();
+ conjuncts.Add(node.Child(0));
+ node = node.Child(1);
+ while (IsIntersect(node))
+ {
+ conjuncts.Add(node.Child(0));
+ node = node.Child(1);
+ }
+
+ conjuncts.Add(node);
+ return true;
+ }
+
+ bool IsComplementedNode(RegexNode node) => node.ChildCount() > 2 && IsNothing(node.Child(1)) && IsDotStar(node.Child(2)); // same ChildCount guard as IsIntersect before Child(2) is read
+#endif
+ }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StackHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StackHelper.cs
new file mode 100644
index 00000000000000..254c0d5e28dfff
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StackHelper.cs
@@ -0,0 +1,31 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// Provides tools for avoiding stack overflows.
+ internal static class StackHelper
+ {
+ // Both overloads queue the supplied delegate to the thread pool, then block waiting for it to complete.
+ // They do so in a way that prevents task inlining (which would defeat the purpose) but that
+ // also plays nicely with the thread pool's sync-over-async aggressive thread injection policies.
+
+ /// Calls the provided function on the stack of a different thread pool thread and returns its result.
+ /// The return type of the function.
+ /// The function to invoke.
+ public static T CallOnEmptyStack(Func func) =>
+ Task.Run(func)
+ .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default)
+ .GetAwaiter().GetResult();
+
+ /// Calls the provided action on the stack of a different thread pool thread and blocks until it has completed.
+ /// The action to invoke.
+ public static void CallOnEmptyStack(Action action) =>
+ Task.Run(action)
+ .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default)
+ .GetAwaiter().GetResult();
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicMatch.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicMatch.cs
new file mode 100644
index 00000000000000..c6754b8577b0dc
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicMatch.cs
@@ -0,0 +1,37 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics.CodeAnalysis;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ internal readonly struct SymbolicMatch // immutable (Index, Length) pair describing the outcome of a match attempt
+ {
+ /// Indicates failure to find a match.
+ internal static SymbolicMatch NoMatch => new SymbolicMatch(-1, -1);
+
+ /// Indicates a match was found but without meaningful details about where. Note that this value compares equal to a zero-length match at index 0.
+ internal static SymbolicMatch QuickMatch => new SymbolicMatch(0, 0);
+
+ public SymbolicMatch(int index, int length)
+ {
+ Index = index;
+ Length = length;
+ }
+
+ public int Index { get; }
+ public int Length { get; }
+ public bool Success => Index >= 0; // false for NoMatch (Index -1), true for QuickMatch (Index 0)
+
+ public static bool operator ==(SymbolicMatch left, SymbolicMatch right) =>
+ left.Index == right.Index && left.Length == right.Length;
+
+ public static bool operator !=(SymbolicMatch left, SymbolicMatch right) =>
+ !(left == right);
+
+ public override bool Equals([NotNullWhen(true)] object? obj) =>
+ obj is SymbolicMatch other && this == other;
+
+ public override int GetHashCode() => HashCode.Combine(Index, Length); // combines the same two members that == compares
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicNFA.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicNFA.cs
new file mode 100644
index 00000000000000..aaf1d57a4eae06
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicNFA.cs
@@ -0,0 +1,391 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections;
+using System.Collections.Generic;
+using System.Diagnostics;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// Represents the exploration of a symbolic regex as a symbolic NFA
+ internal sealed class SymbolicNFA where S : notnull
+ {
+ private readonly IBooleanAlgebra _solver;
+ private readonly Transition[] _transitionFunction;
+ private readonly SymbolicRegexNode[] _finalCondition;
+ private readonly HashSet _unexplored;
+ private readonly SymbolicRegexNode[] _nodes;
+
+ private const int DeadendState = -1;
+ private const int UnexploredState = -2;
+
+ /// If true then some states have not been explored
+ public bool IsIncomplete => _unexplored.Count > 0;
+
+ private SymbolicNFA(IBooleanAlgebra solver, Transition[] transitionFunction, HashSet unexplored, SymbolicRegexNode[] nodes)
+ {
+ Debug.Assert(transitionFunction.Length > 0 && nodes.Length == transitionFunction.Length);
+ _solver = solver;
+ _transitionFunction = transitionFunction;
+ _finalCondition = new SymbolicRegexNode[nodes.Length];
+ for (int i = 0; i < nodes.Length; i++)
+ {
+ _finalCondition[i] = nodes[i].ExtractNullabilityTest();
+ }
+ _unexplored = unexplored;
+ _nodes = nodes;
+ }
+
+ /// Total number of states, 0 is the initial state, states are numbered from 0 to StateCount-1
+ public int StateCount => _transitionFunction.Length;
+
+ /// If true then the state has not been explored
+ public bool IsUnexplored(int state) => _transitionFunction[state]._leaf == UnexploredState;
+
+ /// If true then the state has no outgoing transitions
+ public bool IsDeadend(int state) => _transitionFunction[state]._leaf == DeadendState;
+
+ /// If true then the state involves lazy loops or has no loops
+ public bool IsLazy(int state) => _nodes[state].IsLazy;
+
+ /// Returns true if the state is nullable in the given context
+ public bool IsFinal(int state, uint context) => _finalCondition[state].IsNullableFor(context);
+
+ /// Returns true if the state is nullable for some context
+ public bool CanBeNullable(int state) => _finalCondition[state].CanBeNullable;
+
+ /// Returns true if the state is nullable for all contexts
+ public bool IsNullable(int state) => _finalCondition[state].IsNullable;
+
+ /// Gets the underlying node of the state
+ public SymbolicRegexNode GetNode(int state) => _nodes[state];
+
+ /// Enumerates all target states from the given source state; deadend and unexplored targets are omitted.
+ /// sourceState: must be an integer between 0 and StateCount-1
+ /// input: must be a value that acts as a minterm for the transitions emanating from the source state
+ /// context: reflects the immediate surrounding of the input and is used to determine nullability of anchors
+ public IEnumerable EnumerateTargetStates(int sourceState, S input, uint context)
+ {
+ Debug.Assert(sourceState >= 0 && sourceState < _transitionFunction.Length);
+
+ // First operate in a mode assuming no Union happens by finding the target leaf state if one exists
+ Transition transition = _transitionFunction[sourceState];
+ while (transition._kind != TransitionRegexKind.Union)
+ {
+ switch (transition._kind)
+ {
+ case TransitionRegexKind.Leaf:
+ // deadend and unexplored are negative
+ if (transition._leaf >= 0)
+ {
+ Debug.Assert(transition._leaf < _transitionFunction.Length);
+ yield return transition._leaf;
+ }
+ // The single target (or no target) state was found, so exit the whole enumeration
+ yield break;
+
+ case TransitionRegexKind.Conditional:
+ Debug.Assert(transition._test is not null && transition._first is not null && transition._second is not null);
+ // Branch according to the input condition in relation to the test condition
+ if (_solver.IsSatisfiable(_solver.And(input, transition._test)))
+ {
+ // in a conditional transition input must be exclusive
+ Debug.Assert(!_solver.IsSatisfiable(_solver.And(input, _solver.Not(transition._test))));
+ transition = transition._first;
+ }
+ else
+ {
+ transition = transition._second;
+ }
+ break;
+
+ default:
+ Debug.Assert(transition._kind == TransitionRegexKind.Lookaround && transition._look is not null && transition._first is not null && transition._second is not null);
+ // Branch according to nullability of the lookaround condition in the given context
+ transition = transition._look.IsNullableFor(context) ?
+ transition._first :
+ transition._second;
+ break;
+ }
+ }
+
+ // Continue operating in a mode where several target states can be yielded
+ Debug.Assert(transition._first is not null && transition._second is not null);
+ Stack todo = new();
+ todo.Push(transition._second);
+ todo.Push(transition._first);
+ while (todo.TryPop(out Transition? top))
+ {
+ switch (top._kind) // dispatch on the popped transition; using the enclosing Union here would re-push its branches forever
+ {
+ case TransitionRegexKind.Leaf:
+ // deadend and unexplored are negative, so only real target states are yielded
+ if (top._leaf >= 0)
+ {
+ Debug.Assert(top._leaf < _transitionFunction.Length);
+ yield return top._leaf;
+ }
+ break;
+
+ case TransitionRegexKind.Conditional:
+ Debug.Assert(top._test is not null && top._first is not null && top._second is not null);
+ // Branch according to the input condition in relation to the test condition
+ if (_solver.IsSatisfiable(_solver.And(input, top._test)))
+ {
+ // in a conditional transition input must be exclusive
+ Debug.Assert(!_solver.IsSatisfiable(_solver.And(input, _solver.Not(top._test))));
+ todo.Push(top._first);
+ }
+ else
+ {
+ todo.Push(top._second);
+ }
+ break;
+
+ case TransitionRegexKind.Lookaround:
+ Debug.Assert(top._look is not null && top._first is not null && top._second is not null);
+ // Branch according to nullability of the lookaround condition in the given context
+ todo.Push(top._look.IsNullableFor(context) ? top._first : top._second);
+ break;
+
+ // Union: both branches are explored, the first branch before the second
+ default:
+ Debug.Assert(top._kind == TransitionRegexKind.Union && top._first is not null && top._second is not null);
+ todo.Push(top._second);
+ todo.Push(top._first);
+ break;
+ }
+ }
+ }
+
+ public IEnumerable<(S, SymbolicRegexNode?, int)> EnumeratePaths(int sourceState) =>
+ _transitionFunction[sourceState].EnumeratePaths(_solver, _solver.True);
+
+ ///
+ /// TODO: Explore an unexplored state on transition further. Not implemented yet: always throws NotImplementedException.
+ ///
+ public void ExploreState(int state) => throw new NotImplementedException();
+
+ public static SymbolicNFA Explore(SymbolicRegexNode root, int bound)
+ {
+ (Dictionary, Transition> cache,
+ Dictionary, int> statemap,
+ List> nodes,
+ Stack front) workState = (new(), new(), new(), new());
+
+ workState.nodes.Add(root);
+ workState.statemap[root] = 0;
+ workState.front.Push(0);
+
+ Dictionary transitions = new();
+ Stack front = new();
+
+ while (workState.front.Count > 0)
+ {
+ Debug.Assert(front.Count == 0);
+
+ // Work Breadth-First in layers, swap front with workState.front
+ Stack tmp = front;
+ front = workState.front;
+ workState.front = tmp;
+
+ // Process all the states in front first
+ // Any new states detected in Convert are added to workState.front
+ while (front.Count > 0 && (bound <= 0 || workState.nodes.Count < bound))
+ {
+ int q = front.Pop();
+
+ // If q was on the front it must be associated with a node but not have a transition yet
+ Debug.Assert(q >= 0 && q < workState.nodes.Count && !transitions.ContainsKey(q));
+ transitions[q] = Convert(workState.nodes[q].MkDerivative(), workState);
+ }
+
+ if (front.Count > 0)
+ {
+ // The state bound was reached without completing the exploration so exit the loop
+ break;
+ }
+ }
+
+ SymbolicRegexNode[] nodes_array = workState.nodes.ToArray();
+
+ // All states are numbered from 0 to nodes.Count-1
+ Transition[] transition_array = new Transition[nodes_array.Length];
+ foreach (var entry in transitions)
+ {
+ transition_array[entry.Key] = entry.Value;
+ }
+
+ HashSet unexplored = new(front);
+ unexplored.UnionWith(workState.front);
+ foreach (int q in unexplored)
+ {
+ transition_array[q] = Transition.s_unexplored;
+ }
+
+ // At this point no entry can be null in the transition array
+ Debug.Assert(Array.TrueForAll(transition_array, tr => tr is not null));
+
+ var nfa = new SymbolicNFA(root._builder._solver, transition_array, unexplored, nodes_array);
+ return nfa;
+ }
+
+ private static Transition Convert(TransitionRegex tregex,
+ (Dictionary, Transition> cache,
+ Dictionary, int> statemap,
+ List> nodes,
+ Stack front) args)
+ {
+ Transition? transition;
+ if (args.cache.TryGetValue(tregex, out transition))
+ {
+ return transition;
+ }
+
+ Stack<(TransitionRegex, bool)> work = new();
+ work.Push((tregex, false));
+
+ while (work.TryPop(out (TransitionRegex, bool) top))
+ {
+ TransitionRegex tr = top.Item1;
+ bool wasPushedSecondTime = top.Item2;
+ if (wasPushedSecondTime)
+ {
+ Debug.Assert(tr._kind != TransitionRegexKind.Leaf && tr._first is not null && tr._second is not null);
+ transition = new Transition(kind: tr._kind,
+ test: tr._test,
+ look: tr._node,
+ first: args.cache[tr._first],
+ second: args.cache[tr._second]);
+ args.cache[tr] = transition;
+ }
+ else
+ {
+ switch (tr._kind)
+ {
+ case TransitionRegexKind.Leaf:
+ Debug.Assert(tr._node is not null);
+
+ if (tr._node.IsNothing)
+ {
+ args.cache[tr] = Transition.s_deadend;
+ }
+ else
+ {
+ int state;
+ if (!args.statemap.TryGetValue(tr._node, out state))
+ {
+ state = args.nodes.Count;
+ args.nodes.Add(tr._node);
+ args.statemap[tr._node] = state;
+ args.front.Push(state);
+ }
+ transition = new Transition(kind: TransitionRegexKind.Leaf, leaf: state);
+ args.cache[tr] = transition;
+ }
+ break;
+
+ default:
+ Debug.Assert(tr._first is not null && tr._second is not null);
+
+ // Push the tr for the second time
+ work.Push((tr, true));
+
+ // Push the branches also, unless they have been computed already
+ if (!args.cache.ContainsKey(tr._second))
+ {
+ work.Push((tr._second, false));
+ }
+
+ if (!args.cache.ContainsKey(tr._first))
+ {
+ work.Push((tr._first, false));
+ }
+
+ break;
+ }
+ }
+ }
+
+ return args.cache[tregex];
+ }
+
+ /// Representation of transitions inside the parent class
+ private class Transition
+ {
+ public readonly TransitionRegexKind _kind;
+ public readonly int _leaf;
+ public readonly S? _test;
+ public readonly SymbolicRegexNode? _look;
+ public readonly Transition? _first;
+ public readonly Transition? _second;
+
+ public static readonly Transition s_deadend = new Transition(TransitionRegexKind.Leaf, leaf: DeadendState);
+ public static readonly Transition s_unexplored = new Transition(TransitionRegexKind.Leaf, leaf: UnexploredState);
+
+ internal Transition(TransitionRegexKind kind, int leaf = 0, S? test = default(S), SymbolicRegexNode? look = null, Transition? first = null, Transition? second = null)
+ {
+ _kind = kind;
+ _leaf = leaf;
+ _test = test;
+ _look = look;
+ _first = first;
+ _second = second;
+ }
+
+ /// Enumerates all the paths in this transition excluding paths to dead-ends (and unexplored states if any)
+ internal IEnumerable<(S, SymbolicRegexNode?, int)> EnumeratePaths(IBooleanAlgebra solver, S pathCondition)
+ {
+ switch (_kind)
+ {
+ case TransitionRegexKind.Leaf:
+ // Omit any path that leads to a deadend or is unexplored
+ if (_leaf >= 0)
+ {
+ yield return (pathCondition, null, _leaf);
+ }
+ break;
+
+ case TransitionRegexKind.Union:
+ Debug.Assert(_first is not null && _second is not null);
+ foreach ((S, SymbolicRegexNode?, int) path in _first.EnumeratePaths(solver, pathCondition))
+ {
+ yield return path;
+ }
+ foreach ((S, SymbolicRegexNode?, int) path in _second.EnumeratePaths(solver, pathCondition))
+ {
+ yield return path;
+ }
+ break;
+
+ case TransitionRegexKind.Conditional:
+ Debug.Assert(_test is not null && _first is not null && _second is not null);
+ foreach ((S, SymbolicRegexNode?, int) path in _first.EnumeratePaths(solver, solver.And(pathCondition, _test)))
+ {
+ yield return path;
+ }
+ foreach ((S, SymbolicRegexNode?, int) path in _second.EnumeratePaths(solver, solver.And(pathCondition, solver.Not(_test))))
+ {
+ yield return path;
+ }
+ break;
+
+ default:
+ Debug.Assert(_kind is TransitionRegexKind.Lookaround && _look is not null && _first is not null && _second is not null);
+ foreach ((S, SymbolicRegexNode?, int) path in _first.EnumeratePaths(solver, pathCondition))
+ {
+ SymbolicRegexNode nullabilityTest = path.Item2 is null ? _look : _look._builder.MkAnd(path.Item2, _look);
+ yield return (path.Item1, nullabilityTest, path.Item3);
+ }
+ foreach ((S, SymbolicRegexNode?, int) path in _second.EnumeratePaths(solver, pathCondition))
+ {
+ // Complement the nullability test
+ SymbolicRegexNode nullabilityTest = path.Item2 is null ? _look._builder.MkNot(_look) : _look._builder.MkAnd(path.Item2, _look._builder.MkNot(_look));
+ yield return (path.Item1, nullabilityTest, path.Item3);
+ }
+ break;
+ }
+ }
+ }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs
new file mode 100644
index 00000000000000..2e01a330bb84e7
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs
@@ -0,0 +1,418 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Diagnostics;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ ///
+ /// Builder of symbolic regexes over TElement.
+ /// TElement is the type of elements of an effective Boolean algebra.
+ /// Used to convert .NET regexes to symbolic regexes.
+ ///
+ internal sealed class SymbolicRegexBuilder where TElement : notnull
+ {
+ internal readonly ICharAlgebra _solver;
+
+ internal readonly SymbolicRegexNode _epsilon;
+ internal readonly SymbolicRegexNode _nothing;
+ internal readonly SymbolicRegexNode _startAnchor;
+ internal readonly SymbolicRegexNode _endAnchor;
+ internal readonly SymbolicRegexNode _endAnchorZ;
+ internal readonly SymbolicRegexNode _endAnchorZRev;
+ internal readonly SymbolicRegexNode _bolAnchor;
+ internal readonly SymbolicRegexNode _eolAnchor;
+ internal readonly SymbolicRegexNode _anyChar;
+ internal readonly SymbolicRegexNode _anyStar;
+ internal readonly SymbolicRegexNode _wbAnchor;
+ internal readonly SymbolicRegexNode _nwbAnchor;
+ internal readonly SymbolicRegexSet _fullSet;
+ internal readonly SymbolicRegexSet _emptySet;
+ internal readonly SymbolicRegexNode _eagerEmptyLoop;
+
+ internal TElement _wordLetterPredicateForAnchors;
+ internal TElement _newLinePredicate;
+
+ /// Partition of the input space of predicates.
+ internal TElement[]? _minterms;
+
+ private readonly Dictionary> _singletonCache = new();
+
+ // states that have been created
+ internal HashSet> _stateCache = new();
+
+ internal readonly Dictionary<(SymbolicRegexKind,
+ SymbolicRegexNode?, // _left
+ SymbolicRegexNode?, // _right
+ int, int, TElement?, // _lower, _upper, _set
+ SymbolicRegexSet?,
+ SymbolicRegexInfo), SymbolicRegexNode> _nodeCache = new();
+
+ internal readonly Dictionary<(TransitionRegexKind, // _kind
+ TElement?, // _test
+ TransitionRegex?, // _first
+ TransitionRegex?, // _second
+ SymbolicRegexNode?), // _leaf
+ TransitionRegex> _trCache = new();
+
+ ///
+ /// Maps state ids to states. The initial capacity is 1024 states;
+ /// each time more states are needed, the length is increased by 1024.
+ ///
+ internal DfaMatchingState[]? _statearray;
+ internal DfaMatchingState[]? _delta;
+ private const int InitialStateLimit = 1024;
+
+ ///
+ /// The smallest k such that 2^k >= minterms.Length + 1.
+ ///
+ internal int _mintermsCount;
+
+ ///
+ /// If true then delta is used in a mode where
+ /// each target state represents a set of states.
+ ///
+ internal bool _antimirov;
+
+ /// Create a new symbolic regex builder.
+ internal SymbolicRegexBuilder(ICharAlgebra solver)
+ {
+ // The solver must be set first; otherwise the initializations that follow would cause a null reference exception.
+ _solver = solver;
+ _epsilon = SymbolicRegexNode.MkEpsilon(this);
+ _startAnchor = SymbolicRegexNode.MkStartAnchor(this);
+ _endAnchor = SymbolicRegexNode.MkEndAnchor(this);
+ _endAnchorZ = SymbolicRegexNode.MkEndAnchorZ(this);
+ _endAnchorZRev = SymbolicRegexNode