diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 63871f58d0ff3b..5f49841104b5bd 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -17,7 +17,6 @@ - @@ -64,6 +63,7 @@ + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Reference.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Reference.cs deleted file mode 100644 index d5032e968dc0f5..00000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Reference.cs +++ /dev/null @@ -1,88 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Threading; - -namespace System.Text.RegularExpressions -{ - /// - /// Used to cache one exclusive runner reference - /// - internal sealed class ExclusiveReference - { - private RegexRunner? _ref; - private RegexRunner? _obj; - private volatile int _locked; - - /// - /// Return an object and grab an exclusive lock. - /// - /// If the exclusive lock can't be obtained, null is returned; - /// if the object can't be returned, the lock is released. - /// - public RegexRunner? Get() - { - // try to obtain the lock - - if (0 == Interlocked.Exchange(ref _locked, 1)) - { - // grab reference - RegexRunner? obj = _ref; - - // release the lock and return null if no reference - if (obj == null) - { - _locked = 0; - - return null; - } - - // remember the reference and keep the lock - _obj = obj; - - return obj; - } - - return null; - } - - /// - /// Release an object back to the cache. - /// - /// If the object is the one that's under lock, the lock is released. - /// If there is no cached object, then the lock is obtained and the object is placed in the cache. - /// - public void Release(RegexRunner obj) - { - if (obj == null) - throw new ArgumentNullException(nameof(obj)); - - // if this reference owns the lock, release it - if (_obj == obj) - { - _obj = null; - _locked = 0; - - return; - } - - // if no reference owns the lock, try to cache this reference - if (_obj == null) - { - // try to obtain the lock - if (0 == Interlocked.Exchange(ref _locked, 1)) - { - // if there's really no reference, cache this reference - if (_ref == null) - _ref = obj; - - // release the lock - _locked = 0; - - return; - } - } - } - } -} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs index 9469f3c27fd7c9..0049aff187c2fb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs @@ -2,11 +2,11 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Globalization; -using System.Runtime.CompilerServices; using System.Threading; namespace System.Text.RegularExpressions @@ -15,59 +15,88 @@ public partial class Regex { public static int CacheSize { - get => RegexCache.CacheSize; - set => RegexCache.CacheSize = value; + get => RegexCache.MaxCacheSize; + set + { + if (value < 0) + { + throw new ArgumentOutOfRangeException(nameof(value)); + } + + RegexCache.MaxCacheSize = value; + } } } + /// Cache used to store Regex instances used by the static methods on Regex. internal sealed class RegexCache { + // The implementation is optimized to make cache hits fast and lock-free, only taking a global lock + // when adding a new Regex to the cache. Previous implementations of the cache took a global lock + // on all accesses, negatively impacting scalability, in order to minimize costs when the cache + // limit was hit and items needed to be dropped. In such situations, however, we're having to + // pay the relatively hefty cost of creating a new Regex, anyway, and if the consuming app cares + // about such costs, it should either increase Regex.CacheSize or do its own Regex instance caching. + + /// The default maximum number of items to store in the cache. private const int DefaultMaxCacheSize = 15; - private const int CacheDictionarySwitchLimit = 10; - - private static readonly Dictionary s_cache = new Dictionary(DefaultMaxCacheSize); - private static Node? s_cacheFirst, s_cacheLast; // linked list for LRU and for small cache + /// The maximum number of cached items to examine when we need to replace an existing one in the cache with a new one. + /// This is a somewhat arbitrary value, chosen to be small but at least as large as DefaultMaxCacheSize. + private const int MaxExamineOnDrop = 30; + + /// A read-through cache of one element, representing the most recently used regular expression. + private static volatile Node? s_lastAccessed; + /// The thread-safe dictionary storing all the items in the cache. + /// + /// The concurrency level is initialized to 1 as we're using our own global lock for all mutations, so we don't need ConcurrentDictionary's + /// striped locking. Capacity is initialized to 31, which is the same as (the private) ConcurrentDictionary.DefaultCapacity. + /// + private static readonly ConcurrentDictionary s_cacheDictionary = new ConcurrentDictionary(concurrencyLevel: 1, capacity: 31); + /// A list of all the items in the cache. Protected by . + private static readonly List s_cacheList = new List(DefaultMaxCacheSize); + /// Random number generator used to examine a subset of items when we need to drop one from a large list. Protected by . + private static readonly Random s_random = new Random(); + /// The current maximum number of items allowed in the cache. This rarely changes. Mostly protected by . private static int s_maxCacheSize = DefaultMaxCacheSize; - private static int s_cacheCount = 0; - public static int CacheSize + /// Lock used to protect shared state on mutations. + private static object SyncObj => s_cacheDictionary; + + /// Gets or sets the maximum size of the cache. + public static int MaxCacheSize { get => s_maxCacheSize; set { - if (value < 0) - { - throw new ArgumentOutOfRangeException(nameof(value)); - } + Debug.Assert(value >= 0); - lock (s_cache) + lock (SyncObj) { - s_maxCacheSize = value; // not to allow other thread to change it while we use cache - while (s_cacheCount > s_maxCacheSize) - { - Node last = s_cacheLast!; - if (s_cacheCount >= CacheDictionarySwitchLimit) - { - Debug.Assert(s_cache.ContainsKey(last.Key)); - s_cache.Remove(last.Key); - } + // Store the new max cache size + s_maxCacheSize = value; - // update linked list: - s_cacheLast = last.Next; - if (last.Next != null) - { - Debug.Assert(s_cacheFirst != null); - Debug.Assert(s_cacheFirst != last); - Debug.Assert(last.Next.Previous == last); - last.Next.Previous = null; - } - else // last one removed + if (value == 0) + { + // If the value is being changed to zero, just clear out the cache. + s_cacheDictionary.Clear(); + s_cacheList.Clear(); + s_lastAccessed = null; + } + else if (value < s_cacheList.Count) + { + // If the value is being changed to less than the number of items we're currently storing, + // sort the entries descending by last access stamp, and remove the excess. This is expensive, but + // this should be exceedingly rare, as CacheSize is generally set once (if at all) and then left unchanged. + s_cacheList.Sort((n1, n2) => Volatile.Read(ref n2.LastAccessStamp).CompareTo(Volatile.Read(ref n1.LastAccessStamp))); + s_lastAccessed = s_cacheList[0]; + for (int i = value; i < s_cacheList.Count; i++) { - Debug.Assert(s_cacheFirst == last); - s_cacheFirst = null; + s_cacheDictionary.TryRemove(s_cacheList[i].Key, out _); } + s_cacheList.RemoveRange(value, s_cacheList.Count - value); - s_cacheCount--; + Debug.Assert(s_cacheList.Count == value); + Debug.Assert(s_cacheDictionary.Count == value); } } } @@ -75,6 +104,10 @@ public static int CacheSize public static Regex GetOrAdd(string pattern) { + // Does not delegate to GetOrAdd(..., RegexOptions, ...) in order to avoid having + // a statically-reachable path to the 'new Regex(..., RegexOptions, ...)', which + // will force the Regex compiler to be reachable and thus rooted for the linker. + Regex.ValidatePattern(pattern); CultureInfo culture = CultureInfo.CurrentCulture; @@ -109,170 +142,122 @@ public static Regex GetOrAdd(string pattern, RegexOptions options, TimeSpan matc private static bool TryGet(Key key, [NotNullWhen(true)] out Regex? regex) { - Node? cachedRegex = s_cacheFirst; - if (cachedRegex != null) + long lastAccessedStamp = 0; + + // We optimize for repeated usage of the same regular expression over and over, + // by having a fast-path that stores the most recently used instance. Check + // to see if that instance is the one we want; if it is, we're done. + Node? lastAccessed = s_lastAccessed; + if (lastAccessed != null) { - if (cachedRegex.Key.Equals(key)) + if (lastAccessed.Key.Equals(key)) { - regex = cachedRegex.Regex; + regex = lastAccessed.Regex; return true; } - if (s_maxCacheSize != 0) - { - lock (s_cache) - { - cachedRegex = LookupCachedAndPromote(key); - if (cachedRegex != null) - { - regex = cachedRegex.Regex; - return true; - } - } - } + // We had a last accessed item, but it didn't match the one being requested. + // In case we need to replace the last accessed node, remember this one's stamp; + // we'll use it to compute the new access value for the new node replacing it. + lastAccessedStamp = Volatile.Read(ref lastAccessed.LastAccessStamp); } + // Now consult the full cache. + if (s_maxCacheSize != 0 && // hot-read of s_maxCacheSize to try to avoid the cost of the dictionary lookup if the cache is disabled + s_cacheDictionary.TryGetValue(key, out Node? node)) + { + // We found our item in the cache. Make this node's last access stamp one higher than + // the previous one. It's ok if multiple threads racing to update the last access cause + // multiple nodes to have the same value; it's an approximate value meant only to help + // remove the least valuable items when an item needs to be dropped from the cache. We + // do, however, need to read the old value and write the new value using Volatile.Read/Write, + // in order to prevent tearing of the 64-bit value on 32-bit platforms, and to help ensure + // that another thread subsequently sees this updated value. + Volatile.Write(ref node.LastAccessStamp, lastAccessedStamp + 1); + + // Update our fast-path single-field cache. + s_lastAccessed = node; + + // Return the cached regex. + regex = node.Regex; + return true; + } + + // Not in the cache. regex = null; return false; } private static void Add(Key key, Regex regex) { - lock (s_cache) + lock (SyncObj) { - // If we're not supposed to cache, or if the entry is already cached, we're done. - if (s_maxCacheSize == 0 || LookupCachedAndPromote(key) != null) + Debug.Assert(s_cacheList.Count == s_cacheDictionary.Count); + + // If the cache has been disabled, there's nothing to add. And if between just checking + // the cache in the caller and taking the lock, another thread could have added the regex. + // If that occurred, there's also nothing to add, and we don't bother to update any of the + // time stamp / fast-path field information, because hitting this race condition means it + // was just updated, and we gain little by updating it again. + if (s_maxCacheSize == 0 || s_cacheDictionary.TryGetValue(key, out _)) { return; } - // Create the entry for caching. - var entry = new Node(key, regex); - - // Put it at the beginning of the linked list, as it is the most-recently used. - if (s_cacheFirst != null) + // If the cache is full, remove an item to make room for the new one. + if (s_cacheList.Count == s_maxCacheSize) { - Debug.Assert(s_cacheFirst.Next == null); - s_cacheFirst.Next = entry; - entry.Previous = s_cacheFirst; - } - s_cacheFirst = entry; - s_cacheCount++; + int itemsToExamine; + bool useRandom; - // If we've graduated to using the dictionary for lookups, add it to the dictionary. - if (s_cacheCount >= CacheDictionarySwitchLimit) - { - if (s_cacheCount == CacheDictionarySwitchLimit) + if (s_maxCacheSize <= MaxExamineOnDrop) { - // If we just hit the threshold, we need to populate the dictionary from the list. - s_cache.Clear(); - for (Node? next = s_cacheFirst; next != null; next = next.Previous) - { - s_cache.Add(next.Key, next); - } + // Our maximum cache size is <= the number of items we're willing to examine (which is kept small simply + // to avoid spending a lot of time). As such, we can just examine the whole list. + itemsToExamine = s_cacheList.Count; + useRandom = false; } else { - // If we've already populated the dictionary, just add this one entry. - s_cache.Add(key, entry); + // Our maximum cache size is > the number of items we're willing to examine, so we'll instead + // examine a random subset. This isn't perfect: if the size of the list is only a tiny bit + // larger than the max we're willing to examine, there's a good chance we'll look at some of + // the same items twice. That's fine; this doesn't need to be perfect. We do not need a perfect LRU + // cache, just one that generally gets rid of older things when new things come in. + itemsToExamine = MaxExamineOnDrop; + useRandom = true; } - Debug.Assert(s_cacheCount == s_cache.Count); - } + // Pick the first item to use as the min. + int minListIndex = useRandom ? s_random.Next(s_cacheList.Count) : 0; + long min = Volatile.Read(ref s_cacheList[minListIndex].LastAccessStamp); - // Update the tail of the linked list. If nothing was cached, just set the tail. - // If we're over our cache limit, remove the tail. - if (s_cacheLast == null) - { - s_cacheLast = entry; - } - else if (s_cacheCount > s_maxCacheSize) - { - Node last = s_cacheLast; - if (s_cacheCount >= CacheDictionarySwitchLimit) + // Now examine the rest, keeping track of the smallest access stamp we find. + for (int i = 1; i < itemsToExamine; i++) { - Debug.Assert(s_cache[last.Key] == s_cacheLast); - s_cache.Remove(last.Key); + int nextIndex = useRandom ? s_random.Next(s_cacheList.Count) : i; + long next = Volatile.Read(ref s_cacheList[nextIndex].LastAccessStamp); + if (next < min) + { + minListIndex = nextIndex; + min = next; + } } - Debug.Assert(last.Previous == null); - Debug.Assert(last.Next != null); - Debug.Assert(last.Next.Previous == last); - - last.Next.Previous = null; - s_cacheLast = last.Next; - - s_cacheCount--; - } - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] // single call site, separated out for convenience - private static bool TryGetCacheValueAfterFirst(Key key, [NotNullWhen(true)] out Node? entry) - { - Debug.Assert(Monitor.IsEntered(s_cache)); - Debug.Assert(s_cacheFirst != null); - Debug.Assert(s_cacheLast != null); - - if (s_cacheCount >= CacheDictionarySwitchLimit) - { - Debug.Assert(s_cache.Count == s_cacheCount); - return s_cache.TryGetValue(key, out entry); - } - - for (Node? current = s_cacheFirst.Previous; // s_cacheFirst already checked by caller, so skip it here - current != null; - current = current.Previous) - { - if (current.Key.Equals(key)) - { - entry = current; - return true; + // Remove the key found to have the smallest access stamp. + s_cacheDictionary.TryRemove(s_cacheList[minListIndex].Key, out _); + s_cacheList.RemoveAt(minListIndex); } - } - - entry = null; - return false; - } - - private static Node? LookupCachedAndPromote(Key key) - { - Debug.Assert(Monitor.IsEntered(s_cache)); - - Node? entry = s_cacheFirst; - if (entry != null && - !entry.Key.Equals(key) && // again check this as could have been promoted by other thread - TryGetCacheValueAfterFirst(key, out entry)) - { - // We found the item and it wasn't the first; it needs to be promoted. - Debug.Assert(s_cacheFirst != entry, "key should not get s_livecode_first"); - Debug.Assert(s_cacheFirst != null, "as Dict has at least one"); - Debug.Assert(s_cacheFirst.Next == null); - Debug.Assert(s_cacheFirst.Previous != null); - Debug.Assert(entry.Next != null, "not first so Next should exist"); - Debug.Assert(entry.Next.Previous == entry); + // Finally add the regex. + var node = new Node(key, regex); + s_lastAccessed = node; + s_cacheList.Add(node); + s_cacheDictionary.TryAdd(key, node); - if (s_cacheLast == entry) - { - Debug.Assert(entry.Previous == null, "last"); - s_cacheLast = entry.Next; - } - else - { - Debug.Assert(entry.Previous != null, "in middle"); - Debug.Assert(entry.Previous.Next == entry); - entry.Previous.Next = entry.Next; - } - entry.Next.Previous = entry.Previous; - - s_cacheFirst.Next = entry; - entry.Previous = s_cacheFirst; - entry.Next = null; - s_cacheFirst = entry; + Debug.Assert(s_cacheList.Count <= s_maxCacheSize); + Debug.Assert(s_cacheList.Count == s_cacheDictionary.Count); } - - return entry; } /// Used as a key for . @@ -316,13 +301,15 @@ public override int GetHashCode() => // no need to include timeout in the hashcode; it'll almost always be the same } - /// Used to cache Regex instances. + /// Node for a cached Regex instance. private sealed class Node { + /// The key associated with this cached instance. public readonly Key Key; + /// The cached Regex instance. public readonly Regex Regex; - public Node? Next; - public Node? Previous; + /// A "time" stamp representing the approximate last access time for this Regex. + public long LastAccessStamp; public Node(Key key, Regex regex) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index f828a377177672..13a741dfddbd16 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -11,6 +11,7 @@ using System.Runtime.CompilerServices; #endif using System.Runtime.Serialization; +using System.Threading; namespace System.Text.RegularExpressions { @@ -31,7 +32,7 @@ public partial class Regex : ISerializable protected internal int capsize; // the size of the capture array internal WeakReference? _replref; // cached parsed replacement pattern - private ExclusiveReference? _runnerref; // cached runner + private volatile RegexRunner? _runner; // cached runner private RegexCode? _code; // if interpreted, this is the code for RegexInterpreter private bool _refsInitialized = false; @@ -64,7 +65,8 @@ public Regex(string pattern, RegexOptions options, TimeSpan matchTimeout) : internal Regex(string pattern, CultureInfo? culture) { // Call Init directly rather than delegating to a Regex ctor that takes - // options to avoid rooting the Regex compiler unless necessary. + // options to enable linking / tree shaking to remove the Regex compiler + // if it may not be used. Init(pattern, RegexOptions.None, s_defaultMatchTimeout, culture); } @@ -85,8 +87,9 @@ internal Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, Cult /// Initializes the instance. /// - /// This is separated out of the constructor to allow the Regex ctor that doesn't - /// take a RegexOptions to avoid rooting the regex compiler, such that it can be trimmed away. + /// This is separated out of the constructor so that an app only using 'new Regex(pattern)' + /// rather than 'new Regex(pattern, options)' can avoid statically referencing the Regex + /// compiler, such that a tree shaker / linker can trim it away if it's not otherwise used. /// private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo? culture) { @@ -412,7 +415,6 @@ protected void InitializeReferences() throw new NotSupportedException(SR.OnlyAllowedOnce); _refsInitialized = true; - _runnerref = new ExclusiveReference(); _replref = new WeakReference(null); } @@ -428,35 +430,24 @@ protected void InitializeReferences() if (length < 0 || length > input.Length) throw new ArgumentOutOfRangeException(nameof(length), SR.LengthNotNegative); - // There may be a cached runner; grab ownership of it if we can. - RegexRunner? runner = _runnerref!.Get(); - - // Create a RegexRunner instance if we need to - if (runner == null) - { - // Use the compiled RegexRunner factory if the code was compiled to MSIL - runner = factory != null ? - factory.CreateInstance() : - new RegexInterpreter(_code!, UseOptionInvariant() ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture); - } - - Match? match; + RegexRunner runner = + Interlocked.Exchange(ref _runner, null) ?? // use a cached runner if there is one + (factory != null ? factory.CreateInstance() : // use the compiled RegexRunner factory if there is one + new RegexInterpreter(_code!, UseOptionInvariant() ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture)); try { // Do the scan starting at the requested position - match = runner.Scan(this, input, beginning, beginning + length, startat, prevlen, quick, internalMatchTimeout); + Match? match = runner.Scan(this, input, beginning, beginning + length, startat, prevlen, quick, internalMatchTimeout); +#if DEBUG + if (Debug) match?.Dump(); +#endif + return match; } finally { - // Release or fill the cache slot - _runnerref.Release(runner); + // Release the runner back to the cache + _runner = runner; } - -#if DEBUG - if (Debug && match != null) - match.Dump(); -#endif - return match; } protected bool UseOptionC() => (roptions & RegexOptions.Compiled) != 0; diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Cache.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Cache.Tests.cs index 03de6ce1b0edc2..db10fe8f512fca 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Cache.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Cache.Tests.cs @@ -2,8 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System.Collections.Generic; -using System.Diagnostics; +using System.Collections; using System.Globalization; using System.Reflection; using Microsoft.DotNet.RemoteExecutor; @@ -137,10 +136,10 @@ void Remove(int n) private int GetCachedItemsNum() { - return (int)typeof(Regex).Assembly + return ((ICollection)typeof(Regex).Assembly .GetType("System.Text.RegularExpressions.RegexCache") - .GetField("s_cacheCount", BindingFlags.NonPublic | BindingFlags.Static) - .GetValue(null); + .GetField("s_cacheList", BindingFlags.NonPublic | BindingFlags.Static) + .GetValue(null)).Count; } } }