From 439bf50829f20b116adbae5fc4c5a42128153333 Mon Sep 17 00:00:00 2001 From: "Scott G. Miller" Date: Fri, 30 Aug 2024 14:37:39 -0500 Subject: [PATCH 01/20] Drop in replacement for the Go regexp package that does temporal caching of pattern compilation --- regexp/go.mod | 10 +++++ regexp/go.sum | 14 +++++++ regexp/regexp.go | 103 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 regexp/go.mod create mode 100644 regexp/go.sum create mode 100644 regexp/regexp.go diff --git a/regexp/go.mod b/regexp/go.mod new file mode 100644 index 0000000..7e3bbc1 --- /dev/null +++ b/regexp/go.mod @@ -0,0 +1,10 @@ +module github.com/hashicorp/go-secure-stdlib/regexp + +go 1.23 + +require ( + github.com/ReneKroon/ttlcache v1.7.0 // indirect + github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect + github.com/jellydator/ttlcache/v3 v3.3.0 // indirect + golang.org/x/sync v0.8.0 // indirect +) diff --git a/regexp/go.sum b/regexp/go.sum new file mode 100644 index 0000000..6e709b5 --- /dev/null +++ b/regexp/go.sum @@ -0,0 +1,14 @@ +github.com/ReneKroon/ttlcache v1.7.0 h1:8BkjFfrzVFXyrqnMtezAaJ6AHPSsVV10m6w28N/Fgkk= +github.com/ReneKroon/ttlcache v1.7.0/go.mod h1:8BGGzdumrIjWxdRx8zpK6L3oGMWvIXdvB2GD1cfvd+I= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/jellydator/ttlcache/v3 v3.3.0 h1:BdoC9cE81qXfrxeb9eoJi9dWrdhSuwXMAnHTbnBm4Wc= +github.com/jellydator/ttlcache/v3 v3.3.0/go.mod h1:bj2/e0l4jRnQdrnSTaGTsh4GSXvMjQcy41i7th0GVGw= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +go.uber.org/goleak v0.10.0/go.mod h1:VCZuO8V8mFPlL0F5J5GK1rtHV3DrFcQ1R8ryq7FK0aI= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= diff --git a/regexp/regexp.go b/regexp/regexp.go new file mode 100644 index 0000000..cffbb5b --- /dev/null +++ b/regexp/regexp.go @@ -0,0 +1,103 @@ +package regexp + +import ( + "github.com/jellydator/ttlcache/v3" + "regexp" + "sync" + "time" +) + +// Caches regexp compilation to avoid CPU and RAM usage for many duplicate regexps + +const defaultTTL = 2 * time.Minute + +var ( + regexpCache *ttlcache.Cache[string, *regexp.Regexp] + posixRegexpCache *ttlcache.Cache[string, *regexp.Regexp] + ticker *time.Ticker + stopchan chan struct{} + setupLock sync.Mutex +) + +func init() { + SetCompileCacheTTL(defaultTTL) +} + +func SetCompileCacheTTL(ttl time.Duration) { + Stop() + setupLock.Lock() + defer setupLock.Unlock() + + regexpCache = ttlcache.New[string, *regexp.Regexp](ttlcache.WithTTL[string, *regexp.Regexp](ttl)) + posixRegexpCache = ttlcache.New[string, *regexp.Regexp](ttlcache.WithTTL[string, *regexp.Regexp](ttl)) + ticker = time.NewTicker(ttl) + stopchan = make(chan struct{}) + + go expire() +} + +func expire() { + for { + select { + case <-ticker.C: + regexpCache.DeleteExpired() + posixRegexpCache.DeleteExpired() + case <-stopchan: + ticker.Stop() + break + } + } +} +func MustCompile(pattern string) *regexp.Regexp { + item := regexpCache.Get(pattern) + if item != nil { + return item.Value() + } + regex := regexp.MustCompile(pattern) + regexpCache.Set(pattern, regex, ttlcache.DefaultTTL) + return regex +} + +func MustCompilePOSIX(pattern string) *regexp.Regexp { + item := posixRegexpCache.Get(pattern) + if item != nil { + return item.Value() + } + regex := regexp.MustCompilePOSIX(pattern) + posixRegexpCache.Set(pattern, regex, ttlcache.DefaultTTL) + return regex +} + +func Compile(pattern string) (*regexp.Regexp, error) { + item := regexpCache.Get(pattern) + if item != nil { + return item.Value(), nil + } + regex, err := regexp.Compile(pattern) + if err != nil { + return nil, err + } + regexpCache.Set(pattern, regex, ttlcache.DefaultTTL) + return regex, nil +} + +func CompilePOSIX(pattern string) (*regexp.Regexp, error) { + item := posixRegexpCache.Get(pattern) + if item != nil { + return item.Value(), nil + } + regex, err := regexp.CompilePOSIX(pattern) + if err != nil { + return nil, err + } + posixRegexpCache.Set(pattern, regex, ttlcache.DefaultTTL) + return regex, nil +} + +func Stop() { + setupLock.Lock() + defer setupLock.Unlock() + if ticker != nil { + stopchan <- struct{}{} + } +} From c35e29d54cf682130c2312cf1576c2bc16d94ae1 Mon Sep 17 00:00:00 2001 From: "Scott G. Miller" Date: Tue, 3 Sep 2024 11:38:05 -0500 Subject: [PATCH 02/20] Weak value version: Implements a poor man's weak valued map, which captures finalization of the Regexps to clear the maps --- regexp/go.mod | 4 ++ regexp/go.sum | 7 +++ regexp/regexp.go | 110 +++++++++++++++++------------------------- regexp/regexp_test.go | 47 ++++++++++++++++++ 4 files changed, 102 insertions(+), 66 deletions(-) create mode 100644 regexp/regexp_test.go diff --git a/regexp/go.mod b/regexp/go.mod index 7e3bbc1..3c5592f 100644 --- a/regexp/go.mod +++ b/regexp/go.mod @@ -4,7 +4,11 @@ go 1.23 require ( github.com/ReneKroon/ttlcache v1.7.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect github.com/jellydator/ttlcache/v3 v3.3.0 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/stretchr/testify v1.9.0 // indirect golang.org/x/sync v0.8.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/regexp/go.sum b/regexp/go.sum index 6e709b5..5504f69 100644 --- a/regexp/go.sum +++ b/regexp/go.sum @@ -1,14 +1,21 @@ github.com/ReneKroon/ttlcache v1.7.0 h1:8BkjFfrzVFXyrqnMtezAaJ6AHPSsVV10m6w28N/Fgkk= github.com/ReneKroon/ttlcache v1.7.0/go.mod h1:8BGGzdumrIjWxdRx8zpK6L3oGMWvIXdvB2GD1cfvd+I= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/jellydator/ttlcache/v3 v3.3.0 h1:BdoC9cE81qXfrxeb9eoJi9dWrdhSuwXMAnHTbnBm4Wc= github.com/jellydator/ttlcache/v3 v3.3.0/go.mod h1:bj2/e0l4jRnQdrnSTaGTsh4GSXvMjQcy41i7th0GVGw= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= go.uber.org/goleak v0.10.0/go.mod h1:VCZuO8V8mFPlL0F5J5GK1rtHV3DrFcQ1R8ryq7FK0aI= golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/regexp/regexp.go b/regexp/regexp.go index cffbb5b..3c659fc 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -1,10 +1,12 @@ package regexp import ( - "github.com/jellydator/ttlcache/v3" + "reflect" "regexp" + "runtime" "sync" "time" + "unsafe" ) // Caches regexp compilation to avoid CPU and RAM usage for many duplicate regexps @@ -12,92 +14,68 @@ import ( const defaultTTL = 2 * time.Minute var ( - regexpCache *ttlcache.Cache[string, *regexp.Regexp] - posixRegexpCache *ttlcache.Cache[string, *regexp.Regexp] - ticker *time.Ticker - stopchan chan struct{} - setupLock sync.Mutex + weakMap = make(map[string]uintptr) + posixWeakMap = make(map[string]uintptr) + reverseMap = make(map[uintptr]string) + l sync.RWMutex ) -func init() { - SetCompileCacheTTL(defaultTTL) +func Compile(pattern string) (*regexp.Regexp, error) { + return compile(pattern, regexp.Compile) } -func SetCompileCacheTTL(ttl time.Duration) { - Stop() - setupLock.Lock() - defer setupLock.Unlock() - - regexpCache = ttlcache.New[string, *regexp.Regexp](ttlcache.WithTTL[string, *regexp.Regexp](ttl)) - posixRegexpCache = ttlcache.New[string, *regexp.Regexp](ttlcache.WithTTL[string, *regexp.Regexp](ttl)) - ticker = time.NewTicker(ttl) - stopchan = make(chan struct{}) - - go expire() +func CompilePOSIX(pattern string) (*regexp.Regexp, error) { + return compile(pattern, regexp.CompilePOSIX) } -func expire() { - for { - select { - case <-ticker.C: - regexpCache.DeleteExpired() - posixRegexpCache.DeleteExpired() - case <-stopchan: - ticker.Stop() - break - } - } -} func MustCompile(pattern string) *regexp.Regexp { - item := regexpCache.Get(pattern) - if item != nil { - return item.Value() - } - regex := regexp.MustCompile(pattern) - regexpCache.Set(pattern, regex, ttlcache.DefaultTTL) - return regex + return mustCompile(pattern, regexp.MustCompile) } func MustCompilePOSIX(pattern string) *regexp.Regexp { - item := posixRegexpCache.Get(pattern) - if item != nil { - return item.Value() - } - regex := regexp.MustCompilePOSIX(pattern) - posixRegexpCache.Set(pattern, regex, ttlcache.DefaultTTL) - return regex + return mustCompile(pattern, regexp.MustCompilePOSIX) } -func Compile(pattern string) (*regexp.Regexp, error) { - item := regexpCache.Get(pattern) - if item != nil { - return item.Value(), nil +func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error)) (*regexp.Regexp, error) { + l.RLock() + defer l.RUnlock() + if itemPtr, ok := weakMap[pattern]; ok { + return (*regexp.Regexp)(unsafe.Pointer(itemPtr)), nil } - regex, err := regexp.Compile(pattern) + regex, err := compileFunc(pattern) if err != nil { return nil, err } - regexpCache.Set(pattern, regex, ttlcache.DefaultTTL) + v := reflect.ValueOf(regex) + ptr := v.Pointer() + weakMap[pattern] = ptr + reverseMap[ptr] = pattern + runtime.SetFinalizer(regex, finalizer) return regex, nil } -func CompilePOSIX(pattern string) (*regexp.Regexp, error) { - item := posixRegexpCache.Get(pattern) - if item != nil { - return item.Value(), nil - } - regex, err := regexp.CompilePOSIX(pattern) - if err != nil { - return nil, err +func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp) *regexp.Regexp { + l.RLock() + defer l.RUnlock() + if itemPtr, ok := weakMap[pattern]; ok { + return (*regexp.Regexp)(unsafe.Pointer(itemPtr)) } - posixRegexpCache.Set(pattern, regex, ttlcache.DefaultTTL) - return regex, nil + regex := compileFunc(pattern) + v := reflect.ValueOf(regex) + ptr := v.Pointer() + weakMap[pattern] = ptr + reverseMap[ptr] = pattern + runtime.SetFinalizer(regex, finalizer) + return regex } -func Stop() { - setupLock.Lock() - defer setupLock.Unlock() - if ticker != nil { - stopchan <- struct{}{} +func finalizer(k *regexp.Regexp) { + l.Lock() + defer l.Unlock() + ptr := reflect.ValueOf(k).Pointer() + if s, ok := reverseMap[ptr]; ok { + delete(weakMap, s) + delete(posixWeakMap, s) + delete(reverseMap, ptr) } } diff --git a/regexp/regexp_test.go b/regexp/regexp_test.go new file mode 100644 index 0000000..6f062ad --- /dev/null +++ b/regexp/regexp_test.go @@ -0,0 +1,47 @@ +package regexp + +import ( + "github.com/stretchr/testify/require" + "regexp" + "testing" +) + +func TestRegexpCompilation(t *testing.T) { + t.Run("must", func(t *testing.T) { + testMust(t, regexp.MustCompile, MustCompile) + }) + t.Run("must-posix", func(t *testing.T) { + testMust(t, regexp.MustCompilePOSIX, MustCompilePOSIX) + }) + t.Run("errorable", func(t *testing.T) { + test(t, regexp.Compile, Compile) + }) + t.Run("errorable-posix", func(t *testing.T) { + test(t, regexp.CompilePOSIX, CompilePOSIX) + }) + // Unfortunately, GC behavior is untestably flaky +} + +func test(t *testing.T, compile, cachedCompile func(string) (*regexp.Regexp, error)) { + r1, err := compile(".*") + require.NoError(t, err) + r2, err := compile(".*") + require.NoError(t, err) + require.True(t, r1 != r2) + + r1, err = cachedCompile(".*") + require.NoError(t, err) + r2, err = cachedCompile(".*") + require.NoError(t, err) + require.True(t, r1 == r2) +} + +func testMust(t *testing.T, compile, cachedCompile func(string) *regexp.Regexp) { + r1 := compile(".*") + r2 := compile(".*") + require.True(t, r1 != r2) + + r1 = cachedCompile(".*") + r2 = cachedCompile(".*") + require.True(t, r1 == r2) +} From 84aceb1cf3eee6e46571311b082cf92b70be5d64 Mon Sep 17 00:00:00 2001 From: "Scott G. Miller" Date: Tue, 3 Sep 2024 11:41:19 -0500 Subject: [PATCH 03/20] fix locking semantics --- regexp/regexp.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/regexp/regexp.go b/regexp/regexp.go index 3c659fc..c5401a5 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -56,10 +56,14 @@ func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error)) ( func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp) *regexp.Regexp { l.RLock() - defer l.RUnlock() if itemPtr, ok := weakMap[pattern]; ok { + l.RUnlock() return (*regexp.Regexp)(unsafe.Pointer(itemPtr)) } + l.RUnlock() + l.Lock() + defer l.Unlock() + regex := compileFunc(pattern) v := reflect.ValueOf(regex) ptr := v.Pointer() From 892773d5e0b4e0662161f1d6c9c6871d18361609 Mon Sep 17 00:00:00 2001 From: "Scott G. Miller" Date: Tue, 3 Sep 2024 11:44:49 -0500 Subject: [PATCH 04/20] fix locking in other func, and use the right base maps --- regexp/regexp.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/regexp/regexp.go b/regexp/regexp.go index c5401a5..eba3791 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -21,22 +21,22 @@ var ( ) func Compile(pattern string) (*regexp.Regexp, error) { - return compile(pattern, regexp.Compile) + return compile(pattern, regexp.Compile, weakMap) } func CompilePOSIX(pattern string) (*regexp.Regexp, error) { - return compile(pattern, regexp.CompilePOSIX) + return compile(pattern, regexp.CompilePOSIX, posixWeakMap) } func MustCompile(pattern string) *regexp.Regexp { - return mustCompile(pattern, regexp.MustCompile) + return mustCompile(pattern, regexp.MustCompile, weakMap) } func MustCompilePOSIX(pattern string) *regexp.Regexp { - return mustCompile(pattern, regexp.MustCompilePOSIX) + return mustCompile(pattern, regexp.MustCompilePOSIX, posixWeakMap) } -func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error)) (*regexp.Regexp, error) { +func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), weakMap map[string]uintptr) (*regexp.Regexp, error) { l.RLock() defer l.RUnlock() if itemPtr, ok := weakMap[pattern]; ok { @@ -54,7 +54,7 @@ func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error)) ( return regex, nil } -func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp) *regexp.Regexp { +func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp, weakMap map[string]uintptr) *regexp.Regexp { l.RLock() if itemPtr, ok := weakMap[pattern]; ok { l.RUnlock() From cf697c9b00f4ccfe9aa5fd95e24cc2b0892d4845 Mon Sep 17 00:00:00 2001 From: "Scott G. Miller" Date: Tue, 3 Sep 2024 13:28:02 -0500 Subject: [PATCH 05/20] Rename to be distinct from the original functions and thus not as subtle to later authors where used --- regexp/regexp.go | 17 +++++++++-------- regexp/regexp_test.go | 29 +++++++++++++++++++++++------ 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/regexp/regexp.go b/regexp/regexp.go index eba3791..4810e06 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -5,13 +5,14 @@ import ( "regexp" "runtime" "sync" - "time" "unsafe" ) -// Caches regexp compilation to avoid CPU and RAM usage for many duplicate regexps - -const defaultTTL = 2 * time.Minute +// "Interns" compilation of Regular Expressions. If two regexs with the same pattern are compiled, the result +// is the same *regexp.Regexp. This avoids the compilation cost but more importantly the memory usage. +// +// Regexps produced from this package are backed by a form of weak-valued map, upon a regex becoming +// unreachable, they will be eventually removed from the map and memory reclaimed. var ( weakMap = make(map[string]uintptr) @@ -20,19 +21,19 @@ var ( l sync.RWMutex ) -func Compile(pattern string) (*regexp.Regexp, error) { +func CompileInterned(pattern string) (*regexp.Regexp, error) { return compile(pattern, regexp.Compile, weakMap) } -func CompilePOSIX(pattern string) (*regexp.Regexp, error) { +func CompilePOSIXInterned(pattern string) (*regexp.Regexp, error) { return compile(pattern, regexp.CompilePOSIX, posixWeakMap) } -func MustCompile(pattern string) *regexp.Regexp { +func MustCompileInterned(pattern string) *regexp.Regexp { return mustCompile(pattern, regexp.MustCompile, weakMap) } -func MustCompilePOSIX(pattern string) *regexp.Regexp { +func MustCompilePOSIXInterned(pattern string) *regexp.Regexp { return mustCompile(pattern, regexp.MustCompilePOSIX, posixWeakMap) } diff --git a/regexp/regexp_test.go b/regexp/regexp_test.go index 6f062ad..0b271aa 100644 --- a/regexp/regexp_test.go +++ b/regexp/regexp_test.go @@ -6,20 +6,37 @@ import ( "testing" ) -func TestRegexpCompilation(t *testing.T) { +func TestInterenedRegexps(t *testing.T) { t.Run("must", func(t *testing.T) { - testMust(t, regexp.MustCompile, MustCompile) + testMust(t, regexp.MustCompile, MustCompileInterned) }) t.Run("must-posix", func(t *testing.T) { - testMust(t, regexp.MustCompilePOSIX, MustCompilePOSIX) + testMust(t, regexp.MustCompilePOSIX, MustCompilePOSIXInterned) }) t.Run("errorable", func(t *testing.T) { - test(t, regexp.Compile, Compile) + test(t, regexp.Compile, CompileInterned) }) t.Run("errorable-posix", func(t *testing.T) { - test(t, regexp.CompilePOSIX, CompilePOSIX) + test(t, regexp.CompilePOSIX, CompilePOSIXInterned) }) - // Unfortunately, GC behavior is untestably flaky + // Check errors + _, err := CompileInterned("(") + require.Error(t, err) + + // Unfortunately, GC behavior is non-deterministic, this section of code works, but not reliably: + /* + ptr1 := reflect.ValueOf(r1).Pointer() + r1 = nil + r2 = nil + runtime.GC() + runtime.GC() + r2, err = MustCompile(".*") + require.NoError(t, err) + ptr2 := reflect.ValueOf(r2).Pointer() + // If GC occurred, this will be a brand new pointer as the regex was removed from maps + require.True(t, ptr1 != ptr2) + + */ } func test(t *testing.T, compile, cachedCompile func(string) (*regexp.Regexp, error)) { From bda7c56211685a8a19e45ebfdd7a1fcf85cead52 Mon Sep 17 00:00:00 2001 From: "Scott G. Miller" Date: Tue, 3 Sep 2024 14:52:51 -0500 Subject: [PATCH 06/20] Simplify locking, add benchmarks --- regexp/regexp.go | 12 +++++------- regexp/regexp_test.go | 45 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/regexp/regexp.go b/regexp/regexp.go index 4810e06..6670d8c 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -38,11 +38,12 @@ func MustCompilePOSIXInterned(pattern string) *regexp.Regexp { } func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), weakMap map[string]uintptr) (*regexp.Regexp, error) { - l.RLock() - defer l.RUnlock() + l.Lock() + defer l.Unlock() if itemPtr, ok := weakMap[pattern]; ok { return (*regexp.Regexp)(unsafe.Pointer(itemPtr)), nil } + regex, err := compileFunc(pattern) if err != nil { return nil, err @@ -56,14 +57,11 @@ func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), w } func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp, weakMap map[string]uintptr) *regexp.Regexp { - l.RLock() + l.Lock() + defer l.Unlock() if itemPtr, ok := weakMap[pattern]; ok { - l.RUnlock() return (*regexp.Regexp)(unsafe.Pointer(itemPtr)) } - l.RUnlock() - l.Lock() - defer l.Unlock() regex := compileFunc(pattern) v := reflect.ValueOf(regex) diff --git a/regexp/regexp_test.go b/regexp/regexp_test.go index 0b271aa..6382e9e 100644 --- a/regexp/regexp_test.go +++ b/regexp/regexp_test.go @@ -3,6 +3,9 @@ package regexp import ( "github.com/stretchr/testify/require" "regexp" + "runtime" + "strconv" + "sync" "testing" ) @@ -62,3 +65,45 @@ func testMust(t *testing.T, compile, cachedCompile func(string) *regexp.Regexp) r2 = cachedCompile(".*") require.True(t, r1 == r2) } + +func BenchmarkRegexps(b *testing.B) { + s := make([]*regexp.Regexp, b.N) + for i := 0; i < b.N; i++ { + s[i] = regexp.MustCompile(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)`) + } +} + +func BenchmarkInternedRegexps(b *testing.B) { + s := make([]*regexp.Regexp, b.N) + for i := 0; i < b.N; i++ { + s[i] = MustCompileInterned(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)`) + } +} + +func BenchmarkConcurrentRegexps(b *testing.B) { + var wg sync.WaitGroup + for j := 0; j < runtime.NumCPU(); j++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < b.N; i++ { + regexp.MustCompile(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)` + strconv.Itoa(i) + "-" + strconv.Itoa(j)) + } + }() + } + wg.Wait() +} + +func BenchmarkConcurrentInternedRegexps(b *testing.B) { + var wg sync.WaitGroup + for j := 0; j < runtime.NumCPU(); j++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < b.N; i++ { + MustCompileInterned(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)` + strconv.Itoa(i) + "-" + strconv.Itoa(j)) + } + }() + } + wg.Wait() +} From fb4f51d4f91558c29b223e11bccde3058a052146 Mon Sep 17 00:00:00 2001 From: "Scott G. Miller" Date: Tue, 18 Feb 2025 12:16:55 -0600 Subject: [PATCH 07/20] remove unneeded deps --- regexp/go.mod | 7 ++----- regexp/go.sum | 13 +------------ 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/regexp/go.mod b/regexp/go.mod index 3c5592f..0378f24 100644 --- a/regexp/go.mod +++ b/regexp/go.mod @@ -2,13 +2,10 @@ module github.com/hashicorp/go-secure-stdlib/regexp go 1.23 +require github.com/stretchr/testify v1.9.0 + require ( - github.com/ReneKroon/ttlcache v1.7.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect - github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect - github.com/jellydator/ttlcache/v3 v3.3.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/stretchr/testify v1.9.0 // indirect - golang.org/x/sync v0.8.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/regexp/go.sum b/regexp/go.sum index 5504f69..60ce688 100644 --- a/regexp/go.sum +++ b/regexp/go.sum @@ -1,21 +1,10 @@ -github.com/ReneKroon/ttlcache v1.7.0 h1:8BkjFfrzVFXyrqnMtezAaJ6AHPSsVV10m6w28N/Fgkk= -github.com/ReneKroon/ttlcache v1.7.0/go.mod h1:8BGGzdumrIjWxdRx8zpK6L3oGMWvIXdvB2GD1cfvd+I= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= -github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= -github.com/jellydator/ttlcache/v3 v3.3.0 h1:BdoC9cE81qXfrxeb9eoJi9dWrdhSuwXMAnHTbnBm4Wc= -github.com/jellydator/ttlcache/v3 v3.3.0/go.mod h1:bj2/e0l4jRnQdrnSTaGTsh4GSXvMjQcy41i7th0GVGw= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -go.uber.org/goleak v0.10.0/go.mod h1:VCZuO8V8mFPlL0F5J5GK1rtHV3DrFcQ1R8ryq7FK0aI= -golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= -golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= From 006c38253c8f35feebd786faf334b93699951f99 Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Wed, 19 Feb 2025 18:33:38 +0000 Subject: [PATCH 08/20] Refactor the code to use weak pointers for weak map implementation --- regexp/regexp.go | 73 ++++++++++++++++----------- regexp/regexp_test.go | 114 ++++++++++++++++++++++++------------------ 2 files changed, 110 insertions(+), 77 deletions(-) diff --git a/regexp/regexp.go b/regexp/regexp.go index 6670d8c..191d8c7 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -1,81 +1,98 @@ package regexp import ( - "reflect" "regexp" "runtime" "sync" - "unsafe" + "weak" ) // "Interns" compilation of Regular Expressions. If two regexs with the same pattern are compiled, the result // is the same *regexp.Regexp. This avoids the compilation cost but more importantly the memory usage. // -// Regexps produced from this package are backed by a form of weak-valued map, upon a regex becoming +// Regular expressions produced from this package are backed by a form of weak-valued map, upon a regexp becoming // unreachable, they will be eventually removed from the map and memory reclaimed. var ( - weakMap = make(map[string]uintptr) - posixWeakMap = make(map[string]uintptr) - reverseMap = make(map[uintptr]string) + weakMap = make(map[string]weak.Pointer[regexp.Regexp]) + posixWeakMap = make(map[string]weak.Pointer[regexp.Regexp]) + reverseMap = make(map[weak.Pointer[regexp.Regexp]]string) l sync.RWMutex ) +// CompileInterned compiles and interns a regular expression and returns a +// pointer to it or an error. func CompileInterned(pattern string) (*regexp.Regexp, error) { return compile(pattern, regexp.Compile, weakMap) } +// CompilePOSIXInterned compiles and interns a regular expression using POSIX +// syntax and returns a pointer to it or an error. func CompilePOSIXInterned(pattern string) (*regexp.Regexp, error) { return compile(pattern, regexp.CompilePOSIX, posixWeakMap) } +// MustCompileInterned compiles and interns a regular expression and returns a pointer to it or panics. func MustCompileInterned(pattern string) *regexp.Regexp { return mustCompile(pattern, regexp.MustCompile, weakMap) } +// MustCompilePOSIXInterned compiles and interns a regular expression using +// POSIX syntax and returns a pointer to it or panics. func MustCompilePOSIXInterned(pattern string) *regexp.Regexp { return mustCompile(pattern, regexp.MustCompilePOSIX, posixWeakMap) } -func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), weakMap map[string]uintptr) (*regexp.Regexp, error) { +// compile handles compiling and interning regular expressions. If the regexp is +// already interned, a pointer to it is returned from the map. If the regexp is +// not interned or is nil (since it's a weak pointer), it is compiled and stored +// in the maps. The regexp is stored in the maps as a weak pointer, so that it +// can be garbage collected. +func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), weakMap map[string]weak.Pointer[regexp.Regexp]) (*regexp.Regexp, error) { l.Lock() defer l.Unlock() if itemPtr, ok := weakMap[pattern]; ok { - return (*regexp.Regexp)(unsafe.Pointer(itemPtr)), nil + ptr := itemPtr.Value() + if ptr != nil { + return itemPtr.Value(), nil + } } - - regex, err := compileFunc(pattern) + r, err := compileFunc(pattern) if err != nil { return nil, err } - v := reflect.ValueOf(regex) - ptr := v.Pointer() - weakMap[pattern] = ptr - reverseMap[ptr] = pattern - runtime.SetFinalizer(regex, finalizer) - return regex, nil + weakPointer := weak.Make(r) + weakMap[pattern] = weakPointer + reverseMap[weakPointer] = pattern + runtime.AddCleanup(r, cleanup, weakPointer) + return r, nil } -func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp, weakMap map[string]uintptr) *regexp.Regexp { +// mustCompile handles compiling and interning regular expressions just like +// compile, but it will panic instead of returning an error. If the regexp is +// already interned, a pointer to it is returned from the map. If the regexp is +// not interned or is nil (since it's a weak pointer), it is compiled and stored +// in the maps. The regexp is stored in the maps as a weak pointer, so that it +// can be garbage collected. +func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp, weakMap map[string]weak.Pointer[regexp.Regexp]) *regexp.Regexp { l.Lock() defer l.Unlock() if itemPtr, ok := weakMap[pattern]; ok { - return (*regexp.Regexp)(unsafe.Pointer(itemPtr)) + return itemPtr.Value() } - - regex := compileFunc(pattern) - v := reflect.ValueOf(regex) - ptr := v.Pointer() - weakMap[pattern] = ptr - reverseMap[ptr] = pattern - runtime.SetFinalizer(regex, finalizer) - return regex + r := compileFunc(pattern) + weakPointer := weak.Make(r) + weakMap[pattern] = weakPointer + reverseMap[weakPointer] = pattern + runtime.AddCleanup(r, cleanup, weakPointer) + return r } -func finalizer(k *regexp.Regexp) { +// cleanup is a cleanup function for *regexp.Regexp. It removes the entries from the +// weak maps when the regexp object they point to is garbage collected. +func cleanup(ptr weak.Pointer[regexp.Regexp]) { l.Lock() defer l.Unlock() - ptr := reflect.ValueOf(k).Pointer() if s, ok := reverseMap[ptr]; ok { delete(weakMap, s) delete(posixWeakMap, s) diff --git a/regexp/regexp_test.go b/regexp/regexp_test.go index 6382e9e..117a817 100644 --- a/regexp/regexp_test.go +++ b/regexp/regexp_test.go @@ -9,61 +9,77 @@ import ( "testing" ) -func TestInterenedRegexps(t *testing.T) { - t.Run("must", func(t *testing.T) { - testMust(t, regexp.MustCompile, MustCompileInterned) - }) - t.Run("must-posix", func(t *testing.T) { - testMust(t, regexp.MustCompilePOSIX, MustCompilePOSIXInterned) - }) - t.Run("errorable", func(t *testing.T) { - test(t, regexp.Compile, CompileInterned) - }) - t.Run("errorable-posix", func(t *testing.T) { - test(t, regexp.CompilePOSIX, CompilePOSIXInterned) - }) - // Check errors - _, err := CompileInterned("(") - require.Error(t, err) +// TestInternedRegexps tests that the regular expressions are compiled correctly, +// are interned, and that the cleanup of interned regexps works as expected. +// +// Since this test depends on the garbage collector, it is not really +// deterministic and might flake in the future. If that happens, the calls to +// the garbage collector and the rest of the test should be removed. +func TestInternedRegexps(t *testing.T) { + testCases := map[string]struct { + compileFunc func(string) (*regexp.Regexp, error) + mustCompileFunc func(string) *regexp.Regexp + mustCompile bool + }{ + "CompileInterned": { + compileFunc: CompileInterned, + mustCompileFunc: nil, + mustCompile: false, + }, + "MustCompileInterned": { + compileFunc: nil, + mustCompileFunc: MustCompileInterned, + mustCompile: true, + }, + "CompilePOSIXInterned": { + compileFunc: CompilePOSIXInterned, + mustCompileFunc: nil, + mustCompile: false, + }, + "MustCompilePOSIXInterned": { + compileFunc: nil, + mustCompileFunc: MustCompilePOSIXInterned, + mustCompile: true, + }, + } + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + // Compile two identical regular expressions, their pointers should be the same + var r1, r2 *regexp.Regexp + var err error + if tc.mustCompile { + r1 = tc.mustCompileFunc(".*") + r2 = tc.mustCompileFunc(".*") + } else { + r1, err = tc.compileFunc(".*") + require.NoError(t, err) + r2, err = tc.compileFunc(".*") + require.NoError(t, err) + require.True(t, r1 == r2) - // Unfortunately, GC behavior is non-deterministic, this section of code works, but not reliably: - /* - ptr1 := reflect.ValueOf(r1).Pointer() + // While we're here, check that errors work as expected + _, err = tc.compileFunc("(") + require.Error(t, err) + } + require.True(t, r1 == r2) + // Remove references to the regexps and run the garbage collector r1 = nil r2 = nil + + // Run the garbage collector twice to increase chances of the cleanup happening. + // This still doesn't make it deterministic, but in local testing it was enough + // to not flake a single time in over two million runs, so it should be good enough. + // A single call to runtime.GC() was flaky very frequently in local testing. runtime.GC() runtime.GC() - r2, err = MustCompile(".*") - require.NoError(t, err) - ptr2 := reflect.ValueOf(r2).Pointer() - // If GC occurred, this will be a brand new pointer as the regex was removed from maps - require.True(t, ptr1 != ptr2) - - */ -} -func test(t *testing.T, compile, cachedCompile func(string) (*regexp.Regexp, error)) { - r1, err := compile(".*") - require.NoError(t, err) - r2, err := compile(".*") - require.NoError(t, err) - require.True(t, r1 != r2) - - r1, err = cachedCompile(".*") - require.NoError(t, err) - r2, err = cachedCompile(".*") - require.NoError(t, err) - require.True(t, r1 == r2) -} - -func testMust(t *testing.T, compile, cachedCompile func(string) *regexp.Regexp) { - r1 := compile(".*") - r2 := compile(".*") - require.True(t, r1 != r2) - - r1 = cachedCompile(".*") - r2 = cachedCompile(".*") - require.True(t, r1 == r2) + // Ensure that the cleanup happened and the maps used for interning regexp are empty + l.Lock() + require.Len(t, weakMap, 0) + require.Len(t, reverseMap, 0) + l.Unlock() + }) + } } func BenchmarkRegexps(b *testing.B) { From b2aa9bd6adeeffc987c0c0fbded46821cb8b2234 Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Wed, 19 Feb 2025 19:04:16 +0000 Subject: [PATCH 09/20] If a weak pointer is nil, remove the entry from maps --- regexp/regexp.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/regexp/regexp.go b/regexp/regexp.go index 191d8c7..87f17d2 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -56,6 +56,9 @@ func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), w if ptr != nil { return itemPtr.Value(), nil } + delete(weakMap, pattern) + delete(posixWeakMap, pattern) + delete(reverseMap, itemPtr) } r, err := compileFunc(pattern) if err != nil { @@ -78,7 +81,13 @@ func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp, weakMa l.Lock() defer l.Unlock() if itemPtr, ok := weakMap[pattern]; ok { - return itemPtr.Value() + ptr := itemPtr.Value() + if ptr != nil { + return itemPtr.Value() + } + delete(weakMap, pattern) + delete(posixWeakMap, pattern) + delete(reverseMap, itemPtr) } r := compileFunc(pattern) weakPointer := weak.Make(r) From d583b640d0a8c8609ebb95d6af8685e94c28b5c3 Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Thu, 20 Feb 2025 16:13:44 +0000 Subject: [PATCH 10/20] Add regexp to the module list in the GitHub Actions go.yml workflow file, so that the tests for it are run in CI --- .github/workflows/go.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index cefe191..c24ddbe 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -23,6 +23,7 @@ jobs: "plugincontainer", "pluginutil", "random", + "regexp", "reloadutil", "strutil", "temperror", From 199d4648724bac134862d050023d2f623ddef39e Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Thu, 20 Feb 2025 17:54:55 +0000 Subject: [PATCH 11/20] Update the Go directive in go.mod to version 1.24, so that it's clear what stdlib version to use with this module --- regexp/go.mod | 4 ++-- regexp/go.sum | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/regexp/go.mod b/regexp/go.mod index 0378f24..54a5eb9 100644 --- a/regexp/go.mod +++ b/regexp/go.mod @@ -1,8 +1,8 @@ module github.com/hashicorp/go-secure-stdlib/regexp -go 1.23 +go 1.24 -require github.com/stretchr/testify v1.9.0 +require github.com/stretchr/testify v1.10.0 require ( github.com/davecgh/go-spew v1.1.1 // indirect diff --git a/regexp/go.sum b/regexp/go.sum index 60ce688..713a0b4 100644 --- a/regexp/go.sum +++ b/regexp/go.sum @@ -2,8 +2,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= From 071bc6fb408a627c06f061f6288efeafedae4b99 Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Thu, 20 Feb 2025 17:55:40 +0000 Subject: [PATCH 12/20] Swap RWMutex for a Mutex --- regexp/regexp.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regexp/regexp.go b/regexp/regexp.go index 87f17d2..8ba7125 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -17,7 +17,7 @@ var ( weakMap = make(map[string]weak.Pointer[regexp.Regexp]) posixWeakMap = make(map[string]weak.Pointer[regexp.Regexp]) reverseMap = make(map[weak.Pointer[regexp.Regexp]]string) - l sync.RWMutex + l sync.Mutex ) // CompileInterned compiles and interns a regular expression and returns a From 7eaf9deae048396f36ae8edda05805dda6d0aa73 Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Thu, 20 Feb 2025 17:57:28 +0000 Subject: [PATCH 13/20] Tidying up based on PR feedback; rename some variables to fix shadowing issues, fix returns in compile and mustComplile, edit a comment --- regexp/regexp.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/regexp/regexp.go b/regexp/regexp.go index 8ba7125..ff53bc2 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -7,7 +7,7 @@ import ( "weak" ) -// "Interns" compilation of Regular Expressions. If two regexs with the same pattern are compiled, the result +// Interns the compilation of Regular Expressions. If two regexs with the same pattern are compiled, the result // is the same *regexp.Regexp. This avoids the compilation cost but more importantly the memory usage. // // Regular expressions produced from this package are backed by a form of weak-valued map, upon a regexp becoming @@ -48,15 +48,15 @@ func MustCompilePOSIXInterned(pattern string) *regexp.Regexp { // not interned or is nil (since it's a weak pointer), it is compiled and stored // in the maps. The regexp is stored in the maps as a weak pointer, so that it // can be garbage collected. -func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), weakMap map[string]weak.Pointer[regexp.Regexp]) (*regexp.Regexp, error) { +func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), m map[string]weak.Pointer[regexp.Regexp]) (*regexp.Regexp, error) { l.Lock() defer l.Unlock() - if itemPtr, ok := weakMap[pattern]; ok { + if itemPtr, ok := m[pattern]; ok { ptr := itemPtr.Value() if ptr != nil { - return itemPtr.Value(), nil + return ptr, nil } - delete(weakMap, pattern) + delete(m, pattern) delete(posixWeakMap, pattern) delete(reverseMap, itemPtr) } @@ -65,7 +65,7 @@ func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), w return nil, err } weakPointer := weak.Make(r) - weakMap[pattern] = weakPointer + m[pattern] = weakPointer reverseMap[weakPointer] = pattern runtime.AddCleanup(r, cleanup, weakPointer) return r, nil @@ -77,13 +77,13 @@ func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), w // not interned or is nil (since it's a weak pointer), it is compiled and stored // in the maps. The regexp is stored in the maps as a weak pointer, so that it // can be garbage collected. -func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp, weakMap map[string]weak.Pointer[regexp.Regexp]) *regexp.Regexp { +func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp, m map[string]weak.Pointer[regexp.Regexp]) *regexp.Regexp { l.Lock() defer l.Unlock() - if itemPtr, ok := weakMap[pattern]; ok { + if itemPtr, ok := m[pattern]; ok { ptr := itemPtr.Value() if ptr != nil { - return itemPtr.Value() + return ptr } delete(weakMap, pattern) delete(posixWeakMap, pattern) @@ -91,7 +91,7 @@ func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp, weakMa } r := compileFunc(pattern) weakPointer := weak.Make(r) - weakMap[pattern] = weakPointer + m[pattern] = weakPointer reverseMap[weakPointer] = pattern runtime.AddCleanup(r, cleanup, weakPointer) return r From 02308598369490ac8326230aefcb43fb15ffb22f Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Thu, 20 Feb 2025 18:55:06 +0000 Subject: [PATCH 14/20] Fix the formatting of go.yml to resolve a git conflict with main branch --- .github/workflows/go.yml | 45 +++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index c24ddbe..98f232a 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -8,27 +8,30 @@ jobs: strategy: fail-fast: false matrix: - module: ["awsutil", - "base62", - "configutil", - "fileutil", - "gatedwriter", - "httputil", - "kv-builder", - "listenerutil", - "mlock", - "nonceutil", - "parseutil", - "password", - "plugincontainer", - "pluginutil", - "random", - "regexp", - "reloadutil", - "strutil", - "temperror", - "tlsutil", - "toggledlogger"] + module: + - "awsutil" + - "base62" + - "configutil" + - "cryptoutil" + - "fileutil" + - "gatedwriter" + - "httputil" + - "kv-builder" + - "listenerutil" + - "mlock" + - "nonceutil" + - "parseutil" + - "password" + - "permitpool" + - "plugincontainer" + - "pluginutil" + - "random" + - "regexp" + - "reloadutil" + - "strutil" + - "temperror" + - "tlsutil" + - "toggledlogger" runs-on: ubuntu-latest steps: - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3 From d99b0b1371c5b0bea8ab398a7bc82c978ca7a342 Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Thu, 20 Feb 2025 21:08:16 +0000 Subject: [PATCH 15/20] Refactor mustCompile to wrap compile in a closure to reduce code duplication --- regexp/regexp.go | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/regexp/regexp.go b/regexp/regexp.go index ff53bc2..52e8dc1 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -77,24 +77,12 @@ func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), m // not interned or is nil (since it's a weak pointer), it is compiled and stored // in the maps. The regexp is stored in the maps as a weak pointer, so that it // can be garbage collected. -func mustCompile(pattern string, compileFunc func(string) *regexp.Regexp, m map[string]weak.Pointer[regexp.Regexp]) *regexp.Regexp { - l.Lock() - defer l.Unlock() - if itemPtr, ok := m[pattern]; ok { - ptr := itemPtr.Value() - if ptr != nil { - return ptr - } - delete(weakMap, pattern) - delete(posixWeakMap, pattern) - delete(reverseMap, itemPtr) +func mustCompile(pattern string, mustCompileFunc func(string) *regexp.Regexp, weakMap map[string]weak.Pointer[regexp.Regexp]) *regexp.Regexp { + compileFunc := func(string) (*regexp.Regexp, error) { + return mustCompileFunc(pattern), nil } - r := compileFunc(pattern) - weakPointer := weak.Make(r) - m[pattern] = weakPointer - reverseMap[weakPointer] = pattern - runtime.AddCleanup(r, cleanup, weakPointer) - return r + res, _ := compile(pattern, compileFunc, weakMap) + return res } // cleanup is a cleanup function for *regexp.Regexp. It removes the entries from the From 9c844f2ea1502369964e908ca7af8c4e476aeeac Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Thu, 20 Feb 2025 21:42:55 +0000 Subject: [PATCH 16/20] Tidy up imports in regexp/regexp_test.go --- regexp/regexp_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/regexp/regexp_test.go b/regexp/regexp_test.go index 117a817..b979e9f 100644 --- a/regexp/regexp_test.go +++ b/regexp/regexp_test.go @@ -1,12 +1,13 @@ package regexp import ( - "github.com/stretchr/testify/require" "regexp" "runtime" "strconv" "sync" "testing" + + "github.com/stretchr/testify/require" ) // TestInternedRegexps tests that the regular expressions are compiled correctly, From b21b980129c6857a1e6e6fe782790cf53212abf0 Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Fri, 21 Feb 2025 15:10:01 +0000 Subject: [PATCH 17/20] Refactor compile and mustCompile to only delete the pointers from the map passed in as an argument and the reverseMap; clean up some comments --- regexp/regexp.go | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/regexp/regexp.go b/regexp/regexp.go index 52e8dc1..099cafd 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -48,16 +48,15 @@ func MustCompilePOSIXInterned(pattern string) *regexp.Regexp { // not interned or is nil (since it's a weak pointer), it is compiled and stored // in the maps. The regexp is stored in the maps as a weak pointer, so that it // can be garbage collected. -func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), m map[string]weak.Pointer[regexp.Regexp]) (*regexp.Regexp, error) { +func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), internedPointers map[string]weak.Pointer[regexp.Regexp]) (*regexp.Regexp, error) { l.Lock() defer l.Unlock() - if itemPtr, ok := m[pattern]; ok { + if itemPtr, ok := internedPointers[pattern]; ok { ptr := itemPtr.Value() if ptr != nil { return ptr, nil } - delete(m, pattern) - delete(posixWeakMap, pattern) + delete(internedPointers, pattern) delete(reverseMap, itemPtr) } r, err := compileFunc(pattern) @@ -65,23 +64,23 @@ func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), m return nil, err } weakPointer := weak.Make(r) - m[pattern] = weakPointer + internedPointers[pattern] = weakPointer reverseMap[weakPointer] = pattern runtime.AddCleanup(r, cleanup, weakPointer) return r, nil } -// mustCompile handles compiling and interning regular expressions just like -// compile, but it will panic instead of returning an error. If the regexp is -// already interned, a pointer to it is returned from the map. If the regexp is -// not interned or is nil (since it's a weak pointer), it is compiled and stored -// in the maps. The regexp is stored in the maps as a weak pointer, so that it -// can be garbage collected. -func mustCompile(pattern string, mustCompileFunc func(string) *regexp.Regexp, weakMap map[string]weak.Pointer[regexp.Regexp]) *regexp.Regexp { +// mustCompile is a wrapper around compile that is used when we want to panic +// instead of returning an error. If the regexp is already interned, a pointer +// to it is returned from the map. If the regexp is not interned or is nil +// (since it's a weak pointer), it is compiled and stored in the maps. The +// regexp is stored in the maps as a weak pointer, so that it can be garbage +// collected. +func mustCompile(pattern string, mustCompileFunc func(string) *regexp.Regexp, internedPointers map[string]weak.Pointer[regexp.Regexp]) *regexp.Regexp { compileFunc := func(string) (*regexp.Regexp, error) { return mustCompileFunc(pattern), nil } - res, _ := compile(pattern, compileFunc, weakMap) + res, _ := compile(pattern, compileFunc, internedPointers) return res } From ea0066a56d06888a2027f22e7dc22ef0948ba047 Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Fri, 21 Feb 2025 16:34:20 +0000 Subject: [PATCH 18/20] Rename cleanup to cleanupCollectedPointers and refactor it to only delete entries from either the POSIX or non-POSIX map, rather than both --- regexp/regexp.go | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/regexp/regexp.go b/regexp/regexp.go index 099cafd..f258694 100644 --- a/regexp/regexp.go +++ b/regexp/regexp.go @@ -66,7 +66,13 @@ func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), i weakPointer := weak.Make(r) internedPointers[pattern] = weakPointer reverseMap[weakPointer] = pattern + + // Register a cleanup function for the regexp object + cleanup := func(ptr weak.Pointer[regexp.Regexp]) { + cleanupCollectedPointers(ptr, internedPointers) + } runtime.AddCleanup(r, cleanup, weakPointer) + return r, nil } @@ -84,14 +90,14 @@ func mustCompile(pattern string, mustCompileFunc func(string) *regexp.Regexp, in return res } -// cleanup is a cleanup function for *regexp.Regexp. It removes the entries from the -// weak maps when the regexp object they point to is garbage collected. -func cleanup(ptr weak.Pointer[regexp.Regexp]) { +// cleanupCollectedPointers is a cleanup function for *regexp.Regexp. It removes +// the entries from relevant maps when the regexp object they point to is +// garbage collected. +func cleanupCollectedPointers(ptr weak.Pointer[regexp.Regexp], internedPointers map[string]weak.Pointer[regexp.Regexp]) { l.Lock() defer l.Unlock() if s, ok := reverseMap[ptr]; ok { - delete(weakMap, s) - delete(posixWeakMap, s) + delete(internedPointers, s) delete(reverseMap, ptr) } } From 0b4566a3139256d37ae554f6e66dd89e395b4de5 Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Mon, 24 Feb 2025 13:48:51 +0000 Subject: [PATCH 19/20] Add a retry mechanism with a timeout to TestInternedRegexps --- regexp/regexp_test.go | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/regexp/regexp_test.go b/regexp/regexp_test.go index b979e9f..fbfa527 100644 --- a/regexp/regexp_test.go +++ b/regexp/regexp_test.go @@ -6,6 +6,7 @@ import ( "strconv" "sync" "testing" + "time" "github.com/stretchr/testify/require" ) @@ -45,7 +46,7 @@ func TestInternedRegexps(t *testing.T) { } for name, tc := range testCases { t.Run(name, func(t *testing.T) { - // Compile two identical regular expressions, their pointers should be the same + // Compile two identical regular expressions, their pointers should be the same. var r1, r2 *regexp.Regexp var err error if tc.mustCompile { @@ -56,29 +57,40 @@ func TestInternedRegexps(t *testing.T) { require.NoError(t, err) r2, err = tc.compileFunc(".*") require.NoError(t, err) - require.True(t, r1 == r2) - // While we're here, check that errors work as expected + // While we're here, check that errors work as expected. _, err = tc.compileFunc("(") require.Error(t, err) } require.True(t, r1 == r2) - // Remove references to the regexps and run the garbage collector + + // Remove references to the regexps, and run the garbage collector in a loop to see if the cleanup happens. r1 = nil r2 = nil + deadline := time.Now().Add(10 * time.Second) + for { + // Run the garbage collector twice to increase chances of the cleanup happening. + // This still doesn't make it deterministic, but in local testing it was enough + // to not flake a single time in over two million runs, so it should be good enough. + // A single call to runtime.GC() was flaking very frequently in local testing. + runtime.GC() + runtime.GC() - // Run the garbage collector twice to increase chances of the cleanup happening. - // This still doesn't make it deterministic, but in local testing it was enough - // to not flake a single time in over two million runs, so it should be good enough. - // A single call to runtime.GC() was flaky very frequently in local testing. - runtime.GC() - runtime.GC() + // Ensure that the cleanup happened and the maps used for interning regexp are empty. + l.Lock() + wmlen := len(weakMap) + rmlen := len(reverseMap) + l.Unlock() - // Ensure that the cleanup happened and the maps used for interning regexp are empty - l.Lock() - require.Len(t, weakMap, 0) - require.Len(t, reverseMap, 0) - l.Unlock() + if wmlen == 0 && rmlen == 0 { + // Cleanup happened, test can exit successfully. + break + } + if time.Now().After(deadline) { + t.Fatalf("cleanup of interned regexps did not happen in time") + } + time.Sleep(1 * time.Second) + } }) } } From 5d9a94a24fed4c7de9949f0636623d7956507892 Mon Sep 17 00:00:00 2001 From: Kuba Wieczorek Date: Mon, 24 Feb 2025 16:44:08 +0000 Subject: [PATCH 20/20] Add a test that ensures that the cleanup function does not confuse POSIX and non-POSIX regexs --- regexp/regexp_test.go | 51 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/regexp/regexp_test.go b/regexp/regexp_test.go index fbfa527..1f29572 100644 --- a/regexp/regexp_test.go +++ b/regexp/regexp_test.go @@ -49,13 +49,14 @@ func TestInternedRegexps(t *testing.T) { // Compile two identical regular expressions, their pointers should be the same. var r1, r2 *regexp.Regexp var err error + pattern := ".*" if tc.mustCompile { - r1 = tc.mustCompileFunc(".*") - r2 = tc.mustCompileFunc(".*") + r1 = tc.mustCompileFunc(pattern) + r2 = tc.mustCompileFunc(pattern) } else { - r1, err = tc.compileFunc(".*") + r1, err = tc.compileFunc(pattern) require.NoError(t, err) - r2, err = tc.compileFunc(".*") + r2, err = tc.compileFunc(pattern) require.NoError(t, err) // While we're here, check that errors work as expected. @@ -95,6 +96,48 @@ func TestInternedRegexps(t *testing.T) { } } +// TestCleanupCorrectKind tests that the cleanup function removes the correct +// kind (POSIX or non-POSIX) of regular expression from the correct backing maps. +func TestCleanupCorrectKind(t *testing.T) { + pattern := ".*" + // Compile a POSIX and a non-POSIX regular expression + posixRegexp, err := CompilePOSIXInterned(pattern) + require.NoError(t, err) + nonPosixRegexp, err := CompileInterned(pattern) + require.NoError(t, err) + + // Ensure they are different pointers + require.NotEqual(t, posixRegexp, nonPosixRegexp) + + // Manually run the cleanup function for the POSIX regular expression + cleanupCollectedPointers(posixWeakMap[pattern], posixWeakMap) + + // Ensure that the POSIX regular expression was removed from the maps + l.Lock() + require.Len(t, posixWeakMap, 0) + require.Len(t, reverseMap, 1) + require.Len(t, weakMap, 1) + l.Unlock() + + // Compile a new POSIX regular expression with the same pattern + posixRegexp, err = CompilePOSIXInterned(pattern) + require.NoError(t, err) + + l.Lock() + require.Len(t, posixWeakMap, 1) + require.Len(t, reverseMap, 2) + require.Len(t, weakMap, 1) + l.Unlock() + + // Manually run the cleanup function for the non-POSIX regular expression + cleanupCollectedPointers(weakMap[pattern], weakMap) + l.Lock() + require.Len(t, weakMap, 0) + require.Len(t, reverseMap, 1) + require.Len(t, posixWeakMap, 1) + l.Unlock() +} + func BenchmarkRegexps(b *testing.B) { s := make([]*regexp.Regexp, b.N) for i := 0; i < b.N; i++ {