-
Notifications
You must be signed in to change notification settings - Fork 5
perf: aggregate parallelism cap + 2 GB Lambda for collect refresh (followup #269) #270
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
cristim
merged 3 commits into
feat/multicloud-web-frontend
from
perf/recommendations-aggregate-cap
May 4, 2026
+342
−30
Merged
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,87 @@ | ||
| // Package concurrency provides a shared global parallelism cap for the | ||
| // recommendations-collection fan-out tree. | ||
| // | ||
| // The fan-out has up to four nested levels (provider → account → service|region | ||
| // → per-region service). Each level was independently capped, so peak goroutine | ||
| // counts multiplied through the tree (3 providers × 20 accounts × 30 regions × | ||
| // 2 services = thousands of in-flight gRPC/HTTP clients). On a 512 MB Lambda | ||
| // that exhausted memory before the work could finish. | ||
| // | ||
| // A single semaphore stashed on the context lets every leaf goroutine — the | ||
| // goroutine that issues the actual cloud-API call — acquire one slot before | ||
| // doing IO and release it after, so the aggregate concurrent IO count is hard- | ||
| // bounded regardless of nesting depth. Intermediate dispatchers (provider, | ||
| // account, GCP region) do NOT acquire — they only launch sub-goroutines — so | ||
| // no goroutine can deadlock by holding a permit while waiting for sub-permits. | ||
| // | ||
| // If no semaphore is attached to the context (e.g. unit tests, ambient calls | ||
| // from CLI tools), Acquire and Release are no-ops; callers don't need to | ||
| // branch on whether the semaphore is set. | ||
| package concurrency | ||
|
|
||
| import ( | ||
| "context" | ||
| "os" | ||
| "strconv" | ||
|
|
||
| "golang.org/x/sync/semaphore" | ||
| ) | ||
|
|
||
// DefaultMaxParallelism is the default cap on aggregate concurrent leaf
// goroutines across the recommendations-collection fan-out tree. Override at
// runtime with CUDLY_MAX_PARALLELISM.
const DefaultMaxParallelism = 20

// MaxParallelismFromEnv reads CUDLY_MAX_PARALLELISM and returns its
// positive-integer value, falling back to DefaultMaxParallelism on unset /
// invalid / non-positive values.
func MaxParallelismFromEnv() int {
	raw, ok := os.LookupEnv("CUDLY_MAX_PARALLELISM")
	if !ok {
		return DefaultMaxParallelism
	}
	// An empty, non-numeric, zero, or negative value is ignored: the cap
	// must be a positive integer or the default wins.
	n, err := strconv.Atoi(raw)
	if err != nil || n <= 0 {
		return DefaultMaxParallelism
	}
	return n
}
|
|
||
| type ctxKey struct{} | ||
|
|
||
| // WithSharedSemaphore returns a context carrying sem. Goroutines spawned from | ||
| // this context (or any descendant) can acquire/release slots via Acquire and | ||
| // Release. If sem is nil the context is returned unchanged. | ||
| func WithSharedSemaphore(ctx context.Context, sem *semaphore.Weighted) context.Context { | ||
| if sem == nil { | ||
| return ctx | ||
| } | ||
| return context.WithValue(ctx, ctxKey{}, sem) | ||
| } | ||
|
|
||
| // SharedSemaphore returns the semaphore stashed in ctx, or nil if none. | ||
| func SharedSemaphore(ctx context.Context) *semaphore.Weighted { | ||
| sem, _ := ctx.Value(ctxKey{}).(*semaphore.Weighted) | ||
| return sem | ||
| } | ||
|
|
||
| // Acquire blocks until a slot is available on the shared semaphore in ctx and | ||
| // returns nil. Returns ctx.Err() if the wait is cancelled. If no semaphore is | ||
| // attached to ctx, Acquire is a no-op and returns nil immediately — leaf | ||
| // callers can use it unconditionally without checking. | ||
| func Acquire(ctx context.Context) error { | ||
| sem := SharedSemaphore(ctx) | ||
| if sem == nil { | ||
| return nil | ||
| } | ||
| return sem.Acquire(ctx, 1) | ||
| } | ||
|
|
||
| // Release returns one slot to the shared semaphore in ctx. Always pair with a | ||
| // successful Acquire (return value nil); calling Release after a cancelled | ||
| // Acquire would corrupt the slot count. If no semaphore is attached to ctx, | ||
| // Release is a no-op. | ||
| func Release(ctx context.Context) { | ||
| sem := SharedSemaphore(ctx) | ||
| if sem == nil { | ||
| return | ||
| } | ||
| sem.Release(1) | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,136 @@ | ||
| package concurrency | ||
|
|
||
| import ( | ||
| "context" | ||
| "os" | ||
| "sync" | ||
| "sync/atomic" | ||
| "testing" | ||
| "time" | ||
|
|
||
| "github.com/stretchr/testify/assert" | ||
| "github.com/stretchr/testify/require" | ||
| "golang.org/x/sync/semaphore" | ||
| ) | ||
|
|
||
| // TestMaxParallelismFromEnv pins the env-knob parser semantics for | ||
| // CUDLY_MAX_PARALLELISM. | ||
| func TestMaxParallelismFromEnv(t *testing.T) { | ||
| cases := []struct { | ||
| name string | ||
| env string | ||
| want int | ||
| }{ | ||
| {"unset returns default", "", DefaultMaxParallelism}, | ||
| {"positive integer overrides", "50", 50}, | ||
| {"non-numeric falls back to default", "many", DefaultMaxParallelism}, | ||
| {"zero falls back to default", "0", DefaultMaxParallelism}, | ||
| {"negative falls back to default", "-3", DefaultMaxParallelism}, | ||
| } | ||
| for _, tc := range cases { | ||
| t.Run(tc.name, func(t *testing.T) { | ||
| t.Setenv("CUDLY_MAX_PARALLELISM", tc.env) | ||
| assert.Equal(t, tc.want, MaxParallelismFromEnv()) | ||
| }) | ||
| } | ||
|
|
||
| t.Run("explicit unset returns default", func(t *testing.T) { | ||
| os.Unsetenv("CUDLY_MAX_PARALLELISM") | ||
| assert.Equal(t, DefaultMaxParallelism, MaxParallelismFromEnv()) | ||
| }) | ||
| } | ||
|
|
||
| // TestSharedSemaphore_NoSemaphoreOnContext verifies Acquire/Release are | ||
| // no-ops when no semaphore is attached — the documented contract that lets | ||
| // CLI tools and unit tests skip the semaphore entirely without per-call | ||
| // branching. | ||
| func TestSharedSemaphore_NoSemaphoreOnContext(t *testing.T) { | ||
| ctx := context.Background() | ||
| assert.Nil(t, SharedSemaphore(ctx)) | ||
| require.NoError(t, Acquire(ctx)) | ||
| Release(ctx) // must not panic | ||
| } | ||
|
|
||
| // TestSharedSemaphore_WithNilSemaphore verifies WithSharedSemaphore returns | ||
| // the input ctx unchanged when sem is nil — defensive against accidental | ||
| // nil passes. | ||
| func TestSharedSemaphore_WithNilSemaphore(t *testing.T) { | ||
| ctx := context.Background() | ||
| assert.Equal(t, ctx, WithSharedSemaphore(ctx, nil)) | ||
| } | ||
|
|
||
| // TestSharedSemaphore_BoundsConcurrency is the load-bearing contract test: | ||
| // with a cap of 3, 20 goroutines all calling Acquire/work/Release must | ||
| // never see more than 3 in-flight concurrently. Asserts peak concurrency | ||
| // observed via atomics. | ||
| func TestSharedSemaphore_BoundsConcurrency(t *testing.T) { | ||
| const cap = 3 | ||
| const goroutines = 20 | ||
| sem := semaphore.NewWeighted(cap) | ||
| ctx := WithSharedSemaphore(context.Background(), sem) | ||
|
|
||
| var inflight, peak atomic.Int32 | ||
| updatePeak := func(cur int32) { | ||
| for { | ||
| p := peak.Load() | ||
| if cur <= p || peak.CompareAndSwap(p, cur) { | ||
| return | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Workers must never call require.* / FailNow on a non-test goroutine — | ||
| // testify's contract is that those land on the test's own goroutine | ||
| // (otherwise the failure mechanism uses runtime.Goexit on the worker | ||
| // instead of stopping the test, which can hang or skip cleanup). Each | ||
| // worker captures its Acquire result on a buffered channel and the main | ||
| // goroutine asserts after wg.Wait(). Release is only deferred on a | ||
| // successful Acquire — the documented pairing contract. | ||
| var wg sync.WaitGroup | ||
| errCh := make(chan error, goroutines) | ||
| for i := 0; i < goroutines; i++ { | ||
| wg.Add(1) | ||
| go func() { | ||
| defer wg.Done() | ||
| if err := Acquire(ctx); err != nil { | ||
| errCh <- err | ||
| return | ||
| } | ||
| defer Release(ctx) | ||
| errCh <- nil | ||
| cur := inflight.Add(1) | ||
| updatePeak(cur) | ||
| time.Sleep(2 * time.Millisecond) // make overlap observable | ||
| inflight.Add(-1) | ||
| }() | ||
| } | ||
| wg.Wait() | ||
| close(errCh) | ||
| for err := range errCh { | ||
| require.NoError(t, err) | ||
| } | ||
|
|
||
| assert.LessOrEqual(t, peak.Load(), int32(cap), | ||
| "peak concurrent in-flight goroutines must not exceed semaphore cap") | ||
| assert.GreaterOrEqual(t, peak.Load(), int32(2), | ||
| "with %d goroutines and cap %d, peak should reach at least 2 (proves goroutines genuinely overlapped)", | ||
| goroutines, cap) | ||
| } | ||
|
|
||
| // TestSharedSemaphore_AcquireRespectsCancellation verifies Acquire returns | ||
| // ctx.Err() when the parent ctx is cancelled while waiting for a slot. | ||
| // Without this, a cancelled refresh would leak a goroutine parked | ||
| // indefinitely on Acquire. | ||
| func TestSharedSemaphore_AcquireRespectsCancellation(t *testing.T) { | ||
| sem := semaphore.NewWeighted(1) | ||
| // Pre-occupy the only slot so the second Acquire must wait. | ||
| require.NoError(t, sem.Acquire(context.Background(), 1)) | ||
| defer sem.Release(1) | ||
|
|
||
| ctx, cancel := context.WithCancel(WithSharedSemaphore(context.Background(), sem)) | ||
| cancel() // cancel before Acquire even starts | ||
|
|
||
| err := Acquire(ctx) | ||
| require.Error(t, err) | ||
| assert.ErrorIs(t, err, context.Canceled) | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.