-
Notifications
You must be signed in to change notification settings - Fork 5
perf(azure): parallelize 5 sequential service calls in GetRecommendations (closes #258) #259
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,11 +9,13 @@ import ( | |
|
|
||
| "github.com/Azure/azure-sdk-for-go/sdk/azcore" | ||
| "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/advisor/armadvisor" | ||
| "golang.org/x/sync/errgroup" | ||
|
|
||
| "github.com/LeanerCloud/CUDly/pkg/common" | ||
| "github.com/LeanerCloud/CUDly/pkg/logging" | ||
| "github.com/LeanerCloud/CUDly/providers/azure/services/cache" | ||
| "github.com/LeanerCloud/CUDly/providers/azure/services/compute" | ||
| "github.com/LeanerCloud/CUDly/providers/azure/services/cosmosdb" | ||
| "github.com/LeanerCloud/CUDly/providers/azure/services/database" | ||
| ) | ||
|
|
||
|
|
@@ -55,53 +57,116 @@ func NewRecommendationsClientAdapter(cred azcore.TokenCredential, subscriptionID | |
| // blank on the client — converters must populate Region from the response data | ||
| // (see known_issues/10_azure_provider.md CRITICAL "Recommendation converters | ||
| // ignore the API response entirely" for the matching converter work). | ||
| // | ||
| // All five service calls run concurrently under errgroup. Each goroutine captures | ||
| // its own error and returns nil to the group so that a single service failure | ||
| // does not cancel sibling calls. Results are appended in a deterministic order | ||
| // (compute → database → cache → cosmosdb → advisor) after all goroutines finish. | ||
| func (r *RecommendationsClientAdapter) GetRecommendations(ctx context.Context, params common.RecommendationParams) ([]common.Recommendation, error) { | ||
| allRecommendations := make([]common.Recommendation, 0) | ||
| var ( | ||
| computeRecs, dbRecs, cacheRecs, cosmosRecs, advisorRecs []common.Recommendation | ||
| computeErr, dbErr, cacheErr, cosmosErr, advisorErr error | ||
| ) | ||
|
|
||
| g, gctx := errgroup.WithContext(ctx) | ||
|
|
||
| // Compute (VM) recommendations — subscription-wide. | ||
| if shouldIncludeService(params, common.ServiceCompute) { | ||
| computeClient := compute.NewClient(r.cred, r.subscriptionID, "") | ||
| computeRecs, err := computeClient.GetRecommendations(ctx, params) | ||
| if err != nil { | ||
| logging.Warnf("Azure compute recommendations: %v", err) | ||
| } else { | ||
| allRecommendations = append(allRecommendations, computeRecs...) | ||
| } | ||
| g.Go(func() error { | ||
| computeClient := compute.NewClient(r.cred, r.subscriptionID, "") | ||
| computeRecs, computeErr = computeClient.GetRecommendations(gctx, params) | ||
| return nil // error isolation: never propagate to errgroup | ||
| }) | ||
| } | ||
|
|
||
| // Database (SQL) recommendations — subscription-wide. | ||
| if shouldIncludeService(params, common.ServiceRelationalDB) { | ||
| dbClient := database.NewClient(r.cred, r.subscriptionID, "") | ||
| dbRecs, err := dbClient.GetRecommendations(ctx, params) | ||
| if err != nil { | ||
| logging.Warnf("Azure database recommendations: %v", err) | ||
| } else { | ||
| allRecommendations = append(allRecommendations, dbRecs...) | ||
| } | ||
| g.Go(func() error { | ||
| dbClient := database.NewClient(r.cred, r.subscriptionID, "") | ||
| dbRecs, dbErr = dbClient.GetRecommendations(gctx, params) | ||
| return nil | ||
| }) | ||
| } | ||
|
|
||
| // Cache (Redis) recommendations — subscription-wide. | ||
| if shouldIncludeService(params, common.ServiceCache) { | ||
| cacheClient := cache.NewClient(r.cred, r.subscriptionID, "") | ||
| cacheRecs, err := cacheClient.GetRecommendations(ctx, params) | ||
| if err != nil { | ||
| logging.Warnf("Azure cache recommendations: %v", err) | ||
| } else { | ||
| allRecommendations = append(allRecommendations, cacheRecs...) | ||
| } | ||
| g.Go(func() error { | ||
| cacheClient := cache.NewClient(r.cred, r.subscriptionID, "") | ||
| cacheRecs, cacheErr = cacheClient.GetRecommendations(gctx, params) | ||
| return nil | ||
| }) | ||
| } | ||
|
|
||
| // CosmosDB (NoSQL) recommendations — subscription-wide. | ||
| if shouldIncludeService(params, common.ServiceNoSQL) { | ||
| g.Go(func() error { | ||
| cosmosClient := cosmosdb.NewClient(r.cred, r.subscriptionID, "") | ||
| cosmosRecs, cosmosErr = cosmosClient.GetRecommendations(gctx, params) | ||
| return nil | ||
| }) | ||
| } | ||
|
|
||
| // Azure Advisor adds cross-cutting cost recommendations independent of the | ||
| // per-service Reservation API. Failures here are non-fatal — the per-service | ||
| // results above are still useful on their own. | ||
| advisorRecs, err := r.getAdvisorRecommendations(ctx, params) | ||
| if err != nil { | ||
| logging.Errorf("Failed to get Azure Advisor recommendations: %v", err) | ||
| } else { | ||
| allRecommendations = append(allRecommendations, advisorRecs...) | ||
| g.Go(func() error { | ||
| advisorRecs, advisorErr = r.getAdvisorRecommendations(gctx, params) | ||
| return nil | ||
| }) | ||
|
|
||
| // Wait for all goroutines. g.Wait() always returns nil because every | ||
| // goroutine returns nil — errors are captured in per-service variables | ||
| // above. After Wait, propagate ctx cancellation so callers can distinguish | ||
| // "all five services completed (with possibly per-service errors)" from | ||
| // "the parent ctx was canceled mid-fan-out". Without this check the | ||
| // CHECK could swallow a deadline exceeded that the caller expected to | ||
| // see. | ||
| _ = g.Wait() | ||
|
Comment on lines
+71
to
+124
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🌐 Web query:
💡 Result: When using golang.org/x/sync/errgroup WithContext: 1) When is the derived context canceled? - The derived context returned by WithContext is canceled the first time either: a) any function passed to Go returns a non-nil error, or b) Wait returns. This cancellation happens “whichever occurs first.” 2) What does Wait return if every goroutine returns nil but the parent context is canceled? - Wait only returns the first non-nil error produced by the goroutines (g.err). If all goroutines return nil, then g.err stays nil. - Wait does still cancel the derived context when it returns (it calls g.cancel(g.err) with g.err possibly nil), but Wait’s return value remains nil. So: if every goroutine returns nil, g.Wait returns nil even if the parent context was canceled (assuming no goroutine returned a non-nil error). If you need Wait to return something like ctx.Err on parent cancellation, you must ensure at least one goroutine returns ctx.Err (or otherwise propagate it), because errgroup.Wait does not automatically return ctx.Err just because the parent context was canceled. Citations:
🏁 Script executed: wc -l providers/azure/recommendations.goRepository: LeanerCloud/CUDly Length of output: 100 🏁 Script executed: sed -n '150,165p' providers/azure/recommendations.goRepository: LeanerCloud/CUDly Length of output: 598 🏁 Script executed: sed -n '120,165p' providers/azure/recommendations.goRepository: LeanerCloud/CUDly Length of output: 1644 🏁 Script executed: sed -n '71,75p' providers/azure/recommendations.goRepository: LeanerCloud/CUDly Length of output: 234 🏁 Script executed: sed -n '50,75p' providers/azure/recommendations.goRepository: LeanerCloud/CUDly Length of output: 1589 🏁 Script executed: sed -n '115,162p' providers/azure/recommendations.goRepository: LeanerCloud/CUDly Length of output: 1751 Propagate context cancellation instead of returning success on timeout. Every goroutine returns After Suggested fix // Wait for all goroutines. g.Wait() always returns nil because every goroutine
// returns nil — errors are captured in per-service variables above.
_ = g.Wait()
// Log per-service errors (matches previous sequential behaviour). Append
// results in the same deterministic order as the original sequential code
// (compute → database → cache → cosmosdb → advisor) so that any
// order-sensitive callers remain stable.
allRecommendations := make([]common.Recommendation, 0,
len(computeRecs)+len(dbRecs)+len(cacheRecs)+len(cosmosRecs)+len(advisorRecs))
if computeErr != nil {
logging.Warnf("Azure compute recommendations: %v", computeErr)
} else {
allRecommendations = append(allRecommendations, computeRecs...)
}
if dbErr != nil {
logging.Warnf("Azure database recommendations: %v", dbErr)
} else {
allRecommendations = append(allRecommendations, dbRecs...)
}
if cacheErr != nil {
logging.Warnf("Azure cache recommendations: %v", cacheErr)
} else {
allRecommendations = append(allRecommendations, cacheRecs...)
}
if cosmosErr != nil {
logging.Warnf("Azure cosmosdb recommendations: %v", cosmosErr)
} else {
allRecommendations = append(allRecommendations, cosmosRecs...)
}
if advisorErr != nil {
logging.Errorf("Failed to get Azure Advisor recommendations: %v", advisorErr)
} else {
allRecommendations = append(allRecommendations, advisorRecs...)
}
+ if err := ctx.Err(); err != nil {
+ return allRecommendations, err
+ }
return allRecommendations, nil🤖 Prompt for AI Agents |
||
| if err := ctx.Err(); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| return allRecommendations, nil | ||
| return mergeServiceResults(serviceResult{"compute", computeRecs, computeErr}, | ||
| serviceResult{"database", dbRecs, dbErr}, | ||
| serviceResult{"cache", cacheRecs, cacheErr}, | ||
| serviceResult{"cosmosdb", cosmosRecs, cosmosErr}, | ||
| serviceResult{"advisor", advisorRecs, advisorErr}), nil | ||
| } | ||
|
|
||
| // serviceResult bundles a per-service collection outcome for the deterministic | ||
| // merge in mergeServiceResults. Extracted into a helper so GetRecommendations | ||
| // stays under the cyclomatic-complexity gate after the post-Wait ctx.Err() | ||
| // propagation was added. | ||
| type serviceResult struct { | ||
| name string | ||
| recs []common.Recommendation | ||
| err error | ||
| } | ||
|
|
||
| // mergeServiceResults logs per-service errors (matches the previous sequential | ||
| // behaviour where each error was logged inline via logging.Warnf) and appends | ||
| // successful results in the order the slice is passed — callers must preserve | ||
| // the canonical compute → database → cache → cosmosdb → advisor order so that | ||
| // order-sensitive consumers remain stable. The advisor entry's error is logged | ||
| // via logging.Errorf to match the pre-parallelisation severity. | ||
| func mergeServiceResults(results ...serviceResult) []common.Recommendation { | ||
| total := 0 | ||
| for _, r := range results { | ||
| total += len(r.recs) | ||
| } | ||
| out := make([]common.Recommendation, 0, total) | ||
| for _, r := range results { | ||
| if r.err != nil { | ||
| if r.name == "advisor" { | ||
| logging.Errorf("Failed to get Azure Advisor recommendations: %v", r.err) | ||
| } else { | ||
| logging.Warnf("Azure %s recommendations: %v", r.name, r.err) | ||
| } | ||
| continue | ||
| } | ||
| out = append(out, r.recs...) | ||
| } | ||
| return out | ||
| } | ||
|
|
||
| // GetRecommendationsForService retrieves Azure reservation recommendations for a specific service | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion | 🟠 Major | 🏗️ Heavy lift
Please pin the latency win with a concurrency regression test.
This refactor’s main value is wall-clock behavior, but nothing in the diff guards against accidental re-serialization later. A fake-client test/benchmark that sleeps per service and asserts total duration stays near the max single-call latency, while result order remains deterministic, would make this change much safer to maintain.
Also applies to: 65-159
🤖 Prompt for AI Agents