diff --git a/providers/azure/go.mod b/providers/azure/go.mod index e223e198..3c0e05fd 100644 --- a/providers/azure/go.mod +++ b/providers/azure/go.mod @@ -1,8 +1,6 @@ module github.com/LeanerCloud/CUDly/providers/azure -go 1.23.0 - -toolchain go1.24.4 +go 1.25.0 require ( github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.1 @@ -18,6 +16,7 @@ require ( github.com/LeanerCloud/CUDly/pkg v0.0.0 github.com/google/uuid v1.6.0 github.com/stretchr/testify v1.11.1 + golang.org/x/sync v0.20.0 ) require ( diff --git a/providers/azure/go.sum b/providers/azure/go.sum index fb36bd85..1a0d2760 100644 --- a/providers/azure/go.sum +++ b/providers/azure/go.sum @@ -72,6 +72,8 @@ golang.org/x/crypto v0.40.0 h1:r4x+VvoG5Fm+eJcxMaY8CQM7Lb0l1lsmjGBQ6s8BfKM= golang.org/x/crypto v0.40.0/go.mod h1:Qr1vMER5WyS2dfPHAlsOj01wgLbsyWtFn/aY+5+ZdxY= golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs= golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= diff --git a/providers/azure/recommendations.go b/providers/azure/recommendations.go index e53c6184..ffb773d9 100644 --- a/providers/azure/recommendations.go +++ b/providers/azure/recommendations.go @@ -9,11 +9,13 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/advisor/armadvisor" + "golang.org/x/sync/errgroup" "github.com/LeanerCloud/CUDly/pkg/common" "github.com/LeanerCloud/CUDly/pkg/logging" "github.com/LeanerCloud/CUDly/providers/azure/services/cache" "github.com/LeanerCloud/CUDly/providers/azure/services/compute" + "github.com/LeanerCloud/CUDly/providers/azure/services/cosmosdb" "github.com/LeanerCloud/CUDly/providers/azure/services/database" ) @@ -55,53 +57,116 @@ func NewRecommendationsClientAdapter(cred azcore.TokenCredential, subscriptionID // blank on the client — converters must populate Region from the response data // (see known_issues/10_azure_provider.md CRITICAL "Recommendation converters // ignore the API response entirely" for the matching converter work). +// +// All five service calls run concurrently under errgroup. Each goroutine captures +// its own error and returns nil to the group so that a single service failure +// does not cancel sibling calls. Results are appended in a deterministic order +// (compute → database → cache → cosmosdb → advisor) after all goroutines finish. func (r *RecommendationsClientAdapter) GetRecommendations(ctx context.Context, params common.RecommendationParams) ([]common.Recommendation, error) { - allRecommendations := make([]common.Recommendation, 0) + var ( + computeRecs, dbRecs, cacheRecs, cosmosRecs, advisorRecs []common.Recommendation + computeErr, dbErr, cacheErr, cosmosErr, advisorErr error + ) + + g, gctx := errgroup.WithContext(ctx) // Compute (VM) recommendations — subscription-wide. if shouldIncludeService(params, common.ServiceCompute) { - computeClient := compute.NewClient(r.cred, r.subscriptionID, "") - computeRecs, err := computeClient.GetRecommendations(ctx, params) - if err != nil { - logging.Warnf("Azure compute recommendations: %v", err) - } else { - allRecommendations = append(allRecommendations, computeRecs...) - } + g.Go(func() error { + computeClient := compute.NewClient(r.cred, r.subscriptionID, "") + computeRecs, computeErr = computeClient.GetRecommendations(gctx, params) + return nil // error isolation: never propagate to errgroup + }) } // Database (SQL) recommendations — subscription-wide. if shouldIncludeService(params, common.ServiceRelationalDB) { - dbClient := database.NewClient(r.cred, r.subscriptionID, "") - dbRecs, err := dbClient.GetRecommendations(ctx, params) - if err != nil { - logging.Warnf("Azure database recommendations: %v", err) - } else { - allRecommendations = append(allRecommendations, dbRecs...) - } + g.Go(func() error { + dbClient := database.NewClient(r.cred, r.subscriptionID, "") + dbRecs, dbErr = dbClient.GetRecommendations(gctx, params) + return nil + }) } // Cache (Redis) recommendations — subscription-wide. if shouldIncludeService(params, common.ServiceCache) { - cacheClient := cache.NewClient(r.cred, r.subscriptionID, "") - cacheRecs, err := cacheClient.GetRecommendations(ctx, params) - if err != nil { - logging.Warnf("Azure cache recommendations: %v", err) - } else { - allRecommendations = append(allRecommendations, cacheRecs...) - } + g.Go(func() error { + cacheClient := cache.NewClient(r.cred, r.subscriptionID, "") + cacheRecs, cacheErr = cacheClient.GetRecommendations(gctx, params) + return nil + }) + } + + // CosmosDB (NoSQL) recommendations — subscription-wide. + if shouldIncludeService(params, common.ServiceNoSQL) { + g.Go(func() error { + cosmosClient := cosmosdb.NewClient(r.cred, r.subscriptionID, "") + cosmosRecs, cosmosErr = cosmosClient.GetRecommendations(gctx, params) + return nil + }) } // Azure Advisor adds cross-cutting cost recommendations independent of the // per-service Reservation API. Failures here are non-fatal — the per-service // results above are still useful on their own. - advisorRecs, err := r.getAdvisorRecommendations(ctx, params) - if err != nil { - logging.Errorf("Failed to get Azure Advisor recommendations: %v", err) - } else { - allRecommendations = append(allRecommendations, advisorRecs...) + g.Go(func() error { + advisorRecs, advisorErr = r.getAdvisorRecommendations(gctx, params) + return nil + }) + + // Wait for all goroutines. g.Wait() always returns nil because every + // goroutine returns nil — errors are captured in per-service variables + // above. After Wait, propagate ctx cancellation so callers can distinguish + // "all five services completed (with possibly per-service errors)" from + // "the parent ctx was canceled mid-fan-out". Without this check the + // CHECK could swallow a deadline exceeded that the caller expected to + // see. + _ = g.Wait() + if err := ctx.Err(); err != nil { + return nil, err } - return allRecommendations, nil + return mergeServiceResults(serviceResult{"compute", computeRecs, computeErr}, + serviceResult{"database", dbRecs, dbErr}, + serviceResult{"cache", cacheRecs, cacheErr}, + serviceResult{"cosmosdb", cosmosRecs, cosmosErr}, + serviceResult{"advisor", advisorRecs, advisorErr}), nil +} + +// serviceResult bundles a per-service collection outcome for the deterministic +// merge in mergeServiceResults. Extracted into a helper so GetRecommendations +// stays under the cyclomatic-complexity gate after the post-Wait ctx.Err() +// propagation was added. +type serviceResult struct { + name string + recs []common.Recommendation + err error +} + +// mergeServiceResults logs per-service errors (matches the previous sequential +// behaviour where each error was logged inline via logging.Warnf) and appends +// successful results in the order the slice is passed — callers must preserve +// the canonical compute → database → cache → cosmosdb → advisor order so that +// order-sensitive consumers remain stable. The advisor entry's error is logged +// via logging.Errorf to match the pre-parallelisation severity. +func mergeServiceResults(results ...serviceResult) []common.Recommendation { + total := 0 + for _, r := range results { + total += len(r.recs) + } + out := make([]common.Recommendation, 0, total) + for _, r := range results { + if r.err != nil { + if r.name == "advisor" { + logging.Errorf("Failed to get Azure Advisor recommendations: %v", r.err) + } else { + logging.Warnf("Azure %s recommendations: %v", r.name, r.err) + } + continue + } + out = append(out, r.recs...) + } + return out } // GetRecommendationsForService retrieves Azure reservation recommendations for a specific service diff --git a/providers/azure/recommendations_test.go b/providers/azure/recommendations_test.go index 2cf52403..b2eb251d 100644 --- a/providers/azure/recommendations_test.go +++ b/providers/azure/recommendations_test.go @@ -216,6 +216,37 @@ func TestRecommendationsClientAdapter_GetAllRecommendations(t *testing.T) { _, _ = adapter.GetAllRecommendations(context.Background()) } +// TestRecommendationsClientAdapter_GetRecommendations_PropagatesContextCancellation +// pins the contract that GetRecommendations propagates ctx.Err() to its caller +// after the errgroup Wait() — the parent context being cancelled or its +// deadline exceeding must surface as an error rather than being swallowed by +// the per-service error-isolation goroutines (which all return nil to the +// errgroup so a single per-service failure does not cancel siblings). +// +// Without the explicit `if err := ctx.Err(); err != nil { return nil, err }` +// after `g.Wait()`, callers that wrap GetRecommendations with a deadline could +// see "all services finished cleanly" even when the deadline expired +// mid-fan-out (because every goroutine returned nil from its closure). +func TestRecommendationsClientAdapter_GetRecommendations_PropagatesContextCancellation(t *testing.T) { + adapter := &RecommendationsClientAdapter{ + cred: &mockAzureTokenCredential{}, + subscriptionID: "test-subscription", + } + + // Cancel the context BEFORE the call so we don't depend on race-y timing + // inside the SDK clients. The Azure clients constructed inside the + // goroutines will observe the cancelled gctx (derived from the parent ctx + // via errgroup.WithContext) and either short-circuit or return cancelled + // errors; either way, our post-Wait ctx.Err() check returns context.Canceled. + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + _, err := adapter.GetRecommendations(ctx, common.RecommendationParams{}) + require.Error(t, err, "expected context.Canceled to propagate from GetRecommendations") + assert.ErrorIs(t, err, context.Canceled, + "GetRecommendations must propagate the parent ctx error after g.Wait()") +} + func TestExtractServiceType(t *testing.T) { tests := []struct { name string