Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions providers/azure/go.mod
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
module github.com/LeanerCloud/CUDly/providers/azure

go 1.23.0

toolchain go1.24.4
go 1.25.0

require (
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.1
Expand All @@ -18,6 +16,7 @@ require (
github.com/LeanerCloud/CUDly/pkg v0.0.0
github.com/google/uuid v1.6.0
github.com/stretchr/testify v1.11.1
golang.org/x/sync v0.20.0
)

require (
Expand Down
2 changes: 2 additions & 0 deletions providers/azure/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ golang.org/x/crypto v0.40.0 h1:r4x+VvoG5Fm+eJcxMaY8CQM7Lb0l1lsmjGBQ6s8BfKM=
golang.org/x/crypto v0.40.0/go.mod h1:Qr1vMER5WyS2dfPHAlsOj01wgLbsyWtFn/aY+5+ZdxY=
golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs=
golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8=
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA=
golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
Expand Down
121 changes: 93 additions & 28 deletions providers/azure/recommendations.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ import (

"github.com/Azure/azure-sdk-for-go/sdk/azcore"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/advisor/armadvisor"
"golang.org/x/sync/errgroup"

"github.com/LeanerCloud/CUDly/pkg/common"
"github.com/LeanerCloud/CUDly/pkg/logging"
"github.com/LeanerCloud/CUDly/providers/azure/services/cache"
"github.com/LeanerCloud/CUDly/providers/azure/services/compute"
"github.com/LeanerCloud/CUDly/providers/azure/services/cosmosdb"
"github.com/LeanerCloud/CUDly/providers/azure/services/database"
)

Expand Down Expand Up @@ -55,53 +57,116 @@ func NewRecommendationsClientAdapter(cred azcore.TokenCredential, subscriptionID
// blank on the client — converters must populate Region from the response data
// (see known_issues/10_azure_provider.md CRITICAL "Recommendation converters
// ignore the API response entirely" for the matching converter work).
//
// All five service calls run concurrently under errgroup. Each goroutine captures
// its own error and returns nil to the group so that a single service failure
// does not cancel sibling calls. Results are appended in a deterministic order
// (compute → database → cache → cosmosdb → advisor) after all goroutines finish.
Comment on lines +61 to +64
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major | 🏗️ Heavy lift

Please pin the latency win with a concurrency regression test.

This refactor’s main value is wall-clock behavior, but nothing in the diff guards against accidental re-serialization later. A fake-client test/benchmark that sleeps per service and asserts total duration stays near the max single-call latency, while result order remains deterministic, would make this change much safer to maintain.

Also applies to: 65-159

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@providers/azure/recommendations.go` around lines 61 - 64, Add a concurrency
regression test/benchmark that uses a fake client for each service (compute,
database, cache, cosmosdb, advisor) where each fake sleeps a configurable
duration; call the recommendations orchestration (the code path that uses
errgroup and launches goroutines) and assert the total elapsed time is
approximately the max single-client sleep (not the sum) to ensure calls run
concurrently, and also assert the returned results preserve the deterministic
order (compute → database → cache → cosmosdb → advisor); make this a fast unit
test with small sleeps and a benchmark variant to detect regressions.

func (r *RecommendationsClientAdapter) GetRecommendations(ctx context.Context, params common.RecommendationParams) ([]common.Recommendation, error) {
allRecommendations := make([]common.Recommendation, 0)
var (
computeRecs, dbRecs, cacheRecs, cosmosRecs, advisorRecs []common.Recommendation
computeErr, dbErr, cacheErr, cosmosErr, advisorErr error
)

g, gctx := errgroup.WithContext(ctx)

// Compute (VM) recommendations — subscription-wide.
if shouldIncludeService(params, common.ServiceCompute) {
computeClient := compute.NewClient(r.cred, r.subscriptionID, "")
computeRecs, err := computeClient.GetRecommendations(ctx, params)
if err != nil {
logging.Warnf("Azure compute recommendations: %v", err)
} else {
allRecommendations = append(allRecommendations, computeRecs...)
}
g.Go(func() error {
computeClient := compute.NewClient(r.cred, r.subscriptionID, "")
computeRecs, computeErr = computeClient.GetRecommendations(gctx, params)
return nil // error isolation: never propagate to errgroup
})
}

// Database (SQL) recommendations — subscription-wide.
if shouldIncludeService(params, common.ServiceRelationalDB) {
dbClient := database.NewClient(r.cred, r.subscriptionID, "")
dbRecs, err := dbClient.GetRecommendations(ctx, params)
if err != nil {
logging.Warnf("Azure database recommendations: %v", err)
} else {
allRecommendations = append(allRecommendations, dbRecs...)
}
g.Go(func() error {
dbClient := database.NewClient(r.cred, r.subscriptionID, "")
dbRecs, dbErr = dbClient.GetRecommendations(gctx, params)
return nil
})
}

// Cache (Redis) recommendations — subscription-wide.
if shouldIncludeService(params, common.ServiceCache) {
cacheClient := cache.NewClient(r.cred, r.subscriptionID, "")
cacheRecs, err := cacheClient.GetRecommendations(ctx, params)
if err != nil {
logging.Warnf("Azure cache recommendations: %v", err)
} else {
allRecommendations = append(allRecommendations, cacheRecs...)
}
g.Go(func() error {
cacheClient := cache.NewClient(r.cred, r.subscriptionID, "")
cacheRecs, cacheErr = cacheClient.GetRecommendations(gctx, params)
return nil
})
}

// CosmosDB (NoSQL) recommendations — subscription-wide.
if shouldIncludeService(params, common.ServiceNoSQL) {
g.Go(func() error {
cosmosClient := cosmosdb.NewClient(r.cred, r.subscriptionID, "")
cosmosRecs, cosmosErr = cosmosClient.GetRecommendations(gctx, params)
return nil
})
}

// Azure Advisor adds cross-cutting cost recommendations independent of the
// per-service Reservation API. Failures here are non-fatal — the per-service
// results above are still useful on their own.
advisorRecs, err := r.getAdvisorRecommendations(ctx, params)
if err != nil {
logging.Errorf("Failed to get Azure Advisor recommendations: %v", err)
} else {
allRecommendations = append(allRecommendations, advisorRecs...)
g.Go(func() error {
advisorRecs, advisorErr = r.getAdvisorRecommendations(gctx, params)
return nil
})

// Wait for all goroutines. g.Wait() always returns nil because every
// goroutine returns nil — errors are captured in per-service variables
// above. After Wait, propagate ctx cancellation so callers can distinguish
// "all five services completed (with possibly per-service errors)" from
// "the parent ctx was canceled mid-fan-out". Without this check the
// CHECK could swallow a deadline exceeded that the caller expected to
// see.
_ = g.Wait()
Comment on lines +71 to +124
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🌐 Web query:

golang.org/x/sync/errgroup WithContext documentation: when is the derived context canceled, and what does Wait return if every goroutine returns nil but the parent context is canceled?

💡 Result:

When using golang.org/x/sync/errgroup WithContext: 1) When is the derived context canceled? - The derived context returned by WithContext is canceled the first time either: a) any function passed to Go returns a non-nil error, or b) Wait returns. This cancellation happens “whichever occurs first.” 2) What does Wait return if every goroutine returns nil but the parent context is canceled? - Wait only returns the first non-nil error produced by the goroutines (g.err). If all goroutines return nil, then g.err stays nil. - Wait does still cancel the derived context when it returns (it calls g.cancel(g.err) with g.err possibly nil), but Wait’s return value remains nil. So: if every goroutine returns nil, g.Wait returns nil even if the parent context was canceled (assuming no goroutine returned a non-nil error). If you need Wait to return something like ctx.Err on parent cancellation, you must ensure at least one goroutine returns ctx.Err (or otherwise propagate it), because errgroup.Wait does not automatically return ctx.Err just because the parent context was canceled.

Citations:


🏁 Script executed:

wc -l providers/azure/recommendations.go

Repository: LeanerCloud/CUDly

Length of output: 100


🏁 Script executed:

sed -n '150,165p' providers/azure/recommendations.go

Repository: LeanerCloud/CUDly

Length of output: 598


🏁 Script executed:

sed -n '120,165p' providers/azure/recommendations.go

Repository: LeanerCloud/CUDly

Length of output: 1644


🏁 Script executed:

sed -n '71,75p' providers/azure/recommendations.go

Repository: LeanerCloud/CUDly

Length of output: 234


🏁 Script executed:

sed -n '50,75p' providers/azure/recommendations.go

Repository: LeanerCloud/CUDly

Length of output: 1589


🏁 Script executed:

sed -n '115,162p' providers/azure/recommendations.go

Repository: LeanerCloud/CUDly

Length of output: 1751


Propagate context cancellation instead of returning success on timeout.

Every goroutine returns nil, so g.Wait() cannot surface context.Canceled or context.DeadlineExceeded. If the parent context times out before all results are collected, line 158 still returns nil, allowing callers to treat an empty or partial result set as a clean success.

After g.Wait() at line 115, check ctx.Err() to propagate parent cancellation while preserving the per-service error logging behavior.

Suggested fix
 	// Wait for all goroutines. g.Wait() always returns nil because every goroutine
 	// returns nil — errors are captured in per-service variables above.
 	_ = g.Wait()

 	// Log per-service errors (matches previous sequential behaviour). Append
 	// results in the same deterministic order as the original sequential code
 	// (compute → database → cache → cosmosdb → advisor) so that any
 	// order-sensitive callers remain stable.
 	allRecommendations := make([]common.Recommendation, 0,
 		len(computeRecs)+len(dbRecs)+len(cacheRecs)+len(cosmosRecs)+len(advisorRecs))

 	if computeErr != nil {
 		logging.Warnf("Azure compute recommendations: %v", computeErr)
 	} else {
 		allRecommendations = append(allRecommendations, computeRecs...)
 	}

 	if dbErr != nil {
 		logging.Warnf("Azure database recommendations: %v", dbErr)
 	} else {
 		allRecommendations = append(allRecommendations, dbRecs...)
 	}

 	if cacheErr != nil {
 		logging.Warnf("Azure cache recommendations: %v", cacheErr)
 	} else {
 		allRecommendations = append(allRecommendations, cacheRecs...)
 	}

 	if cosmosErr != nil {
 		logging.Warnf("Azure cosmosdb recommendations: %v", cosmosErr)
 	} else {
 		allRecommendations = append(allRecommendations, cosmosRecs...)
 	}

 	if advisorErr != nil {
 		logging.Errorf("Failed to get Azure Advisor recommendations: %v", advisorErr)
 	} else {
 		allRecommendations = append(allRecommendations, advisorRecs...)
 	}

+	if err := ctx.Err(); err != nil {
+		return allRecommendations, err
+	}
 	return allRecommendations, nil
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@providers/azure/recommendations.go` around lines 71 - 119, The goroutines
swallow context cancellation because every g.Go returns nil and the code
discards g.Wait(); change the final wait handling so that after g.Wait() you
check ctx.Err() (or gctx.Err()) and propagate it if non-nil. Concretely, replace
"_ = g.Wait()" with err := g.Wait(); if err != nil { return err } and/or if
ctx.Err() != nil { return ctx.Err() } (preserving the per-service error
variables and logging) so that context.Canceled or context.DeadlineExceeded from
gctx/ctx is returned to callers; keep each anonymous goroutine bodies (where
computeClient.GetRecommendations, databaseClient.GetRecommendations,
cacheClient.GetRecommendations, cosmosdb.NewClient, r.getAdvisorRecommendations,
etc.) returning nil to retain per-service error isolation.

if err := ctx.Err(); err != nil {
return nil, err
}

return allRecommendations, nil
return mergeServiceResults(serviceResult{"compute", computeRecs, computeErr},
serviceResult{"database", dbRecs, dbErr},
serviceResult{"cache", cacheRecs, cacheErr},
serviceResult{"cosmosdb", cosmosRecs, cosmosErr},
serviceResult{"advisor", advisorRecs, advisorErr}), nil
}

// serviceResult bundles a per-service collection outcome for the deterministic
// merge in mergeServiceResults. Extracted into a helper so GetRecommendations
// stays under the cyclomatic-complexity gate after the post-Wait ctx.Err()
// propagation was added.
type serviceResult struct {
name string
recs []common.Recommendation
err error
}

// mergeServiceResults logs per-service errors (matches the previous sequential
// behaviour where each error was logged inline via logging.Warnf) and appends
// successful results in the order the slice is passed — callers must preserve
// the canonical compute → database → cache → cosmosdb → advisor order so that
// order-sensitive consumers remain stable. The advisor entry's error is logged
// via logging.Errorf to match the pre-parallelisation severity.
func mergeServiceResults(results ...serviceResult) []common.Recommendation {
total := 0
for _, r := range results {
total += len(r.recs)
}
out := make([]common.Recommendation, 0, total)
for _, r := range results {
if r.err != nil {
if r.name == "advisor" {
logging.Errorf("Failed to get Azure Advisor recommendations: %v", r.err)
} else {
logging.Warnf("Azure %s recommendations: %v", r.name, r.err)
}
continue
}
out = append(out, r.recs...)
}
return out
}

// GetRecommendationsForService retrieves Azure reservation recommendations for a specific service
Expand Down
31 changes: 31 additions & 0 deletions providers/azure/recommendations_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,37 @@ func TestRecommendationsClientAdapter_GetAllRecommendations(t *testing.T) {
_, _ = adapter.GetAllRecommendations(context.Background())
}

// TestRecommendationsClientAdapter_GetRecommendations_PropagatesContextCancellation
// pins the contract that GetRecommendations propagates ctx.Err() to its caller
// after the errgroup Wait() — the parent context being cancelled or its
// deadline exceeding must surface as an error rather than being swallowed by
// the per-service error-isolation goroutines (which all return nil to the
// errgroup so a single per-service failure does not cancel siblings).
//
// Without the explicit `if err := ctx.Err(); err != nil { return nil, err }`
// after `g.Wait()`, callers that wrap GetRecommendations with a deadline could
// see "all services finished cleanly" even when the deadline expired
// mid-fan-out (because every goroutine returned nil from its closure).
func TestRecommendationsClientAdapter_GetRecommendations_PropagatesContextCancellation(t *testing.T) {
adapter := &RecommendationsClientAdapter{
cred: &mockAzureTokenCredential{},
subscriptionID: "test-subscription",
}

// Cancel the context BEFORE the call so we don't depend on race-y timing
// inside the SDK clients. The Azure clients constructed inside the
// goroutines will observe the cancelled gctx (derived from the parent ctx
// via errgroup.WithContext) and either short-circuit or return cancelled
// errors; either way, our post-Wait ctx.Err() check returns context.Canceled.
ctx, cancel := context.WithCancel(context.Background())
cancel()

_, err := adapter.GetRecommendations(ctx, common.RecommendationParams{})
require.Error(t, err, "expected context.Canceled to propagate from GetRecommendations")
assert.ErrorIs(t, err, context.Canceled,
"GetRecommendations must propagate the parent ctx error after g.Wait()")
}

func TestExtractServiceType(t *testing.T) {
tests := []struct {
name string
Expand Down