From 46206445b464125ec2e4f1055fb50654a7cd9bf6 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Sat, 25 Apr 2026 15:22:41 +0200 Subject: [PATCH 01/12] perf(exchange): rec-driven cross-family alternatives via cached Cost Explorer recommendations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reshape page used to enumerate cross-family alternatives via a hand-curated `peerFamilyGroups` allowlist plus a per-recommendation `DescribeReservedInstancesOfferings` API call. The recommendations table already caches AWS Cost Explorer's `GetReservationPurchaseRecommendation` output on every scheduler tick — pair underutilised convertibles against that cache instead of fanning out N×M offering enumerations on every page load. What got deleted from `pkg/exchange/reshape.go`: - `peerFamilyGroups` (hand-curated map; new families were invisible until somebody updated the file by hand) - `candidateFamilies()` and `alternativesForTarget()` helpers - `OfferingLookup` (per-rec lookup signature) - `AnalyzeReshapingWithOfferings()` and `distinctCandidateTypes()` / `fillAlternativesFromOfferings()` What replaces them: - `PurchaseRecLookup func(ctx, region, currencyCode) ([]OfferingOption, error)` - `AnalyzeReshapingWithRecs(ctx, ris, util, threshold, region, currency, lookup)`: ONE region-scoped lookup, then per-rec the offerings are filtered to cross-family options that pass the existing `passesDollarUnitsCheck` gate (unchanged) and sorted ascending by effective monthly cost. New `internal/api/exchange_lookup.go` adapter `purchaseRecLookupFromStore(store, accountID)`: - Calls `store.ListStoredRecommendations(ctx, RecommendationFilter{Provider: "aws", Service: "ec2", Region: region, AccountIDs: [accountID]})`. - Maps each `RecommendationRecord` → `OfferingOption` with effective monthly = `UpfrontCost / (Term × 12) + MonthlyCost`. - Filters by source CloudAccount UUID (resolved from STS account ID → CloudAccount.ExternalID) so reshape can't surface another tenant's recs. Empty UUID = ambient-credentials degraded mode. `providers/aws/services/ec2/client.go::FindConvertibleOfferings` deleted along with its dedicated test file (no remaining callers). `internal/server/handler_ri_exchange.go::convertForAutoExchange` unchanged — auto.go uses the base `AnalyzeReshaping` (no alternatives, primary target only) so the auto-exchange pipeline's contract is preserved. Tests: - `pkg/exchange/reshape_crossfamily_test.go` rewritten: - Removed `TestCandidateFamilies_*` (function gone). - Removed base-path cross-family assertion (alternatives now belong exclusively to the WithRecs path). - Added `TestAnalyzeReshaping_BaseEmitsNoAlternatives` regression. - Migrated all `TestAnalyzeReshapingWithOfferings_*` → `TestAnalyzeReshapingWithRecs_*` (recommendation-driven, cold-cache, dollar-units filter, lookup error, primary-target exclusion, legacy m4 family). - New `internal/api/exchange_lookup_test.go`: region/account/empty/no-recs filter coverage + UpfrontCost amortisation mapping. - Integration test `internal/api/handler_ri_exchange_integration_test.go` switched to seeding recs into the test container's recommendations table instead of mocking `FindConvertibleOfferings`. Closes #40 --- internal/api/exchange_lookup.go | 129 +++++ internal/api/exchange_lookup_test.go | 175 +++++++ internal/api/handler_ri_exchange.go | 42 +- .../handler_ri_exchange_integration_test.go | 120 ++--- pkg/exchange/reshape.go | 289 ++++------- pkg/exchange/reshape_crossfamily_test.go | 477 ++++++++---------- providers/aws/services/ec2/client.go | 89 ---- .../aws/services/ec2/client_offerings_test.go | 112 ---- 8 files changed, 690 insertions(+), 743 deletions(-) create mode 100644 internal/api/exchange_lookup.go create mode 100644 internal/api/exchange_lookup_test.go delete mode 100644 providers/aws/services/ec2/client_offerings_test.go diff --git a/internal/api/exchange_lookup.go b/internal/api/exchange_lookup.go new file mode 100644 index 00000000..0a413cae --- /dev/null +++ b/internal/api/exchange_lookup.go @@ -0,0 +1,129 @@ +// Package api provides the HTTP API handlers for the CUDly dashboard. +package api + +import ( + "context" + "strings" + + "github.com/LeanerCloud/CUDly/internal/config" + "github.com/LeanerCloud/CUDly/pkg/exchange" +) + +// recsLister is the narrow slice of config.StoreInterface that the +// reshape lookup needs. Scoped here so the closure stays unit-testable +// against a tiny fake instead of the full StoreInterface mock. +type recsLister interface { + ListStoredRecommendations(ctx context.Context, filter config.RecommendationFilter) ([]config.RecommendationRecord, error) +} + +// purchaseRecLookupFromStore builds an exchange.PurchaseRecLookup that +// reads the cached AWS Cost Explorer purchase recommendations out of +// Postgres for a given (region, currency) pair. Each +// RecommendationRecord is mapped to an OfferingOption with: +// +// - InstanceType = rec.ResourceType (e.g. "m6i.large") +// - OfferingID = rec.ID (UUID; the UI uses this as a stable handle) +// - EffectiveMonthlyCost = (UpfrontCost / termMonths) + MonthlyCost +// - NormalizationFactor = exchange.NormalizationFactorForSize(size) +// - CurrencyCode = currencyCode (propagated; recs don't carry it) +// +// Where termMonths = rec.Term × 12 (rec.Term is in years, AWS standard +// for RIs / Savings Plans). Term ≤ 0 means we can't amortise upfront, +// so we fall back to MonthlyCost alone — the dollar-units check will +// then accept or reject based on monthly recurring vs. source. +// +// AccountID scoping (cross-account leak guard): when accountID is +// non-empty we restrict the query to that single CloudAccount UUID so +// reshape can't surface another tenant's recs. Empty accountID means +// "no filter" — used for ambient-credentials deployments where the +// caller couldn't (or chose not to) resolve the source account. +func purchaseRecLookupFromStore(store recsLister, accountID string) exchange.PurchaseRecLookup { + return func(ctx context.Context, region, currencyCode string) ([]exchange.OfferingOption, error) { + filter := config.RecommendationFilter{ + Provider: "aws", + Service: "ec2", + Region: region, + } + if accountID != "" { + filter.AccountIDs = []string{accountID} + } + recs, err := store.ListStoredRecommendations(ctx, filter) + if err != nil { + return nil, err + } + out := make([]exchange.OfferingOption, 0, len(recs)) + for _, rec := range recs { + out = append(out, recommendationToOffering(rec, currencyCode)) + } + return out, nil + } +} + +// recommendationToOffering maps a single cached Cost Explorer purchase +// recommendation to the OfferingOption shape the reshape layer +// consumes. Pulled out so the mapping can be unit-tested in isolation +// (no DB / no ctx required). +func recommendationToOffering(rec config.RecommendationRecord, currencyCode string) exchange.OfferingOption { + monthly := rec.MonthlyCost + if rec.Term > 0 { + // rec.Term is in years; canonical AWS RI/SP amortisation uses + // 12 months per year regardless of leap years. + termMonths := float64(rec.Term * 12) + if termMonths > 0 { + monthly += rec.UpfrontCost / termMonths + } + } + _, size := splitInstanceType(rec.ResourceType) + return exchange.OfferingOption{ + InstanceType: rec.ResourceType, + OfferingID: rec.ID, + EffectiveMonthlyCost: monthly, + NormalizationFactor: exchange.NormalizationFactorForSize(size), + CurrencyCode: currencyCode, + } +} + +// splitInstanceType splits "m5.xlarge" into ("m5", "xlarge"). Returns +// empty strings if the format is unrecognized. Mirrors the helper in +// pkg/exchange/reshape.go but kept local to avoid exporting a +// general-purpose parser the caller doesn't need. +func splitInstanceType(instanceType string) (family, size string) { + parts := strings.SplitN(instanceType, ".", 2) + if len(parts) != 2 { + return "", "" + } + return parts[0], parts[1] +} + +// resolveAWSCloudAccountID maps the running AWS account ID (raw, from +// STS) to the registered CloudAccount UUID so the reshape lookup can +// scope its query against the correct row in the recommendations +// table. Returns "" when: +// - no AWS account ID could be resolved (STS denied / ambient creds +// not available); +// - no CloudAccount row exists with that ExternalID (the operator +// hasn't registered this account yet — common on first-run / dev). +// +// Empty UUID propagates to purchaseRecLookupFromStore as "skip the +// AccountIDs filter", which causes the lookup to return whatever recs +// exist in the region. That's intentional: a deployment with one +// ambient-credentials account and no registered CloudAccounts should +// still see alternatives, not a permanently empty list. Once the +// operator registers the account the filter engages automatically. +func (h *Handler) resolveAWSCloudAccountID(ctx context.Context) string { + awsAccountID := h.resolveAWSAccountID(ctx) + if awsAccountID == "" { + return "" + } + provider := "aws" + accounts, err := h.config.ListCloudAccounts(ctx, config.CloudAccountFilter{Provider: &provider}) + if err != nil || len(accounts) == 0 { + return "" + } + for _, a := range accounts { + if a.ExternalID == awsAccountID { + return a.ID + } + } + return "" +} diff --git a/internal/api/exchange_lookup_test.go b/internal/api/exchange_lookup_test.go new file mode 100644 index 00000000..463d5871 --- /dev/null +++ b/internal/api/exchange_lookup_test.go @@ -0,0 +1,175 @@ +package api + +import ( + "context" + "errors" + "testing" + + "github.com/LeanerCloud/CUDly/internal/config" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// fakeRecsLister captures the filter passed to ListStoredRecommendations +// so tests can assert region / account / provider scoping landed in the +// SQL query. Returns a configurable result set or error. +type fakeRecsLister struct { + gotFilter config.RecommendationFilter + calls int + out []config.RecommendationRecord + err error +} + +func (f *fakeRecsLister) ListStoredRecommendations(_ context.Context, filter config.RecommendationFilter) ([]config.RecommendationRecord, error) { + f.calls++ + f.gotFilter = filter + return f.out, f.err +} + +// TestPurchaseRecLookupFromStore_RegionFilter pins that the closure +// pushes the requested region into the SQL filter so Postgres prunes +// rows by region rather than the Go layer doing it after the fact. +func TestPurchaseRecLookupFromStore_RegionFilter(t *testing.T) { + t.Parallel() + store := &fakeRecsLister{} + lookup := purchaseRecLookupFromStore(store, "") + _, err := lookup(context.Background(), "eu-west-1", "USD") + require.NoError(t, err) + assert.Equal(t, 1, store.calls, "lookup must invoke the store exactly once") + assert.Equal(t, "aws", store.gotFilter.Provider, "must scope to AWS recs") + assert.Equal(t, "ec2", store.gotFilter.Service, "must scope to EC2 recs (no RDS / opensearch leakage)") + assert.Equal(t, "eu-west-1", store.gotFilter.Region, "region must thread through to SQL") +} + +// TestPurchaseRecLookupFromStore_AccountFilter pins the cross-account +// leak guard: when an account UUID is supplied, the filter restricts +// the query to that single account so the reshape page can't surface +// another tenant's recommendations. +func TestPurchaseRecLookupFromStore_AccountFilter(t *testing.T) { + t.Parallel() + store := &fakeRecsLister{} + lookup := purchaseRecLookupFromStore(store, "acct-uuid-123") + _, err := lookup(context.Background(), "us-east-1", "USD") + require.NoError(t, err) + require.Len(t, store.gotFilter.AccountIDs, 1, "non-empty account UUID must populate AccountIDs filter") + assert.Equal(t, "acct-uuid-123", store.gotFilter.AccountIDs[0]) +} + +// TestPurchaseRecLookupFromStore_NoAccountFilterWhenEmpty pins the +// degraded-mode contract: when the caller can't resolve an account +// UUID (ambient credentials, account not registered yet), the lookup +// returns whatever recs exist in the region rather than blanking the +// page. The operator can register the account later to engage scoping. +func TestPurchaseRecLookupFromStore_NoAccountFilterWhenEmpty(t *testing.T) { + t.Parallel() + store := &fakeRecsLister{} + lookup := purchaseRecLookupFromStore(store, "") + _, err := lookup(context.Background(), "us-east-1", "USD") + require.NoError(t, err) + assert.Empty(t, store.gotFilter.AccountIDs, "empty account UUID must NOT add an AccountIDs filter") +} + +// TestPurchaseRecLookupFromStore_NoRecsReturnsEmpty pins the +// cold-cache contract: zero recs in the region → empty slice (not +// nil-error). The downstream AnalyzeReshapingWithRecs treats an empty +// slice the same as "no alternatives, primary target intact". +func TestPurchaseRecLookupFromStore_NoRecsReturnsEmpty(t *testing.T) { + t.Parallel() + store := &fakeRecsLister{out: nil} + lookup := purchaseRecLookupFromStore(store, "") + got, err := lookup(context.Background(), "us-east-1", "USD") + require.NoError(t, err) + assert.Empty(t, got, "empty recs → empty offerings, no error") +} + +// TestPurchaseRecLookupFromStore_StoreErrorPropagates pins the error +// path: an underlying SQL failure surfaces back to the caller. +// AnalyzeReshapingWithRecs handles the error by falling back to base +// recs (graceful degradation), so the closure just needs to forward +// the error verbatim. +func TestPurchaseRecLookupFromStore_StoreErrorPropagates(t *testing.T) { + t.Parallel() + store := &fakeRecsLister{err: errors.New("postgres timeout")} + lookup := purchaseRecLookupFromStore(store, "") + _, err := lookup(context.Background(), "us-east-1", "USD") + require.Error(t, err, "store errors must propagate so the caller can fall back") +} + +// TestPurchaseRecLookupFromStore_MapsFields pins the +// RecommendationRecord → OfferingOption mapping shape so the dollar- +// units pre-filter and the UI both see consistent data: +// - InstanceType comes from ResourceType. +// - OfferingID = rec.ID (stable handle). +// - EffectiveMonthlyCost = UpfrontCost / (Term * 12) + MonthlyCost. +// - NormalizationFactor resolved from the size (here "large" → 4). +// - CurrencyCode propagated from the lookup's currencyCode arg. +func TestPurchaseRecLookupFromStore_MapsFields(t *testing.T) { + t.Parallel() + store := &fakeRecsLister{ + out: []config.RecommendationRecord{ + { + ID: "rec-1", + Provider: "aws", + Service: "ec2", + Region: "us-east-1", + ResourceType: "m6i.large", + Term: 1, // 1 year term + UpfrontCost: 120, // 120 / 12 = 10/mo amortised + MonthlyCost: 20, // + 20/mo recurring = 30 + }, + { + // Term=0 → no upfront amortisation; effective = MonthlyCost only. + ID: "rec-2", + Provider: "aws", + Service: "ec2", + Region: "us-east-1", + ResourceType: "c5.xlarge", + Term: 0, + UpfrontCost: 500, // ignored when Term == 0 + MonthlyCost: 50, + }, + }, + } + lookup := purchaseRecLookupFromStore(store, "") + got, err := lookup(context.Background(), "us-east-1", "EUR") + require.NoError(t, err) + require.Len(t, got, 2) + + assert.Equal(t, "m6i.large", got[0].InstanceType) + assert.Equal(t, "rec-1", got[0].OfferingID) + assert.InDelta(t, 30.0, got[0].EffectiveMonthlyCost, 0.001, + "upfront 120 over 12 months = 10/mo + recurring 20/mo = 30/mo") + assert.InDelta(t, 4.0, got[0].NormalizationFactor, 0.001, "large → NF 4") + assert.Equal(t, "EUR", got[0].CurrencyCode, "currency must be propagated from caller") + + assert.Equal(t, "c5.xlarge", got[1].InstanceType) + assert.InDelta(t, 50.0, got[1].EffectiveMonthlyCost, 0.001, + "Term==0 means upfront cannot be amortised; fall back to MonthlyCost") + assert.InDelta(t, 8.0, got[1].NormalizationFactor, 0.001, "xlarge → NF 8") +} + +// TestSplitInstanceType pins the local instance-type parser used by +// the mapping helper. Mirrors the pkg/exchange parser to avoid +// exporting a general-purpose helper this package doesn't need. +func TestSplitInstanceType(t *testing.T) { + t.Parallel() + cases := []struct { + in string + wantFamily string + wantSize string + }{ + {"m5.large", "m5", "large"}, + {"m7g.metal", "m7g", "metal"}, + {"r6i.16xlarge", "r6i", "16xlarge"}, + {"", "", ""}, + {"malformed", "", ""}, + } + for _, c := range cases { + t.Run(c.in, func(t *testing.T) { + t.Parallel() + f, s := splitInstanceType(c.in) + assert.Equal(t, c.wantFamily, f) + assert.Equal(t, c.wantSize, s) + }) + } +} diff --git a/internal/api/handler_ri_exchange.go b/internal/api/handler_ri_exchange.go index 65104bf4..59e0a135 100644 --- a/internal/api/handler_ri_exchange.go +++ b/internal/api/handler_ri_exchange.go @@ -29,9 +29,13 @@ import ( // awsprovider.NewEC2ClientDirect already implements these methods // (Go structural typing), so the nil-factory fallback path casts it // directly. +// +// Cross-family alternatives no longer flow through here — they're +// sourced from the cached AWS Cost Explorer purchase recommendations +// in Postgres via purchaseRecLookupFromStore (see exchange_lookup.go), +// so the EC2 client only needs to enumerate convertible RIs. type reshapeEC2Client interface { ListConvertibleReservedInstances(ctx context.Context) ([]ec2svc.ConvertibleRI, error) - FindConvertibleOfferings(ctx context.Context, instanceTypes []string) ([]exchange.OfferingOption, error) } // reshapeRecsClient is the narrow slice of the recommendations @@ -239,19 +243,37 @@ func (h *Handler) getReshapeRecommendations(ctx context.Context, req *events.Lam } riInfos, utilInfos := convertToExchangeTypes(instances, utilData) - // Offering lookup closure uses the already-configured ec2Client, - // so no extra AWS config plumbing is needed. Errors from the - // lookup are swallowed inside AnalyzeReshapingWithOfferings — - // losing pricing chips is strictly less bad than losing the whole - // reshape page. - lookup := func(ctx context.Context, instanceTypes []string) ([]exchange.OfferingOption, error) { - return ec2Client.FindConvertibleOfferings(ctx, instanceTypes) - } - recs := exchange.AnalyzeReshapingWithOfferings(ctx, riInfos, utilInfos, threshold, lookup) + // Cross-family alternatives are sourced from the cached AWS Cost + // Explorer purchase recommendations table in Postgres — no per-rec + // DescribeReservedInstancesOfferings fan-out, no hand-curated + // peer-family allowlist. The lookup is scoped to the running AWS + // account (when registered) so a multi-tenant deployment can't + // surface another tenant's recs. Empty resolved account ID means + // "no scope filter" for ambient-credentials deployments where + // CloudAccount registration hasn't happened yet. + cloudAccountID := h.resolveAWSCloudAccountID(ctx) + currencyCode := firstNonEmptyCurrency(instances) + lookup := purchaseRecLookupFromStore(h.config, cloudAccountID) + recs := exchange.AnalyzeReshapingWithRecs(ctx, riInfos, utilInfos, threshold, region, currencyCode, lookup) return &ReshapeRecommendationsResponse{Recommendations: recs}, nil } +// firstNonEmptyCurrency returns the CurrencyCode of the first RI that +// has one set, defaulting to "USD" for legacy fixtures and the common +// case. The reshape page operates on a single AWS account at a time so +// all RIs share the same currency in practice; picking the first +// populated value is sufficient and avoids a noisy mismatch panic when +// some entries are missing the field. +func firstNonEmptyCurrency(instances []ec2svc.ConvertibleRI) string { + for _, inst := range instances { + if inst.CurrencyCode != "" { + return inst.CurrencyCode + } + } + return "USD" +} + // getExchangeQuote gets a quote for an RI exchange. func (h *Handler) getExchangeQuote(ctx context.Context, req *events.LambdaFunctionURLRequest) (any, error) { if _, err := h.requirePermission(ctx, req, "view", "purchases"); err != nil { diff --git a/internal/api/handler_ri_exchange_integration_test.go b/internal/api/handler_ri_exchange_integration_test.go index c0f6ea15..8017eb37 100644 --- a/internal/api/handler_ri_exchange_integration_test.go +++ b/internal/api/handler_ri_exchange_integration_test.go @@ -6,9 +6,9 @@ package api import ( "context" "encoding/json" - "fmt" "sync/atomic" "testing" + "time" "github.com/aws/aws-lambda-go/events" "github.com/aws/aws-sdk-go-v2/aws" @@ -16,34 +16,23 @@ import ( "github.com/LeanerCloud/CUDly/internal/config" "github.com/LeanerCloud/CUDly/internal/database/postgres/migrations" "github.com/LeanerCloud/CUDly/internal/database/postgres/testhelpers" - "github.com/LeanerCloud/CUDly/pkg/exchange" "github.com/LeanerCloud/CUDly/providers/aws/recommendations" ec2svc "github.com/LeanerCloud/CUDly/providers/aws/services/ec2" "github.com/stretchr/testify/require" ) // fakeReshapeEC2 is a stub implementation of reshapeEC2Client for the -// integration test. Returns fixed RIs + fixed offerings (or an error -// when errOnOfferings is set). Lets the test assert on what the -// reshape handler does with the AWS-returned shapes without hitting -// real AWS. +// integration test. Returns fixed convertible RIs without hitting real +// AWS. Cross-family alternatives now flow through the recommendations +// store (see purchaseRecLookupFromStore), so this fake no longer needs +// to mock FindConvertibleOfferings. type fakeReshapeEC2 struct { - instances []ec2svc.ConvertibleRI - offerings []exchange.OfferingOption - offeringsErr error - offeringsCalled atomic.Int32 + instances []ec2svc.ConvertibleRI } func (f *fakeReshapeEC2) ListConvertibleReservedInstances(_ context.Context) ([]ec2svc.ConvertibleRI, error) { return f.instances, nil } -func (f *fakeReshapeEC2) FindConvertibleOfferings(_ context.Context, _ []string) ([]exchange.OfferingOption, error) { - f.offeringsCalled.Add(1) - if f.offeringsErr != nil { - return nil, f.offeringsErr - } - return f.offerings, nil -} // fakeReshapeRecs is a stub for reshapeRecsClient. Counts calls so the // test can assert cache-hit behaviour on the second request. @@ -100,24 +89,40 @@ func reshapeRequest() *events.LambdaFunctionURLRequest { } } +// seedRecsForRegion writes a snapshot of cached Cost Explorer purchase +// recommendations directly into the store so the reshape lookup has +// something to read. Bypasses the scheduler so the test only exercises +// the read-side mapping logic. +func seedRecsForRegion(ctx context.Context, t *testing.T, store *config.PostgresStore, region string, recs []config.RecommendationRecord) { + t.Helper() + require.NoError(t, store.ReplaceRecommendations(ctx, time.Now(), recs), + "seeding recommendations into the test container failed") +} + // TestReshapeRecommendations_Integration_EndToEnd exercises the full -// path: auth → cache cold-fetch → AnalyzeReshapingWithOfferings -// pricing enrichment → JSON response. Asserts the response carries -// alternative_targets sorted ascending by effective_monthly_cost. +// path: auth → cache cold-fetch → AnalyzeReshapingWithRecs (recs-driven +// alternatives) → JSON response. Asserts the response carries +// alternative_targets sorted ascending by effective_monthly_cost, +// sourced from the recommendations table rather than per-rec AWS API +// calls. func TestReshapeRecommendations_Integration_EndToEnd(t *testing.T) { ctx := context.Background() store, cleanup := setupReshapeHandlerIntegration(ctx, t) defer cleanup() + // Seed cross-family recs in us-east-1: m5 (same family, must be + // excluded), c5, r5. The handler should surface c5 + r5 as + // cross-family alternatives ordered by EffectiveMonthlyCost. + seedRecsForRegion(ctx, t, store, "us-east-1", []config.RecommendationRecord{ + {Provider: "aws", Service: "ec2", Region: "us-east-1", ResourceType: "m5.large", Term: 1, MonthlyCost: 40}, + {Provider: "aws", Service: "ec2", Region: "us-east-1", ResourceType: "c5.large", Term: 1, MonthlyCost: 50}, + {Provider: "aws", Service: "ec2", Region: "us-east-1", ResourceType: "r5.large", Term: 1, MonthlyCost: 60}, + }) + ec2Fake := &fakeReshapeEC2{ instances: []ec2svc.ConvertibleRI{ {ReservedInstanceID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1}, }, - offerings: []exchange.OfferingOption{ - {InstanceType: "m5.large", OfferingID: "off-m5", EffectiveMonthlyCost: 40.0}, - {InstanceType: "m6i.large", OfferingID: "off-m6i", EffectiveMonthlyCost: 35.0}, - {InstanceType: "m7g.large", OfferingID: "off-m7g", EffectiveMonthlyCost: 30.0}, - }, } recsFake := &fakeReshapeRecs{ utilization: []recommendations.RIUtilization{ @@ -135,17 +140,13 @@ func TestReshapeRecommendations_Integration_EndToEnd(t *testing.T) { require.Equal(t, "ri-1", rec.SourceRIID) require.Equal(t, "m5.large", rec.TargetInstanceType) - // AnalyzeReshapingWithOfferings drops the source family (m5) from - // the alternatives, so we expect m6i.large + m7g.large only. - // Alternatives are sorted ascending by EffectiveMonthlyCost — - // cheapest first — so m7g ($30) comes before m6i ($35). - require.Len(t, rec.AlternativeTargets, 2) - require.Equal(t, "m7g.large", rec.AlternativeTargets[0].InstanceType) - require.Equal(t, "off-m7g", rec.AlternativeTargets[0].OfferingID) - require.InDelta(t, 30.0, rec.AlternativeTargets[0].EffectiveMonthlyCost, 0.001) - require.Equal(t, "m6i.large", rec.AlternativeTargets[1].InstanceType) - require.Equal(t, "off-m6i", rec.AlternativeTargets[1].OfferingID) - require.InDelta(t, 35.0, rec.AlternativeTargets[1].EffectiveMonthlyCost, 0.001) + // Same-family m5 stripped; c5 + r5 surface ascending by cost. + require.Len(t, rec.AlternativeTargets, 2, + "cross-family alternatives must come from the cached recs (m5 same-family excluded)") + require.Equal(t, "c5.large", rec.AlternativeTargets[0].InstanceType) + require.InDelta(t, 50.0, rec.AlternativeTargets[0].EffectiveMonthlyCost, 0.001) + require.Equal(t, "r5.large", rec.AlternativeTargets[1].InstanceType) + require.InDelta(t, 60.0, rec.AlternativeTargets[1].EffectiveMonthlyCost, 0.001) // Verify the RI utilization cache row landed in Postgres. entry, err := store.GetRIUtilizationCache(ctx, "us-east-1", 30) @@ -156,26 +157,28 @@ func TestReshapeRecommendations_Integration_EndToEnd(t *testing.T) { require.Len(t, cached, 1) require.Equal(t, "ri-1", cached[0].ReservedInstanceID) - // Fetcher + offerings each called exactly once on the cold path. + // Utilization fetcher called exactly once on the cold path. require.Equal(t, int32(1), recsFake.calls.Load()) - require.Equal(t, int32(1), ec2Fake.offeringsCalled.Load()) } // TestReshapeRecommendations_Integration_SecondCallHitsCache verifies // the cache short-circuits the RI utilization fetcher on a second -// request within TTL. +// request within TTL. The recs lookup is a Postgres read on every +// request (no separate cache layer); only the Cost Explorer +// utilization fetcher is TTL-cached. func TestReshapeRecommendations_Integration_SecondCallHitsCache(t *testing.T) { ctx := context.Background() store, cleanup := setupReshapeHandlerIntegration(ctx, t) defer cleanup() + seedRecsForRegion(ctx, t, store, "us-east-1", []config.RecommendationRecord{ + {Provider: "aws", Service: "ec2", Region: "us-east-1", ResourceType: "c5.large", Term: 1, MonthlyCost: 40}, + }) + ec2Fake := &fakeReshapeEC2{ instances: []ec2svc.ConvertibleRI{ {ReservedInstanceID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1}, }, - offerings: []exchange.OfferingOption{ - {InstanceType: "m5.large", OfferingID: "off-m5", EffectiveMonthlyCost: 40.0}, - }, } recsFake := &fakeReshapeRecs{ utilization: []recommendations.RIUtilization{ @@ -194,26 +197,24 @@ func TestReshapeRecommendations_Integration_SecondCallHitsCache(t *testing.T) { require.NoError(t, err) require.Equal(t, int32(1), recsFake.calls.Load(), "fresh read within soft TTL must not re-invoke the utilization fetcher") - - // Offerings lookup is NOT cached (distinct concern) — expected to - // be called on each request. - require.Equal(t, int32(2), ec2Fake.offeringsCalled.Load()) } -// TestReshapeRecommendations_Integration_OfferingsLookupErrorKeepsNameOnlyAlternatives -// verifies the graceful-degradation contract: when FindConvertibleOfferings -// returns an error, the base recs still ship with name-only -// alternatives (no OfferingID, zero cost). -func TestReshapeRecommendations_Integration_OfferingsLookupErrorKeepsNameOnlyAlternatives(t *testing.T) { +// TestReshapeRecommendations_Integration_NoCachedRecsReturnsPrimaryOnly +// — when the recommendations table is empty for the region, the rec +// still ships with its primary same-family target but with no +// alternatives. UX matches "AWS hasn't recommended anything for this +// region yet" rather than blanking the page. +func TestReshapeRecommendations_Integration_NoCachedRecsReturnsPrimaryOnly(t *testing.T) { ctx := context.Background() store, cleanup := setupReshapeHandlerIntegration(ctx, t) defer cleanup() + // Note: NO seedRecsForRegion call — table starts empty. + ec2Fake := &fakeReshapeEC2{ instances: []ec2svc.ConvertibleRI{ {ReservedInstanceID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1}, }, - offeringsErr: fmt.Errorf("simulated cost-explorer 5xx"), } recsFake := &fakeReshapeRecs{ utilization: []recommendations.RIUtilization{ @@ -223,20 +224,11 @@ func TestReshapeRecommendations_Integration_OfferingsLookupErrorKeepsNameOnlyAlt h := buildReshapeHandler(store, ec2Fake, recsFake) resp, err := h.getReshapeRecommendations(ctx, reshapeRequest()) - require.NoError(t, err, "handler must not fail when the offerings lookup errors — graceful degradation") + require.NoError(t, err, "handler must not fail when the recommendations cache is empty") body := resp.(*ReshapeRecommendationsResponse) require.Len(t, body.Recommendations, 1) rec := body.Recommendations[0] require.Equal(t, "m5.large", rec.TargetInstanceType, "primary target stays intact") - - // AnalyzeReshapingWithOfferings returns the base recs on lookup - // error, which means AlternativeTargets keeps its name-only - // entries from AnalyzeReshaping (instance_type set, offering_id - // empty, cost zero). - require.NotEmpty(t, rec.AlternativeTargets, "base recs still ship with name-only alternatives") - for _, alt := range rec.AlternativeTargets { - require.NotEmpty(t, alt.InstanceType) - require.Empty(t, alt.OfferingID, "lookup error leaves offering_id blank") - require.Zero(t, alt.EffectiveMonthlyCost, "lookup error leaves cost zero") - } + require.Empty(t, rec.AlternativeTargets, + "empty cache → no alternatives, primary target still surfaced") } diff --git a/pkg/exchange/reshape.go b/pkg/exchange/reshape.go index d945b2c5..22f3546c 100644 --- a/pkg/exchange/reshape.go +++ b/pkg/exchange/reshape.go @@ -41,8 +41,8 @@ type UtilizationInfo struct { // reshape layer: the AWS offering ID, the instance type it provisions, // and the effective monthly cost (amortised fixed price + recurring // hourly charges × 730 hours). Used both as the return shape of -// OfferingLookup (what an AWS-provider closure returns) and as the -// element type of ReshapeRecommendation.AlternativeTargets (what +// PurchaseRecLookup (what the recommendations-store closure returns) and +// as the element type of ReshapeRecommendation.AlternativeTargets (what // reshape emits to the HTTP handler). Kept in pkg/exchange so // pkg/exchange stays AWS-free and providers/aws/services/ec2 imports // the type rather than defining its own. @@ -53,9 +53,9 @@ type OfferingOption struct { // NormalizationFactor is the AWS normalization factor for the // offering's instance size — needed by the local // passesDollarUnitsCheck pre-filter that gates which cross-family - // alternatives surface in the UI. Populated by FindConvertibleOfferings; - // zero on missing means the alternative cannot be safely compared - // and the pre-filter excludes it. + // alternatives surface in the UI. Zero on missing means the + // alternative cannot be safely compared and the pre-filter + // excludes it. NormalizationFactor float64 `json:"normalization_factor,omitempty"` // CurrencyCode is the ISO-4217 currency the EffectiveMonthlyCost is // denominated in (typically "USD"). The pre-filter requires source @@ -64,23 +64,31 @@ type OfferingOption struct { CurrencyCode string `json:"currency_code,omitempty"` } -// OfferingLookup is the signature of the closure that resolves -// candidate instance types into concrete offerings with pricing. Used -// by AnalyzeReshapingWithOfferings. Implementations batch the request -// into a single DescribeReservedInstancesOfferings call per peer-family -// group so the N instance-types → N API calls fan-out is avoided. -type OfferingLookup func(ctx context.Context, instanceTypes []string) ([]OfferingOption, error) +// PurchaseRecLookup is the signature of the closure that resolves a +// region + currency to the cached AWS Cost Explorer purchase +// recommendations available there. Used by AnalyzeReshapingWithRecs to +// generate cross-family alternatives without per-recommendation AWS API +// calls — the recommendations table is already populated by the +// scheduler tick. Implementations read from store.ListStoredRecommendations +// and map each RecommendationRecord to an OfferingOption (effective +// monthly cost amortised from the upfront and monthly fields). +// +// region is the AWS region the reshape page is viewing; currencyCode is +// the source RIs' currency, propagated onto each returned +// OfferingOption.CurrencyCode so the dollar-units pre-filter can refuse +// cross-currency comparisons cleanly. +type PurchaseRecLookup func(ctx context.Context, region, currencyCode string) ([]OfferingOption, error) // ReshapeRecommendation describes a suggested exchange for an underutilized RI. // -// AlternativeTargets lists cross-family options within the same -// use-case group (general-purpose / compute / memory / burstable) at -// the same target size, enriched with real offering IDs and monthly -// cost when a pricing lookup is available. This is advisory data for -// the UI to surface alongside the primary target; the auto-exchange +// AlternativeTargets lists cross-family options enriched with real +// offering IDs and monthly cost from cached AWS Cost Explorer purchase +// recommendations (see AnalyzeReshapingWithRecs). This is advisory data +// for the UI to surface alongside the primary target; the auto-exchange // pipeline still acts on TargetInstanceType only so existing automated -// behaviour is unchanged. Emitted when the primary target's family -// belongs to a known peer group; empty otherwise. +// behaviour is unchanged. Empty when the base AnalyzeReshaping is used +// directly (auto.go) or when no cached recommendations exist for the +// region. type ReshapeRecommendation struct { SourceRIID string `json:"source_ri_id"` SourceInstanceType string `json:"source_instance_type"` @@ -94,63 +102,6 @@ type ReshapeRecommendation struct { Reason string `json:"reason"` } -// peerFamilyGroups maps each family in the allowlist to the full set -// of peer families within its use-case group. AWS Convertible RIs can -// cross families when the target's $-value units (from -// DescribeReservedInstancesOfferings) match — and the most common -// viable cross-family moves are between same-generation same-use-case -// siblings (e.g. m5 ↔ m6i, c5 ↔ c7g). Suggesting these to the user -// broadens their options without pushing them into shapes that will -// fail the AWS exchange-units check at quote time. -// -// Specialty (GPU / HPC) and legacy-generation families are now also -// included; the new local passesDollarUnitsCheck pre-filter (applied -// in fillAlternativesFromOfferings) drops any alternative whose -// (NormalizationFactor × EffectiveMonthlyCost) wouldn't survive AWS's -// runtime exchange-validity rule, so suggestions stay actionable -// without a per-pair GetReservedInstancesExchangeQuote API call. -// False positives still go through the existing auto.go IsValidExchange -// skip path at execution time. -var peerFamilyGroups = map[string][]string{ - // general-purpose - "m5": {"m5", "m6i", "m7g"}, - "m6i": {"m5", "m6i", "m7g"}, - "m7g": {"m5", "m6i", "m7g"}, - // compute-optimised - "c5": {"c5", "c6i", "c7g"}, - "c6i": {"c5", "c6i", "c7g"}, - "c7g": {"c5", "c6i", "c7g"}, - // memory-optimised - "r5": {"r5", "r6i", "r7g"}, - "r6i": {"r5", "r6i", "r7g"}, - "r7g": {"r5", "r6i", "r7g"}, - // burstable (maps to itself — generation variants are - // typically distinct enough that AWS won't let you exchange - // across them; listed to keep the helper returning a sensible - // result rather than nil for t-family RIs). - "t3": {"t3", "t3a", "t4g"}, - "t3a": {"t3", "t3a", "t4g"}, - "t4g": {"t3", "t3a", "t4g"}, - // Specialty: GPU / inference / HPC. Cross-family within each - // group; the $-units check below filters unviable pairs. - "p3": {"p3", "p4d", "p5"}, - "p4d": {"p3", "p4d", "p5"}, - "p5": {"p3", "p4d", "p5"}, - "g4dn": {"g4dn", "g5"}, - "g5": {"g4dn", "g5"}, - "hpc6a": {"hpc6a", "hpc6id", "hpc7g"}, - "hpc6id": {"hpc6a", "hpc6id", "hpc7g"}, - "hpc7g": {"hpc6a", "hpc6id", "hpc7g"}, - // Legacy generations — useful when operators want to migrate - // off a legacy family to the current generation without - // changing shape entirely. The $-units check makes most legacy - // → current moves viable for the same size. - "m4": {"m4", "m5"}, - "c4": {"c4", "c5"}, - "r3": {"r3", "r4", "r5"}, - "r4": {"r3", "r4", "r5"}, -} - // passesDollarUnitsCheck approximates AWS's exchange-validity rule // locally: a target offering passes if (target.NF × target.EMC) >= // (srcNF × srcMonthlyCost). AWS's actual rule is two parallel @@ -179,14 +130,6 @@ func passesDollarUnitsCheck(srcNF, srcMonthlyCost float64, srcCurrency string, t return target.NormalizationFactor*target.EffectiveMonthlyCost >= srcNF*srcMonthlyCost } -// candidateFamilies returns the peer families (including sourceFamily -// itself) within the same use-case group, or nil when the family is -// not in the allowlist. Callers surface the returned families to users -// as cross-family alternatives. -func candidateFamilies(sourceFamily string) []string { - return peerFamilyGroups[strings.ToLower(sourceFamily)] -} - // normalizationFactors maps EC2 instance sizes to their AWS normalization factors. // See: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ri-modification-instancemove.html var normalizationFactors = map[string]float64{ @@ -239,6 +182,10 @@ func NormalizationFactorForSize(size string) float64 { // // threshold is a percentage (0–100) below which an RI is considered underutilized. // For example, threshold=95 means RIs with <95% utilization get recommendations. +// +// Recommendations are emitted with empty AlternativeTargets — alternatives +// are populated only by AnalyzeReshapingWithRecs, which pairs each rec +// against the cached AWS Cost Explorer recommendations. func AnalyzeReshaping(ris []RIInfo, utilization []UtilizationInfo, threshold float64) []ReshapeRecommendation { utilMap := make(map[string]float64, len(utilization)) for _, u := range utilization { @@ -316,12 +263,14 @@ func analyzeRI(ri RIInfo, utilMap map[string]float64, threshold float64) *Reshap } return &ReshapeRecommendation{ - SourceRIID: ri.ID, - SourceInstanceType: ri.InstanceType, - SourceCount: ri.InstanceCount, - TargetInstanceType: targetInstanceType, - TargetCount: targetCount, - AlternativeTargets: alternativesForTarget(family, targetSize), + SourceRIID: ri.ID, + SourceInstanceType: ri.InstanceType, + SourceCount: ri.InstanceCount, + TargetInstanceType: targetInstanceType, + TargetCount: targetCount, + // AlternativeTargets stays nil here; AnalyzeReshapingWithRecs + // fills it from the cached AWS Cost Explorer recommendations. + AlternativeTargets: nil, UtilizationPercent: util, NormalizedUsed: normalizedUsed, NormalizedPurchased: normalizedPurchased, @@ -334,68 +283,48 @@ func analyzeRI(ri RIInfo, utilMap map[string]float64, threshold float64) *Reshap } } -// alternativesForTarget returns cross-family equivalents at the same -// target size for the families in sourceFamily's peer group. The -// source family itself is excluded (that's the primary -// TargetInstanceType already). Returns nil when sourceFamily isn't in -// the allowlist or has no peers. +// AnalyzeReshapingWithRecs is AnalyzeReshaping plus cross-family +// alternatives sourced from the cached AWS Cost Explorer purchase +// recommendations table. It calls the base analyzer, then makes a +// SINGLE region-scoped lookup and pairs each rec against the returned +// offerings — no per-recommendation AWS API call, no hand-curated +// peer-family allowlist. // -// Pricing fields (OfferingID, EffectiveMonthlyCost) are left empty -// here. AnalyzeReshapingWithOfferings backfills them via an injected -// lookup; callers without a lookup (auto.go, no-pricing tests) see -// name-only entries. -func alternativesForTarget(sourceFamily, targetSize string) []OfferingOption { - peers := candidateFamilies(sourceFamily) - if len(peers) <= 1 { - return nil - } - out := make([]OfferingOption, 0, len(peers)-1) - for _, p := range peers { - if strings.EqualFold(p, sourceFamily) { - continue - } - out = append(out, OfferingOption{InstanceType: p + "." + targetSize}) - } - return out -} - -// AnalyzeReshapingWithOfferings is AnalyzeReshaping + offering -// enrichment: it calls the base analyzer, batches the distinct target -// instance types (primary + alternatives) into ONE lookup call, and -// fills each rec's AlternativeTargets with real OfferingID + -// EffectiveMonthlyCost. -// -// Missing-offering behaviour: if the lookup returns no entry for a -// candidate instance type, that alternative is silently dropped from -// the rec's slice — the rec still ships with its primary target and -// the alternatives that DID resolve. If the lookup itself errors, the -// base recs are returned with empty AlternativeTargets across the -// board; the dashboard's primary reshape suggestions stay intact. +// Filtering rules per rec: +// - The source family (parsed from rec.SourceInstanceType) is excluded +// so the alternatives slice carries only cross-family options. +// - The primary target (rec.TargetInstanceType) is also excluded — it +// is already surfaced as the primary suggestion. +// - passesDollarUnitsCheck gates each surviving offering against the +// source RI's NF / MonthlyCost / CurrencyCode so the UI doesn't +// show options that would be rejected at AWS exchange time. When +// the source RI lacks pricing (MonthlyCost == 0) the gate is +// skipped for that rec to preserve today's behaviour for callers +// that don't supply pricing. // -// auto.go keeps calling the existing AnalyzeReshaping (no pricing -// needed); only the HTTP handlers that surface recommendations to -// users call the enriched version. -func AnalyzeReshapingWithOfferings( +// Missing cache, lookup error, or empty-region response: rec ships with +// empty AlternativeTargets — the dashboard's primary reshape suggestion +// stays intact and the UX matches "AWS hasn't recommended anything for +// this region yet". auto.go keeps calling AnalyzeReshaping (no +// alternatives needed); only the HTTP reshape handler calls this enriched +// version. +func AnalyzeReshapingWithRecs( ctx context.Context, ris []RIInfo, utilization []UtilizationInfo, threshold float64, - lookup OfferingLookup, + region, currencyCode string, + lookup PurchaseRecLookup, ) []ReshapeRecommendation { recs := AnalyzeReshaping(ris, utilization, threshold) if lookup == nil || len(recs) == 0 { return recs } - types := distinctCandidateTypes(recs) - if len(types) == 0 { - return recs - } - - offerings, err := lookup(ctx, types) - if err != nil { - // Fall back to name-only alternatives — losing pricing is - // strictly less bad than losing the whole reshape page. + offerings, err := lookup(ctx, region, currencyCode) + if err != nil || len(offerings) == 0 { + // Fall through to base recs — losing alternatives is strictly + // less bad than losing the whole reshape page. return recs } @@ -406,74 +335,44 @@ func AnalyzeReshapingWithOfferings( for _, r := range ris { risByID[r.ID] = r } - fillAlternativesFromOfferings(recs, offerings, risByID) + fillAlternativesFromRecs(recs, offerings, risByID) return recs } -// distinctCandidateTypes collects de-duplicated instance types from all -// recs' primary + alternative targets, sorted for deterministic -// lookups in tests. -func distinctCandidateTypes(recs []ReshapeRecommendation) []string { - want := make(map[string]struct{}) - for _, r := range recs { - if r.TargetInstanceType != "" { - want[r.TargetInstanceType] = struct{}{} - } - for _, alt := range r.AlternativeTargets { - if alt.InstanceType != "" { - want[alt.InstanceType] = struct{}{} - } - } - } - types := make([]string, 0, len(want)) - for t := range want { - types = append(types, t) - } - sort.Strings(types) - return types -} - -// fillAlternativesFromOfferings replaces each rec's AlternativeTargets -// with the matching OfferingOption from the lookup result. Missing -// instance types are silently dropped (per the doc on -// AnalyzeReshapingWithOfferings). -// -// risByID lets the helper apply the local passesDollarUnitsCheck per -// rec — alternatives that wouldn't survive AWS's runtime exchange- -// validity rule are dropped here so the UI doesn't show options that -// would fail at quote time. When the source RI isn't in the map (or -// has zero NF / MonthlyCost), the check is skipped for that rec to -// preserve today's behaviour for callers that don't supply pricing. -// -// The output is sorted ascending by EffectiveMonthlyCost so the UI -// shows cheapest alternatives first — this matches user intent (the -// primary advisory signal of this list is "is there a cheaper option -// than the primary target?") even though it differs from the peer- -// family allowlist order that the base AnalyzeReshaping emits. -func fillAlternativesFromOfferings(recs []ReshapeRecommendation, offerings []OfferingOption, risByID map[string]RIInfo) { - offByType := make(map[string]OfferingOption, len(offerings)) - for _, o := range offerings { - if _, exists := offByType[o.InstanceType]; !exists { - offByType[o.InstanceType] = o - } - } +// fillAlternativesFromRecs is the per-rec body of +// AnalyzeReshapingWithRecs: for each rec it filters the offerings list +// to cross-family options that pass the dollar-units gate, then sorts +// them ascending by EffectiveMonthlyCost so the cheapest lands first +// in the UI list. The source family and the primary target are excluded +// so the alternatives slice is meaningfully different from the primary +// suggestion. When source pricing is missing the gate is skipped for +// that rec (NF/MonthlyCost==0 case). +func fillAlternativesFromRecs(recs []ReshapeRecommendation, offerings []OfferingOption, risByID map[string]RIInfo) { for i := range recs { src, hasSrc := risByID[recs[i].SourceRIID] - filled := make([]OfferingOption, 0, len(recs[i].AlternativeTargets)) - for _, alt := range recs[i].AlternativeTargets { - found, ok := offByType[alt.InstanceType] - if !ok { + srcFamily, _ := parseInstanceType(recs[i].SourceInstanceType) + filled := make([]OfferingOption, 0, len(offerings)) + for _, off := range offerings { + // Skip same-family offerings — those collapse onto the + // primary same-family TargetInstanceType already surfaced. + offFamily, _ := parseInstanceType(off.InstanceType) + if offFamily == "" || strings.EqualFold(offFamily, srcFamily) { + continue + } + // Skip the primary target — it's not an "alternative" to + // itself. + if off.InstanceType == recs[i].TargetInstanceType { continue } // Apply the pre-filter only when source pricing info is - // available; otherwise keep today's behaviour and pass - // the alternative through untouched. + // available; otherwise keep today's behaviour and pass the + // alternative through untouched. if hasSrc && src.MonthlyCost > 0 { - if !passesDollarUnitsCheck(src.NormalizationFactor, src.MonthlyCost, src.CurrencyCode, found) { + if !passesDollarUnitsCheck(src.NormalizationFactor, src.MonthlyCost, src.CurrencyCode, off) { continue } } - filled = append(filled, found) + filled = append(filled, off) } sort.Slice(filled, func(a, b int) bool { return filled[a].EffectiveMonthlyCost < filled[b].EffectiveMonthlyCost diff --git a/pkg/exchange/reshape_crossfamily_test.go b/pkg/exchange/reshape_crossfamily_test.go index 2365ea51..306c0bd6 100644 --- a/pkg/exchange/reshape_crossfamily_test.go +++ b/pkg/exchange/reshape_crossfamily_test.go @@ -10,257 +10,284 @@ import ( "github.com/stretchr/testify/require" ) -func TestCandidateFamilies_AllowlistedFamilies(t *testing.T) { +func TestAnalyzeReshaping_BaseEmitsNoAlternatives(t *testing.T) { t.Parallel() - - cases := []struct { - source string - want []string - }{ - // General-purpose peers. - {"m5", []string{"m5", "m6i", "m7g"}}, - {"m6i", []string{"m5", "m6i", "m7g"}}, - {"m7g", []string{"m5", "m6i", "m7g"}}, - // Compute-optimised peers. - {"c5", []string{"c5", "c6i", "c7g"}}, - {"c7g", []string{"c5", "c6i", "c7g"}}, - // Memory-optimised peers. - {"r5", []string{"r5", "r6i", "r7g"}}, - {"r6i", []string{"r5", "r6i", "r7g"}}, - // Burstable peers. - {"t3", []string{"t3", "t3a", "t4g"}}, - // Case insensitivity — operators may supply uppercase. - {"M5", []string{"m5", "m6i", "m7g"}}, - } - for _, c := range cases { - got := candidateFamilies(c.source) - sort.Strings(got) - sort.Strings(c.want) - assert.Equal(t, c.want, got, "unexpected peers for %q", c.source) - } + // The base AnalyzeReshaping no longer emits cross-family + // alternatives — that responsibility now lives in + // AnalyzeReshapingWithRecs, which sources options from the cached + // AWS Cost Explorer recommendations table. The base path's + // AlternativeTargets must always be nil regardless of family so the + // auto.go pipeline (which uses the base path) keeps its current + // "primary target only" contract. + recs := AnalyzeReshaping( + []RIInfo{{ID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible"}}, + []UtilizationInfo{{RIID: "ri-1", UtilizationPercent: 50}}, + 95, + ) + require.Len(t, recs, 1) + assert.Equal(t, "m5.large", recs[0].TargetInstanceType, "primary same-family downsize stays") + assert.Nil(t, recs[0].AlternativeTargets, + "base path emits no alternatives — recs-driven path owns that surface") } -func TestCandidateFamilies_SpecialtyAndLegacyFamilies(t *testing.T) { +func TestAnalyzeReshaping_StandardRIStillSkipped(t *testing.T) { t.Parallel() - // Specialty (GPU/HPC) and legacy-generation families are now in - // the allowlist; the local passesDollarUnitsCheck pre-filter (in - // fillAlternativesFromOfferings) drops unviable cross-family - // pairs so suggestions stay actionable without per-pair AWS quote - // API calls. These groups are fixed in code; the test pins the - // exact membership. - cases := []struct { - source string - want []string - }{ - // GPU - {"p3", []string{"p3", "p4d", "p5"}}, - {"p4d", []string{"p3", "p4d", "p5"}}, - {"g4dn", []string{"g4dn", "g5"}}, - // HPC - {"hpc6a", []string{"hpc6a", "hpc6id", "hpc7g"}}, - // Legacy generations. - {"m4", []string{"m4", "m5"}}, - {"c4", []string{"c4", "c5"}}, - {"r3", []string{"r3", "r4", "r5"}}, - {"r4", []string{"r3", "r4", "r5"}}, - } - for _, c := range cases { - got := candidateFamilies(c.source) - sort.Strings(got) - sort.Strings(c.want) - assert.Equal(t, c.want, got, "unexpected peers for %q", c.source) - } + // Regression guard: the refactor must not start emitting + // recommendations for Standard RIs, which AWS forbids from + // exchanging entirely. + recs := AnalyzeReshaping( + []RIInfo{{ID: "ri-std", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "standard"}}, + []UtilizationInfo{{RIID: "ri-std", UtilizationPercent: 30}}, + 95, + ) + assert.Empty(t, recs, "standard RI must never produce a recommendation") } -func TestCandidateFamilies_UnlistedFamiliesReturnNil(t *testing.T) { +// TestAnalyzeReshapingWithRecs_RecommendationDrivenAlternatives — the +// fake lookup returns offerings spanning m5/c5/r5; an underutilised m5 +// RI should surface c5 + r5 alternatives (cross-family), with the same +// family (m5) excluded so the alternatives slice carries only options +// that differ meaningfully from the primary target. Sort order is +// ascending by EffectiveMonthlyCost — cheapest first. +func TestAnalyzeReshapingWithRecs_RecommendationDrivenAlternatives(t *testing.T) { t.Parallel() - // Truly off-allowlist families: x1 / g4 (note: g4dn IS in the - // allowlist, but plain "g4" without the dn suffix isn't), older - // HPC variants, and the empty string. These return nil — no - // cross-family suggestions surface for the underlying RI. - for _, fam := range []string{"g4", "x1", "hpc7a", ""} { - assert.Nil(t, candidateFamilies(fam), "expected nil for unlisted family %q", fam) + + var callCount int + var gotRegion, gotCurrency string + lookup := func(_ context.Context, region, currency string) ([]OfferingOption, error) { + callCount++ + gotRegion, gotCurrency = region, currency + return []OfferingOption{ + {InstanceType: "m5.large", OfferingID: "off-m5", EffectiveMonthlyCost: 40.0, NormalizationFactor: 4, CurrencyCode: "USD"}, + {InstanceType: "c5.large", OfferingID: "off-c5", EffectiveMonthlyCost: 50.0, NormalizationFactor: 4, CurrencyCode: "USD"}, + {InstanceType: "r5.large", OfferingID: "off-r5", EffectiveMonthlyCost: 60.0, NormalizationFactor: 4, CurrencyCode: "USD"}, + }, nil } -} -func TestAnalyzeReshaping_EmitsCrossFamilyAlternatives(t *testing.T) { - t.Parallel() - // m5.xlarge at 50% → primary target m5.large; alternatives - // should list m6i.large and m7g.large (same peer group, same - // target size, source family excluded). - recs := AnalyzeReshaping( - []RIInfo{{ID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible"}}, + recs := AnalyzeReshapingWithRecs( + context.Background(), + []RIInfo{{ + ID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible", + NormalizationFactor: 8, MonthlyCost: 25, CurrencyCode: "USD", // src units = 200 + }}, []UtilizationInfo{{RIID: "ri-1", UtilizationPercent: 50}}, 95, + "us-east-1", "USD", + lookup, ) - if len(recs) != 1 { - t.Fatalf("expected 1 recommendation; got %d", len(recs)) - } + require.Len(t, recs, 1) + assert.Equal(t, 1, callCount, "lookup must be called exactly once per reshape pass") + assert.Equal(t, "us-east-1", gotRegion, "region must be threaded through") + assert.Equal(t, "USD", gotCurrency, "currency must be threaded through") + got := recs[0] - assert.Equal(t, "m5.large", got.TargetInstanceType) + assert.Equal(t, "m5.large", got.TargetInstanceType, "primary downsize unchanged") gotAlts := make([]string, 0, len(got.AlternativeTargets)) for _, alt := range got.AlternativeTargets { gotAlts = append(gotAlts, alt.InstanceType) } sort.Strings(gotAlts) - assert.Equal(t, []string{"m6i.large", "m7g.large"}, gotAlts, - "peer families should be suggested at the same target size, source family excluded") - // No lookup injected at this call site → pricing fields stay zero. - for _, alt := range got.AlternativeTargets { - assert.Empty(t, alt.OfferingID, "base AnalyzeReshaping does not resolve offering IDs") - assert.Zero(t, alt.EffectiveMonthlyCost, "base AnalyzeReshaping does not resolve prices") - } + assert.Equal(t, []string{"c5.large", "r5.large"}, gotAlts, + "cross-family alternatives must surface; same-family m5 must be excluded") + // Ascending by EffectiveMonthlyCost: c5 ($50) before r5 ($60). + require.Len(t, got.AlternativeTargets, 2) + assert.Equal(t, "c5.large", got.AlternativeTargets[0].InstanceType) + assert.Equal(t, "off-c5", got.AlternativeTargets[0].OfferingID) + assert.InDelta(t, 50.0, got.AlternativeTargets[0].EffectiveMonthlyCost, 0.001) + assert.Equal(t, "r5.large", got.AlternativeTargets[1].InstanceType) + assert.InDelta(t, 60.0, got.AlternativeTargets[1].EffectiveMonthlyCost, 0.001) } -func TestAnalyzeReshaping_NoAlternativesForUnlistedFamily(t *testing.T) { +// TestAnalyzeReshapingWithRecs_EmptyLookupReturnsNoAlternatives — cold +// cache / no recs in the region: the rec ships with its primary target +// and an empty AlternativeTargets slice. UI matches "AWS hasn't +// recommended anything for this region yet". +func TestAnalyzeReshapingWithRecs_EmptyLookupReturnsNoAlternatives(t *testing.T) { t.Parallel() - // "x1" is not in the allowlist, so no cross-family alternatives - // are emitted at the base layer. We still emit the same-family - // primary reshape — name-only alternatives only get filled in - // when the family has peers in the group. - recs := AnalyzeReshaping( - []RIInfo{{ID: "ri-x", InstanceType: "x1.16xlarge", InstanceCount: 1, OfferingClass: "convertible"}}, - []UtilizationInfo{{RIID: "ri-x", UtilizationPercent: 40}}, + lookup := func(_ context.Context, _, _ string) ([]OfferingOption, error) { + return nil, nil + } + recs := AnalyzeReshapingWithRecs( + context.Background(), + []RIInfo{{ID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible"}}, + []UtilizationInfo{{RIID: "ri-1", UtilizationPercent: 50}}, 95, + "us-east-1", "USD", + lookup, ) - if len(recs) != 1 { - t.Fatalf("expected 1 recommendation; got %d", len(recs)) - } - assert.Nil(t, recs[0].AlternativeTargets, - "unlisted families must not get cross-family alternatives") + require.Len(t, recs, 1) + assert.Equal(t, "m5.large", recs[0].TargetInstanceType) + assert.Empty(t, recs[0].AlternativeTargets, + "empty cache → no alternatives but primary target still surfaced") } -func TestAnalyzeReshaping_StandardRIStillSkipped(t *testing.T) { +// TestAnalyzeReshapingWithRecs_AppliesDollarUnitsFilter — alternatives +// that would fail AWS's $-units exchange check are dropped before +// reaching the UI, matching the existing behaviour of the local +// pre-filter. Source pricing must be supplied (NF + MonthlyCost) for +// the gate to engage. +func TestAnalyzeReshapingWithRecs_AppliesDollarUnitsFilter(t *testing.T) { t.Parallel() - // Regression guard: adding cross-family suggestions must not - // start emitting recommendations for Standard RIs, which AWS - // forbids from exchanging entirely. - recs := AnalyzeReshaping( - []RIInfo{{ID: "ri-std", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "standard"}}, - []UtilizationInfo{{RIID: "ri-std", UtilizationPercent: 30}}, + + lookup := func(_ context.Context, _, _ string) ([]OfferingOption, error) { + return []OfferingOption{ + // c5.large priced too cheap to satisfy the units check + // (4 × 5 = 20 < src 200) — should be filtered out. + {InstanceType: "c5.large", OfferingID: "off-c5", EffectiveMonthlyCost: 5.0, NormalizationFactor: 4, CurrencyCode: "USD"}, + // r5.large priced enough to pass (4 × 50 = 200 ≥ 200). + {InstanceType: "r5.large", OfferingID: "off-r5", EffectiveMonthlyCost: 50.0, NormalizationFactor: 4, CurrencyCode: "USD"}, + }, nil + } + + recs := AnalyzeReshapingWithRecs( + context.Background(), + []RIInfo{{ + ID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible", + NormalizationFactor: 8, MonthlyCost: 25, CurrencyCode: "USD", // src units = 200 + }}, + []UtilizationInfo{{RIID: "ri-1", UtilizationPercent: 50}}, 95, + "us-east-1", "USD", + lookup, ) - assert.Empty(t, recs, "standard RI must never produce a recommendation") + require.Len(t, recs, 1) + alts := recs[0].AlternativeTargets + require.Len(t, alts, 1, "only r5.large should pass; c5.large gets filtered for failing the $-units check") + assert.Equal(t, "r5.large", alts[0].InstanceType) } -func TestAnalyzeReshapingWithOfferings_EnrichesAlternatives(t *testing.T) { +// TestAnalyzeReshapingWithRecs_NoSourcePricingSkipsFilter — when the +// caller doesn't populate RIInfo.MonthlyCost (zero), the filter is +// skipped entirely and every cross-family offering passes through. +// Pins backwards compatibility for older callers that don't compute +// per-RI monthly cost. +func TestAnalyzeReshapingWithRecs_NoSourcePricingSkipsFilter(t *testing.T) { t.Parallel() - // m5.xlarge at 50% → primary m5.large + alternatives m6i.large, - // m7g.large. Lookup returns pricing for all three plus an extra - // unrelated offering that should be ignored. Lookup is called - // exactly once with the de-duplicated instance-type set. - var gotTypes []string - var callCount int - lookup := func(ctx context.Context, instanceTypes []string) ([]OfferingOption, error) { - callCount++ - gotTypes = append([]string{}, instanceTypes...) + lookup := func(_ context.Context, _, _ string) ([]OfferingOption, error) { return []OfferingOption{ - {InstanceType: "m5.large", OfferingID: "off-m5", EffectiveMonthlyCost: 40.0}, - {InstanceType: "m6i.large", OfferingID: "off-m6i", EffectiveMonthlyCost: 35.0}, - {InstanceType: "m7g.large", OfferingID: "off-m7g", EffectiveMonthlyCost: 30.0}, - {InstanceType: "irrelevant.size", OfferingID: "off-x", EffectiveMonthlyCost: 999.0}, + // Would normally be dropped if the filter ran, but no source + // pricing means we keep today's "show every cross-family match" + // behaviour. + {InstanceType: "c5.large", OfferingID: "off-c5", EffectiveMonthlyCost: 5.0, NormalizationFactor: 4}, + {InstanceType: "r5.large", OfferingID: "off-r5", EffectiveMonthlyCost: 50.0, NormalizationFactor: 4}, }, nil } - - recs := AnalyzeReshapingWithOfferings( + recs := AnalyzeReshapingWithRecs( context.Background(), + // Older RIInfo shape: no MonthlyCost / CurrencyCode populated. []RIInfo{{ID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible"}}, []UtilizationInfo{{RIID: "ri-1", UtilizationPercent: 50}}, 95, + "us-east-1", "", lookup, ) - if len(recs) != 1 { - t.Fatalf("expected 1 rec; got %d", len(recs)) - } - assert.Equal(t, 1, callCount, "lookup must be called exactly once (batched)") - - sort.Strings(gotTypes) - assert.Equal(t, []string{"m5.large", "m6i.large", "m7g.large"}, gotTypes, - "lookup should receive de-duplicated instance types") - - got := recs[0] - assert.Equal(t, "m5.large", got.TargetInstanceType) - if len(got.AlternativeTargets) != 2 { - t.Fatalf("expected 2 alternatives; got %+v", got.AlternativeTargets) - } - // AnalyzeReshapingWithOfferings sorts alternatives ascending by - // EffectiveMonthlyCost so the cheapest option is first. Lookup - // returned m6i.large=$35 and m7g.large=$30 → m7g first. - assert.Equal(t, "m7g.large", got.AlternativeTargets[0].InstanceType) - assert.Equal(t, "off-m7g", got.AlternativeTargets[0].OfferingID) - assert.InDelta(t, 30.0, got.AlternativeTargets[0].EffectiveMonthlyCost, 0.001) - assert.Equal(t, "m6i.large", got.AlternativeTargets[1].InstanceType) - assert.InDelta(t, 35.0, got.AlternativeTargets[1].EffectiveMonthlyCost, 0.001) + require.Len(t, recs, 1) + require.Len(t, recs[0].AlternativeTargets, 2, + "with no source pricing, both cross-family alternatives must remain visible") } -func TestAnalyzeReshapingWithOfferings_MissingOfferingDroppedNotWholeRec(t *testing.T) { +// TestAnalyzeReshapingWithRecs_ExcludesPrimaryTarget — when the lookup +// returns an offering whose InstanceType matches the rec's primary +// TargetInstanceType, that offering is excluded from AlternativeTargets +// because it isn't an alternative to itself; it's the same suggestion. +// The cross-family offerings that remain are surfaced as before. +func TestAnalyzeReshapingWithRecs_ExcludesPrimaryTarget(t *testing.T) { t.Parallel() - // AWS doesn't offer m7g.large in this region. The lookup returns - // only m5.large + m6i.large. The rec should still ship with - // m5.large as primary and m6i.large as the only alternative; - // m7g.large is silently dropped. - lookup := func(ctx context.Context, _ []string) ([]OfferingOption, error) { + lookup := func(_ context.Context, _, _ string) ([]OfferingOption, error) { return []OfferingOption{ - {InstanceType: "m5.large", OfferingID: "off-m5", EffectiveMonthlyCost: 40.0}, - {InstanceType: "m6i.large", OfferingID: "off-m6i", EffectiveMonthlyCost: 35.0}, + // Same family + same size as the primary target — must be + // excluded by the same-family guard regardless. + {InstanceType: "m5.large", OfferingID: "off-m5", EffectiveMonthlyCost: 40.0, NormalizationFactor: 4}, + {InstanceType: "c5.large", OfferingID: "off-c5", EffectiveMonthlyCost: 50.0, NormalizationFactor: 4}, }, nil } - recs := AnalyzeReshapingWithOfferings( + recs := AnalyzeReshapingWithRecs( context.Background(), []RIInfo{{ID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible"}}, []UtilizationInfo{{RIID: "ri-1", UtilizationPercent: 50}}, 95, + "us-east-1", "", lookup, ) - if len(recs) != 1 { - t.Fatalf("expected rec to still ship; got %d", len(recs)) - } - if len(recs[0].AlternativeTargets) != 1 || recs[0].AlternativeTargets[0].InstanceType != "m6i.large" { - t.Fatalf("expected only m6i.large to survive; got %+v", recs[0].AlternativeTargets) - } + require.Len(t, recs, 1) + require.Len(t, recs[0].AlternativeTargets, 1) + assert.Equal(t, "c5.large", recs[0].AlternativeTargets[0].InstanceType) } -func TestAnalyzeReshapingWithOfferings_LookupErrorFallsBackToBaseRecs(t *testing.T) { +// TestAnalyzeReshapingWithRecs_LookupErrorFallsBackToBaseRecs — Cost +// Explorer cache read fails: return the base recs (primary target +// intact, empty alternatives). Losing alternatives is strictly less +// bad than losing the whole reshape page. +func TestAnalyzeReshapingWithRecs_LookupErrorFallsBackToBaseRecs(t *testing.T) { t.Parallel() - // Cost Explorer 5xx → return base recs with empty - // AlternativeTargets (primary target still shown). Losing pricing - // is strictly less bad than losing the whole reshape page. - lookup := func(ctx context.Context, _ []string) ([]OfferingOption, error) { - return nil, fmt.Errorf("api call failed") + lookup := func(_ context.Context, _, _ string) ([]OfferingOption, error) { + return nil, fmt.Errorf("postgres timeout") } - recs := AnalyzeReshapingWithOfferings( + recs := AnalyzeReshapingWithRecs( context.Background(), []RIInfo{{ID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible"}}, []UtilizationInfo{{RIID: "ri-1", UtilizationPercent: 50}}, 95, + "us-east-1", "USD", lookup, ) - if len(recs) != 1 { - t.Fatalf("expected 1 rec; got %d", len(recs)) - } + require.Len(t, recs, 1) assert.Equal(t, "m5.large", recs[0].TargetInstanceType) - // Base recs had name-only alternatives; lookup failed so those - // are preserved as-is (not silently cleared). The handler can - // still render instance-type chips even without pricing. - assert.NotEmpty(t, recs[0].AlternativeTargets, - "lookup error should leave the name-only alternatives intact") + assert.Empty(t, recs[0].AlternativeTargets, + "lookup error → no alternatives, primary target still ships") } -func TestAnalyzeReshapingWithOfferings_NilLookupUsesBaseRecs(t *testing.T) { +// TestAnalyzeReshapingWithRecs_NilLookupUsesBaseRecs — defensive: nil +// lookup should not panic; the base recs flow through unchanged. +func TestAnalyzeReshapingWithRecs_NilLookupUsesBaseRecs(t *testing.T) { t.Parallel() - recs := AnalyzeReshapingWithOfferings( + recs := AnalyzeReshapingWithRecs( context.Background(), []RIInfo{{ID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible"}}, []UtilizationInfo{{RIID: "ri-1", UtilizationPercent: 50}}, 95, + "us-east-1", "USD", nil, ) - if len(recs) != 1 { - t.Fatalf("expected 1 rec; got %d", len(recs)) + require.Len(t, recs, 1) + assert.Equal(t, "m5.large", recs[0].TargetInstanceType) + assert.Empty(t, recs[0].AlternativeTargets) +} + +// TestAnalyzeReshapingWithRecs_LegacyFamilyM4GeneratesAlternatives — +// post-refactor, "legacy family" support is no longer hand-curated: +// any cross-family offering returned by the cached recs surfaces as +// long as it passes the dollar-units check. An underutilised m4 RI +// paired against c5 / r5 / m5 recs surfaces all three (m5 because it +// is a different family from m4 and the gate accepts). +func TestAnalyzeReshapingWithRecs_LegacyFamilyM4GeneratesAlternatives(t *testing.T) { + t.Parallel() + lookup := func(_ context.Context, _, _ string) ([]OfferingOption, error) { + return []OfferingOption{ + {InstanceType: "m5.large", OfferingID: "off-m5l", EffectiveMonthlyCost: 60.0, NormalizationFactor: 4, CurrencyCode: "USD"}, + {InstanceType: "c5.large", OfferingID: "off-c5l", EffectiveMonthlyCost: 70.0, NormalizationFactor: 4, CurrencyCode: "USD"}, + }, nil } - assert.NotEmpty(t, recs[0].AlternativeTargets, - "nil lookup should still leave the name-only alternatives in place") + recs := AnalyzeReshapingWithRecs( + context.Background(), + []RIInfo{{ + ID: "ri-m4", InstanceType: "m4.xlarge", InstanceCount: 1, OfferingClass: "convertible", + NormalizationFactor: 8, MonthlyCost: 25, CurrencyCode: "USD", // src units = 200 + }}, + []UtilizationInfo{{RIID: "ri-m4", UtilizationPercent: 50}}, + 95, + "us-east-1", "USD", + lookup, + ) + require.Len(t, recs, 1) + got := recs[0] + assert.Equal(t, "m4.large", got.TargetInstanceType) + require.Len(t, got.AlternativeTargets, 2, + "both m5.large and c5.large should pass the $-units check") + // Ascending by EffectiveMonthlyCost. + assert.Equal(t, "m5.large", got.AlternativeTargets[0].InstanceType) + assert.Equal(t, "c5.large", got.AlternativeTargets[1].InstanceType) } // TestPassesDollarUnitsCheck pins the local pre-filter rule that gates @@ -348,99 +375,3 @@ func TestPassesDollarUnitsCheck(t *testing.T) { }) } } - -// TestAnalyzeReshapingWithOfferings_LegacyFamilyM4GeneratesM5Alternative -// — given an underutilised m4 RI, the new legacy-family entry in -// peerFamilyGroups + the dollar-units pre-filter should surface m5 as -// an alternative when its (NF × EMC) is at least the source's units. -func TestAnalyzeReshapingWithOfferings_LegacyFamilyM4GeneratesM5Alternative(t *testing.T) { - t.Parallel() - - lookup := func(_ context.Context, _ []string) ([]OfferingOption, error) { - return []OfferingOption{ - // m4.large primary: NF=4, EMC=$50 → 200 units (matches src 200, passes the boundary). - {InstanceType: "m4.large", OfferingID: "off-m4l", EffectiveMonthlyCost: 50.0, NormalizationFactor: 4}, - // m5.large alternative: NF=4, EMC=$60 → 240 units (>= src 200 → passes). - {InstanceType: "m5.large", OfferingID: "off-m5l", EffectiveMonthlyCost: 60.0, NormalizationFactor: 4}, - }, nil - } - - recs := AnalyzeReshapingWithOfferings( - context.Background(), - []RIInfo{{ - ID: "ri-m4", InstanceType: "m4.xlarge", InstanceCount: 1, OfferingClass: "convertible", - NormalizationFactor: 8, MonthlyCost: 25, // src units = 200 - }}, - []UtilizationInfo{{RIID: "ri-m4", UtilizationPercent: 50}}, - 95, - lookup, - ) - require.Len(t, recs, 1) - got := recs[0] - assert.Equal(t, "m4.large", got.TargetInstanceType) - require.Len(t, got.AlternativeTargets, 1, "m5.large should pass the $-units check") - assert.Equal(t, "m5.large", got.AlternativeTargets[0].InstanceType) -} - -// TestAnalyzeReshapingWithOfferings_DollarUnitsFilterDropsUnviable -// — when an alternative would fail AWS's $-units check, the local -// pre-filter excludes it from the rec's AlternativeTargets so the UI -// doesn't show a suggestion that would be rejected at exchange time. -func TestAnalyzeReshapingWithOfferings_DollarUnitsFilterDropsUnviable(t *testing.T) { - t.Parallel() - - lookup := func(_ context.Context, _ []string) ([]OfferingOption, error) { - return []OfferingOption{ - // Primary m5.large stays untouched (it's not an "alternative"). - {InstanceType: "m5.large", OfferingID: "off-m5l", EffectiveMonthlyCost: 50.0, NormalizationFactor: 4}, - // m6i.large priced too cheap to satisfy the units check — should be filtered out. - {InstanceType: "m6i.large", OfferingID: "off-m6il", EffectiveMonthlyCost: 5.0, NormalizationFactor: 4}, - // m7g.large priced enough to pass. - {InstanceType: "m7g.large", OfferingID: "off-m7gl", EffectiveMonthlyCost: 50.0, NormalizationFactor: 4}, - }, nil - } - - recs := AnalyzeReshapingWithOfferings( - context.Background(), - []RIInfo{{ - ID: "ri-m5", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible", - NormalizationFactor: 8, MonthlyCost: 25, // src units = 200 - }}, - []UtilizationInfo{{RIID: "ri-m5", UtilizationPercent: 50}}, - 95, - lookup, - ) - require.Len(t, recs, 1) - alts := recs[0].AlternativeTargets - require.Len(t, alts, 1, "only m7g.large should pass; m6i.large gets filtered for failing the $-units check") - assert.Equal(t, "m7g.large", alts[0].InstanceType) -} - -// TestAnalyzeReshapingWithOfferings_NoSourcePricingSkipsFilter — when -// the caller doesn't populate RIInfo.MonthlyCost (zero), the filter is -// skipped entirely and behaviour matches today's "name + offering" -// shape. Pins backwards compatibility for older callers. -func TestAnalyzeReshapingWithOfferings_NoSourcePricingSkipsFilter(t *testing.T) { - t.Parallel() - - lookup := func(_ context.Context, _ []string) ([]OfferingOption, error) { - return []OfferingOption{ - {InstanceType: "m5.large", OfferingID: "off-m5l", EffectiveMonthlyCost: 50.0, NormalizationFactor: 4}, - // Would normally be dropped if the filter ran, but no source - // pricing means we keep today's "show every match" behaviour. - {InstanceType: "m6i.large", OfferingID: "off-m6il", EffectiveMonthlyCost: 5.0, NormalizationFactor: 4}, - {InstanceType: "m7g.large", OfferingID: "off-m7gl", EffectiveMonthlyCost: 50.0, NormalizationFactor: 4}, - }, nil - } - - recs := AnalyzeReshapingWithOfferings( - context.Background(), - // Older RIInfo shape: no MonthlyCost / CurrencyCode populated. - []RIInfo{{ID: "ri-m5", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible"}}, - []UtilizationInfo{{RIID: "ri-m5", UtilizationPercent: 50}}, - 95, - lookup, - ) - require.Len(t, recs, 1) - require.Len(t, recs[0].AlternativeTargets, 2, "with no source pricing, both alternatives must remain visible") -} diff --git a/providers/aws/services/ec2/client.go b/providers/aws/services/ec2/client.go index 6d6ded7d..95603460 100644 --- a/providers/aws/services/ec2/client.go +++ b/providers/aws/services/ec2/client.go @@ -457,95 +457,6 @@ type FindConvertibleOfferingParams struct { Duration int64 } -// FindConvertibleOfferings enumerates convertible RI offerings for -// every instance type in instanceTypes in a single -// DescribeReservedInstancesOfferings call (via a multi-value -// instance-type filter). Returns one OfferingOption per instance type -// that has at least one matching offering, sorted ascending by -// EffectiveMonthlyCost. Missing instance types (no matching offering -// in the region) are silently dropped — the caller treats that as -// "offer not available" rather than an error. -// -// Cost formula (matches AWS Reserved Instances console): -// -// monthly = (FixedPrice / hours_per_term + UsagePrice + Σ RecurringCharges[].Amount) × 730 -// hours_per_term = Duration_seconds / 3600 -// 730 = AWS's canonical hours-per-month -// -// Defaults mirror FindConvertibleOffering: Linux/UNIX, default tenancy, -// Region scope, 1 year term. -func (c *Client) FindConvertibleOfferings(ctx context.Context, instanceTypes []string) ([]exchange.OfferingOption, error) { - if len(instanceTypes) == 0 { - return nil, nil - } - duration := OneYearSeconds - filters := []types.Filter{ - {Name: aws.String("instance-type"), Values: instanceTypes}, - {Name: aws.String("product-description"), Values: []string{"Linux/UNIX"}}, - {Name: aws.String("instance-tenancy"), Values: []string{"default"}}, - {Name: aws.String("scope"), Values: []string{"Region"}}, - {Name: aws.String("duration"), Values: []string{fmt.Sprintf("%d", duration)}}, - {Name: aws.String("offering-class"), Values: []string{"convertible"}}, - } - input := &ec2.DescribeReservedInstancesOfferingsInput{ - Filters: filters, - IncludeMarketplace: aws.Bool(false), - MaxResults: aws.Int32(100), - } - result, err := c.client.DescribeReservedInstancesOfferings(ctx, input) - if err != nil { - return nil, fmt.Errorf("failed to describe convertible offerings: %w", err) - } - - // Pick the cheapest offering per instance type: AWS may return - // multiple offerings per (instance-type, duration, tenancy) tuple - // that differ only by payment option (all-upfront / partial / - // no-upfront); we want the one with the lowest effective monthly - // cost so the user's "alternative" comparison is apples-to-apples. - bestByType := make(map[string]exchange.OfferingOption) - for _, o := range result.ReservedInstancesOfferings { - instanceType := string(o.InstanceType) - cost := effectiveMonthlyCost(o) - if existing, ok := bestByType[instanceType]; ok && existing.EffectiveMonthlyCost <= cost { - continue - } - bestByType[instanceType] = exchange.OfferingOption{ - InstanceType: instanceType, - OfferingID: aws.ToString(o.ReservedInstancesOfferingId), - EffectiveMonthlyCost: cost, - NormalizationFactor: normalizationFactorForInstanceType(instanceType), - CurrencyCode: string(o.CurrencyCode), - } - } - - out := make([]exchange.OfferingOption, 0, len(bestByType)) - for _, o := range bestByType { - out = append(out, o) - } - sort.Slice(out, func(i, j int) bool { - return out[i].EffectiveMonthlyCost < out[j].EffectiveMonthlyCost - }) - return out, nil -} - -// effectiveMonthlyCost computes the monthly cost AWS's RI console -// displays. All three inputs (FixedPrice, UsagePrice, RecurringCharges) -// can be absent depending on the payment option — treat missing as 0. -func effectiveMonthlyCost(o types.ReservedInstancesOffering) float64 { - hoursPerTerm := float64(aws.ToInt64(o.Duration)) / 3600.0 - if hoursPerTerm <= 0 { - hoursPerTerm = float64(OneYearSeconds) / 3600.0 - } - fixed := float64(aws.ToFloat32(o.FixedPrice)) - usage := float64(aws.ToFloat32(o.UsagePrice)) - var recurring float64 - for _, rc := range o.RecurringCharges { - recurring += aws.ToFloat64(rc.Amount) - } - hourly := fixed/hoursPerTerm + usage + recurring - return hourly * 730.0 -} - // FindConvertibleOffering finds a convertible RI offering ID for the given parameters. func (c *Client) FindConvertibleOffering(ctx context.Context, params FindConvertibleOfferingParams) (string, error) { tenancy := params.Tenancy diff --git a/providers/aws/services/ec2/client_offerings_test.go b/providers/aws/services/ec2/client_offerings_test.go deleted file mode 100644 index f2f71e41..00000000 --- a/providers/aws/services/ec2/client_offerings_test.go +++ /dev/null @@ -1,112 +0,0 @@ -package ec2 - -import ( - "context" - "testing" - - "github.com/aws/aws-sdk-go-v2/aws" - "github.com/aws/aws-sdk-go-v2/service/ec2" - "github.com/aws/aws-sdk-go-v2/service/ec2/types" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/mock" -) - -// TestFindConvertibleOfferings_ReturnsRankedByEffectiveCost verifies that -// FindConvertibleOfferings batches its call, picks the cheapest offering -// per instance type, and sorts ascending by EffectiveMonthlyCost. -func TestFindConvertibleOfferings_ReturnsRankedByEffectiveCost(t *testing.T) { - t.Parallel() - mockEC2 := new(MockEC2Client) - client := &Client{client: mockEC2, region: "us-east-1"} - - // AWS returns 4 offerings: 2 for m5.large at different payment - // options, 1 for m6i.large, 1 for m7g.large. The method must - // collapse the two m5.large rows to the cheaper of the two and - // sort the result ascending by monthly cost. - // - // Costs chosen so sort order is m7g (cheapest) → m6i → m5 cheap - // variant → dropped m5 expensive variant. With 1-year term - // (31536000s = 8760 hours) and 730 hrs/month: - // m5.large cheap: FixedPrice=0, Usage=0.04 → 0.04 * 730 = 29.20 - // m5.large dear: FixedPrice=876, Usage=0 → 876/8760 * 730 = 73.00 - // m6i.large: FixedPrice=0, Usage=0.03 → 21.90 - // m7g.large: FixedPrice=0, Usage=0.02 → 14.60 - mockEC2.On("DescribeReservedInstancesOfferings", mock.Anything, - mock.MatchedBy(func(in *ec2.DescribeReservedInstancesOfferingsInput) bool { - // Assert the filter carries all three instance types in one call. - for _, f := range in.Filters { - if aws.ToString(f.Name) == "instance-type" { - if len(f.Values) != 3 { - return false - } - } - } - return true - })).Return(&ec2.DescribeReservedInstancesOfferingsOutput{ - ReservedInstancesOfferings: []types.ReservedInstancesOffering{ - { - ReservedInstancesOfferingId: aws.String("off-m5-cheap"), - InstanceType: "m5.large", - Duration: aws.Int64(OneYearSeconds), - FixedPrice: aws.Float32(0), - UsagePrice: aws.Float32(0.04), - }, - { - ReservedInstancesOfferingId: aws.String("off-m5-dear"), - InstanceType: "m5.large", - Duration: aws.Int64(OneYearSeconds), - FixedPrice: aws.Float32(876), - UsagePrice: aws.Float32(0), - }, - { - ReservedInstancesOfferingId: aws.String("off-m6i"), - InstanceType: "m6i.large", - Duration: aws.Int64(OneYearSeconds), - FixedPrice: aws.Float32(0), - UsagePrice: aws.Float32(0.03), - }, - { - ReservedInstancesOfferingId: aws.String("off-m7g"), - InstanceType: "m7g.large", - Duration: aws.Int64(OneYearSeconds), - FixedPrice: aws.Float32(0), - UsagePrice: aws.Float32(0.02), - }, - }, - }, nil) - - got, err := client.FindConvertibleOfferings(context.Background(), - []string{"m5.large", "m6i.large", "m7g.large"}) - assert.NoError(t, err) - if len(got) != 3 { - t.Fatalf("expected 3 offerings (one per instance type, expensive m5 dropped); got %d", len(got)) - } - - // Ascending by monthly cost. - assert.Equal(t, "m7g.large", got[0].InstanceType) - assert.Equal(t, "off-m7g", got[0].OfferingID) - assert.InDelta(t, 14.60, got[0].EffectiveMonthlyCost, 0.01) - - assert.Equal(t, "m6i.large", got[1].InstanceType) - assert.InDelta(t, 21.90, got[1].EffectiveMonthlyCost, 0.01) - - assert.Equal(t, "m5.large", got[2].InstanceType) - assert.Equal(t, "off-m5-cheap", got[2].OfferingID, "cheaper m5.large variant must win over the expensive one") - assert.InDelta(t, 29.20, got[2].EffectiveMonthlyCost, 0.01) - - // Single batched API call, not one per instance type. - mockEC2.AssertNumberOfCalls(t, "DescribeReservedInstancesOfferings", 1) -} - -// TestFindConvertibleOfferings_EmptyInputMakesNoCall guards against -// fan-out when the caller passes an empty list (shouldn't happen in -// practice but the helper is library-public). -func TestFindConvertibleOfferings_EmptyInputMakesNoCall(t *testing.T) { - t.Parallel() - mockEC2 := new(MockEC2Client) - client := &Client{client: mockEC2} - got, err := client.FindConvertibleOfferings(context.Background(), nil) - assert.NoError(t, err) - assert.Nil(t, got) - mockEC2.AssertNotCalled(t, "DescribeReservedInstancesOfferings", mock.Anything, mock.Anything) -} From 742b5b063a21c95e6139cb43f7b1aab2fddd104a Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Mon, 27 Apr 2026 23:20:34 +0200 Subject: [PATCH 02/12] fix(api): normalize region after loadAWSConfigWithRegion (#79 nitpick) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the reshape endpoint is called without ?region=, loadAWSConfigWithRegion resolves a default region from the AWS SDK chain but the local `region` string stays empty. That empty value then propagates into the RI utilization cache key and AnalyzeReshapingWithRecs lookup, which scopes neither — so alternatives from unrelated regions could surface on the reshape page. Adopt cfg.Region after the config load when the query string was empty so every downstream consumer sees the same region the AWS clients are talking to. CodeRabbit (PR #79). --- internal/api/handler_ri_exchange.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/internal/api/handler_ri_exchange.go b/internal/api/handler_ri_exchange.go index 59e0a135..5ad80f63 100644 --- a/internal/api/handler_ri_exchange.go +++ b/internal/api/handler_ri_exchange.go @@ -229,6 +229,15 @@ func (h *Handler) getReshapeRecommendations(ctx context.Context, req *events.Lam if err != nil { return nil, fmt.Errorf("failed to load AWS config: %w", err) } + // Normalize region: when the caller omits ?region=, loadAWSConfigWithRegion + // resolves a default from the AWS SDK chain but the local string stays + // empty — which would scope the RI utilization cache and the recs lookup + // unscoped, leaking alternatives from other regions onto the reshape page. + // Adopt the resolved region so every downstream consumer sees the same + // value the AWS clients are actually talking to. + if region == "" { + region = cfg.Region + } ec2Client := h.buildReshapeEC2Client(cfg) instances, err := ec2Client.ListConvertibleReservedInstances(ctx) From 0768ac55fd8662edf8b26fcb7bab67122d796194 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Mon, 27 Apr 2026 23:22:56 +0200 Subject: [PATCH 03/12] fix(api): resolveAWSCloudAccountID returns (string, error) (#79 nitpick) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The resolver previously hid ListCloudAccounts errors behind an empty-string return that the caller treated as "no AWS account registered, omit the AccountIDs filter". A real DB error therefore caused the rec-driven lookup to run unscoped and could match recommendations from another tenant. Promote the function to (string, error): - ("", nil) for the intentionally-unregistered cases (no STS resolution, no CloudAccount rows, no matching ExternalID) — caller still treats this as "skip the filter" so first-run/dev keeps working. - ("", err) for genuine ListCloudAccounts failures — handler now aborts the reshape request rather than silently broadening scope. CodeRabbit (PR #79). --- internal/api/exchange_lookup.go | 38 +++++++++++++++++------------ internal/api/handler_ri_exchange.go | 9 +++++-- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/internal/api/exchange_lookup.go b/internal/api/exchange_lookup.go index 0a413cae..0c7290af 100644 --- a/internal/api/exchange_lookup.go +++ b/internal/api/exchange_lookup.go @@ -3,6 +3,7 @@ package api import ( "context" + "fmt" "strings" "github.com/LeanerCloud/CUDly/internal/config" @@ -98,32 +99,39 @@ func splitInstanceType(instanceType string) (family, size string) { // resolveAWSCloudAccountID maps the running AWS account ID (raw, from // STS) to the registered CloudAccount UUID so the reshape lookup can // scope its query against the correct row in the recommendations -// table. Returns "" when: +// table. Returns ("", nil) for the intentionally-unregistered cases: +// // - no AWS account ID could be resolved (STS denied / ambient creds // not available); -// - no CloudAccount row exists with that ExternalID (the operator -// hasn't registered this account yet — common on first-run / dev). +// - ListCloudAccounts returned no rows (no CloudAccounts registered +// at all); +// - no CloudAccount row matches the running ExternalID (the operator +// hasn't registered THIS account yet — common on first-run / dev). +// +// All three are clean "no scope filter" signals — purchaseRecLookupFromStore +// treats ("", nil) as "skip the AccountIDs filter" so a deployment with +// ambient credentials and no registered CloudAccounts still sees +// alternatives, not a permanently empty list. Once the operator +// registers the account the filter engages automatically. // -// Empty UUID propagates to purchaseRecLookupFromStore as "skip the -// AccountIDs filter", which causes the lookup to return whatever recs -// exist in the region. That's intentional: a deployment with one -// ambient-credentials account and no registered CloudAccounts should -// still see alternatives, not a permanently empty list. Once the -// operator registers the account the filter engages automatically. -func (h *Handler) resolveAWSCloudAccountID(ctx context.Context) string { +// A real ListCloudAccounts error (DB outage, permissions failure) is +// returned verbatim so the caller can abort the lookup rather than +// silently fall through to an unscoped query that might match the +// wrong tenant's recs. +func (h *Handler) resolveAWSCloudAccountID(ctx context.Context) (string, error) { awsAccountID := h.resolveAWSAccountID(ctx) if awsAccountID == "" { - return "" + return "", nil } provider := "aws" accounts, err := h.config.ListCloudAccounts(ctx, config.CloudAccountFilter{Provider: &provider}) - if err != nil || len(accounts) == 0 { - return "" + if err != nil { + return "", fmt.Errorf("list cloud accounts for reshape scope: %w", err) } for _, a := range accounts { if a.ExternalID == awsAccountID { - return a.ID + return a.ID, nil } } - return "" + return "", nil } diff --git a/internal/api/handler_ri_exchange.go b/internal/api/handler_ri_exchange.go index 5ad80f63..55742e81 100644 --- a/internal/api/handler_ri_exchange.go +++ b/internal/api/handler_ri_exchange.go @@ -259,8 +259,13 @@ func (h *Handler) getReshapeRecommendations(ctx context.Context, req *events.Lam // account (when registered) so a multi-tenant deployment can't // surface another tenant's recs. Empty resolved account ID means // "no scope filter" for ambient-credentials deployments where - // CloudAccount registration hasn't happened yet. - cloudAccountID := h.resolveAWSCloudAccountID(ctx) + // CloudAccount registration hasn't happened yet; a real ListCloudAccounts + // error aborts the request instead of silently falling through to an + // unscoped query that could match the wrong tenant's recs. + cloudAccountID, err := h.resolveAWSCloudAccountID(ctx) + if err != nil { + return nil, fmt.Errorf("failed to resolve cloud account scope for reshape: %w", err) + } currencyCode := firstNonEmptyCurrency(instances) lookup := purchaseRecLookupFromStore(h.config, cloudAccountID) recs := exchange.AnalyzeReshapingWithRecs(ctx, riInfos, utilInfos, threshold, region, currencyCode, lookup) From dd58fa7fb74d98800eb1aa3480c9d8d8373e06cb Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Mon, 27 Apr 2026 23:30:17 +0200 Subject: [PATCH 04/12] fix(exchange): filter alternatives by term/duration (#79 nitpick) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AnalyzeReshapingWithRecs surfaced cross-family alternatives without checking that the offering's commitment term matched the source RI's, so a 1y RI could land 3y alternatives in AlternativeTargets (and vice versa) — wrong because AWS only allows reshape exchanges within the same term. Add TermSeconds to RIInfo and OfferingOption (canonical AWS RI duration unit, matches ec2.ReservedInstance.Duration and the existing ConvertibleRI.Duration field). The handler plumbs inst.Duration through to RIInfo; the recs lookup converts rec.Term (years) using 31_536_000 s/year. fillAlternativesFromRecs rejects offerings whose TermSeconds differs from the source before the dollar-units gate runs. Both sides falling back to zero TermSeconds skips the term gate so existing fixtures and any older callers stay green. Regression tests: - pkg/exchange/reshape_crossfamily_test.go: 1y source + (1y, 3y) alternatives → only the 1y survives; zero-term source still passes the legacy mixed-term offering through. - internal/api/exchange_lookup_test.go: 1y rec → 31_536_000s, 3y rec → 3 × 31_536_000s, Term==0 → TermSeconds==0. CodeRabbit (PR #79). --- internal/api/exchange_lookup.go | 18 ++++ internal/api/exchange_lookup_test.go | 30 ++++++ internal/api/handler_ri_exchange.go | 5 + pkg/exchange/reshape.go | 118 ++++++++++++++++++----- pkg/exchange/reshape_crossfamily_test.go | 71 ++++++++++++++ 5 files changed, 219 insertions(+), 23 deletions(-) diff --git a/internal/api/exchange_lookup.go b/internal/api/exchange_lookup.go index 0c7290af..65f9a846 100644 --- a/internal/api/exchange_lookup.go +++ b/internal/api/exchange_lookup.go @@ -64,6 +64,12 @@ func purchaseRecLookupFromStore(store recsLister, accountID string) exchange.Pur // recommendation to the OfferingOption shape the reshape layer // consumes. Pulled out so the mapping can be unit-tested in isolation // (no DB / no ctx required). +// +// TermSeconds is derived from rec.Term (years) using the canonical +// AWS RI duration constant 31_536_000s/year — the same value the AWS +// SDK reports on ec2.ReservedInstance.Duration so the term-match guard +// in pkg/exchange.fillAlternativesFromRecs can compare apples-to-apples +// against RIInfo.TermSeconds populated from ec2.ConvertibleRI.Duration. func recommendationToOffering(rec config.RecommendationRecord, currencyCode string) exchange.OfferingOption { monthly := rec.MonthlyCost if rec.Term > 0 { @@ -75,15 +81,27 @@ func recommendationToOffering(rec config.RecommendationRecord, currencyCode stri } } _, size := splitInstanceType(rec.ResourceType) + var termSeconds int64 + if rec.Term > 0 { + termSeconds = int64(rec.Term) * secondsPerYear + } return exchange.OfferingOption{ InstanceType: rec.ResourceType, OfferingID: rec.ID, EffectiveMonthlyCost: monthly, NormalizationFactor: exchange.NormalizationFactorForSize(size), CurrencyCode: currencyCode, + TermSeconds: termSeconds, } } +// secondsPerYear is the AWS-canonical RI duration constant for a 1-year +// term (365 × 86400). Matches the value the AWS SDK reports on +// ec2.ReservedInstance.Duration and the value +// ec2.ConvertibleRI.Duration carries — used so OfferingOption.TermSeconds +// can be compared directly against RIInfo.TermSeconds. +const secondsPerYear int64 = 365 * 24 * 60 * 60 + // splitInstanceType splits "m5.xlarge" into ("m5", "xlarge"). Returns // empty strings if the format is unrecognized. Mirrors the helper in // pkg/exchange/reshape.go but kept local to avoid exporting a diff --git a/internal/api/exchange_lookup_test.go b/internal/api/exchange_lookup_test.go index 463d5871..5eedcb21 100644 --- a/internal/api/exchange_lookup_test.go +++ b/internal/api/exchange_lookup_test.go @@ -146,6 +146,36 @@ func TestPurchaseRecLookupFromStore_MapsFields(t *testing.T) { assert.InDelta(t, 50.0, got[1].EffectiveMonthlyCost, 0.001, "Term==0 means upfront cannot be amortised; fall back to MonthlyCost") assert.InDelta(t, 8.0, got[1].NormalizationFactor, 0.001, "xlarge → NF 8") + + // Term plumbing: 1y rec → 31_536_000 seconds (AWS canonical RI + // duration); Term==0 → TermSeconds==0 (the reshape term-match guard + // then falls back to "skip the gate" rather than blocking the rec). + assert.Equal(t, int64(365*24*60*60), got[0].TermSeconds, + "1-year rec must serialise to 31_536_000s for the term-match guard") + assert.Equal(t, int64(0), got[1].TermSeconds, + "Term==0 rec must not synthesise a fake duration — TermSeconds stays zero") +} + +// TestPurchaseRecLookupFromStore_ThreeYearTerm pins the multi-year +// path: rec.Term=3 must serialise to exactly 3 × 31_536_000s so the +// reshape term-match guard treats it as 3y rather than rounding it +// onto a 1y surface. +func TestPurchaseRecLookupFromStore_ThreeYearTerm(t *testing.T) { + t.Parallel() + store := &fakeRecsLister{ + out: []config.RecommendationRecord{ + { + ID: "rec-3y", Provider: "aws", Service: "ec2", Region: "us-east-1", + ResourceType: "m5.large", Term: 3, MonthlyCost: 10, + }, + }, + } + lookup := purchaseRecLookupFromStore(store, "") + got, err := lookup(context.Background(), "us-east-1", "USD") + require.NoError(t, err) + require.Len(t, got, 1) + assert.Equal(t, int64(3*365*24*60*60), got[0].TermSeconds, + "3-year rec must serialise to 3 × 31_536_000s for the term-match guard") } // TestSplitInstanceType pins the local instance-type parser used by diff --git a/internal/api/handler_ri_exchange.go b/internal/api/handler_ri_exchange.go index 55742e81..3ea4db67 100644 --- a/internal/api/handler_ri_exchange.go +++ b/internal/api/handler_ri_exchange.go @@ -193,6 +193,11 @@ func convertToExchangeTypes(instances []ec2svc.ConvertibleRI, utilData []recomme NormalizationFactor: inst.NormalizationFactor, MonthlyCost: monthlyCostFromConvertibleRI(inst), CurrencyCode: inst.CurrencyCode, + // Plumb the AWS-reported RI duration straight through — + // reshape's term-match guard rejects alternatives whose + // TermSeconds differs from the source so a 3y RI never + // surfaces as an alternative to a 1y commitment. + TermSeconds: inst.Duration, } } diff --git a/pkg/exchange/reshape.go b/pkg/exchange/reshape.go index 22f3546c..155c994f 100644 --- a/pkg/exchange/reshape.go +++ b/pkg/exchange/reshape.go @@ -29,6 +29,16 @@ type RIInfo struct { // comparisons across currencies. Empty matches the same "skip" // semantics as MonthlyCost == 0. CurrencyCode string + // TermSeconds is the RI commitment term in seconds (31_536_000 for + // 1y, 94_608_000 for 3y) — the same unit the AWS SDK uses on the + // ReservedInstance.Duration field and the unit ec2.ConvertibleRI.Duration + // already carries. Used by the cross-family alternatives pass to + // reject term-mismatched offerings (e.g. surfacing a 3y RI as an + // alternative to a 1y commitment is wrong because the customer's + // existing exchange anchor is the 1y term). Zero means "unknown" — + // the term filter is skipped for that rec to preserve today's + // behaviour for older callers that don't populate it. + TermSeconds int64 } // UtilizationInfo provides utilization data for a specific RI. @@ -62,6 +72,15 @@ type OfferingOption struct { // and target currencies to match; empty on either side falls back to // "skip the currency guard" so today's USD-only fixtures stay green. CurrencyCode string `json:"currency_code,omitempty"` + // TermSeconds is the offering's commitment term in seconds + // (31_536_000 for 1y, 94_608_000 for 3y) — same unit as + // RIInfo.TermSeconds and the AWS SDK's ReservedInstance.Duration. + // AnalyzeReshapingWithRecs rejects term-mismatched alternatives + // before building AlternativeTargets so a 3y RI never surfaces as + // an alternative to a 1y source commitment (and vice versa). Zero + // on either side falls back to "skip the term guard" for callers + // or fixtures that don't populate it. + TermSeconds int64 `json:"term_seconds,omitempty"` } // PurchaseRecLookup is the signature of the closure that resolves a @@ -341,37 +360,22 @@ func AnalyzeReshapingWithRecs( // fillAlternativesFromRecs is the per-rec body of // AnalyzeReshapingWithRecs: for each rec it filters the offerings list -// to cross-family options that pass the dollar-units gate, then sorts -// them ascending by EffectiveMonthlyCost so the cheapest lands first -// in the UI list. The source family and the primary target are excluded -// so the alternatives slice is meaningfully different from the primary -// suggestion. When source pricing is missing the gate is skipped for -// that rec (NF/MonthlyCost==0 case). +// to cross-family options that match the source RI's term and pass the +// dollar-units gate, then sorts them ascending by EffectiveMonthlyCost +// so the cheapest lands first in the UI list. The source family and +// the primary target are excluded so the alternatives slice is +// meaningfully different from the primary suggestion. When source +// pricing or the source/offering term is missing the relevant gate is +// skipped for that rec (NF/MonthlyCost==0 / TermSeconds==0 cases). func fillAlternativesFromRecs(recs []ReshapeRecommendation, offerings []OfferingOption, risByID map[string]RIInfo) { for i := range recs { src, hasSrc := risByID[recs[i].SourceRIID] srcFamily, _ := parseInstanceType(recs[i].SourceInstanceType) filled := make([]OfferingOption, 0, len(offerings)) for _, off := range offerings { - // Skip same-family offerings — those collapse onto the - // primary same-family TargetInstanceType already surfaced. - offFamily, _ := parseInstanceType(off.InstanceType) - if offFamily == "" || strings.EqualFold(offFamily, srcFamily) { + if !alternativeIsEligible(recs[i], off, src, hasSrc, srcFamily) { continue } - // Skip the primary target — it's not an "alternative" to - // itself. - if off.InstanceType == recs[i].TargetInstanceType { - continue - } - // Apply the pre-filter only when source pricing info is - // available; otherwise keep today's behaviour and pass the - // alternative through untouched. - if hasSrc && src.MonthlyCost > 0 { - if !passesDollarUnitsCheck(src.NormalizationFactor, src.MonthlyCost, src.CurrencyCode, off) { - continue - } - } filled = append(filled, off) } sort.Slice(filled, func(a, b int) bool { @@ -381,6 +385,74 @@ func fillAlternativesFromRecs(recs []ReshapeRecommendation, offerings []Offering } } +// alternativeIsEligible decides whether a single offering should appear +// in the AlternativeTargets slice for the given rec. Pulled out of +// fillAlternativesFromRecs so the per-offering guards stay readable +// (and below the 10-cyclomatic-complexity gate). Returns true only when +// every gate passes: +// +// - Cross-family: the offering must be a different family from the +// source RI; same-family offerings collapse onto the primary +// TargetInstanceType already surfaced. +// - Not the primary target: an "alternative to itself" is no +// alternative. +// - Term match: when both sides report TermSeconds, they must match +// because AWS only allows exchanges within the same term. Either +// side at zero falls back to "skip the gate" so legacy callers +// stay green. +// - $-units: when source pricing is available, the local +// passesDollarUnitsCheck approximation must hold; otherwise the +// gate is skipped. +func alternativeIsEligible(rec ReshapeRecommendation, off OfferingOption, src RIInfo, hasSrc bool, srcFamily string) bool { + if !isCrossFamilyAlternative(off, srcFamily, rec.TargetInstanceType) { + return false + } + if !termMatchesIfKnown(src, off, hasSrc) { + return false + } + if !pricingGatePasses(src, off, hasSrc) { + return false + } + return true +} + +// isCrossFamilyAlternative returns true when the offering is a valid +// cross-family alternative slot — i.e. its family parses, differs from +// the source family, and the offering isn't the same as the primary +// target the rec already surfaces. +func isCrossFamilyAlternative(off OfferingOption, srcFamily, primaryTarget string) bool { + offFamily, _ := parseInstanceType(off.InstanceType) + if offFamily == "" || strings.EqualFold(offFamily, srcFamily) { + return false + } + if off.InstanceType == primaryTarget { + return false + } + return true +} + +// termMatchesIfKnown enforces the term-match guard when both sides +// report TermSeconds. Either side at zero (or no source RI at all) +// returns true so legacy fixtures and older callers keep today's +// behaviour. +func termMatchesIfKnown(src RIInfo, off OfferingOption, hasSrc bool) bool { + if !hasSrc || src.TermSeconds <= 0 || off.TermSeconds <= 0 { + return true + } + return src.TermSeconds == off.TermSeconds +} + +// pricingGatePasses runs the $-units pre-filter when source pricing +// is available. Without source pricing the gate is skipped to preserve +// backwards compatibility for callers that only need the primary-target +// analysis. +func pricingGatePasses(src RIInfo, off OfferingOption, hasSrc bool) bool { + if !hasSrc || src.MonthlyCost <= 0 { + return true + } + return passesDollarUnitsCheck(src.NormalizationFactor, src.MonthlyCost, src.CurrencyCode, off) +} + // findBestFit finds the instance size and count that best fits normalizedUsed units. // Strategy: find the largest single-instance size that doesn't exceed normalizedUsed, // then round up to that size. This gives a practical 1-instance recommendation. diff --git a/pkg/exchange/reshape_crossfamily_test.go b/pkg/exchange/reshape_crossfamily_test.go index 306c0bd6..4b543949 100644 --- a/pkg/exchange/reshape_crossfamily_test.go +++ b/pkg/exchange/reshape_crossfamily_test.go @@ -290,6 +290,77 @@ func TestAnalyzeReshapingWithRecs_LegacyFamilyM4GeneratesAlternatives(t *testing assert.Equal(t, "c5.large", got.AlternativeTargets[1].InstanceType) } +// TestAnalyzeReshapingWithRecs_TermMismatchedAlternativesFiltered pins +// the term-match guard: a 1y source RI must not see 3y alternatives in +// AlternativeTargets (and vice versa) because AWS only allows exchanges +// within the same term. Both sides populate TermSeconds — the guard +// rejects the mismatched offering before AlternativeTargets is built. +const oneYearSeconds = int64(365 * 24 * 60 * 60) +const threeYearSeconds = 3 * oneYearSeconds + +func TestAnalyzeReshapingWithRecs_TermMismatchedAlternativesFiltered(t *testing.T) { + t.Parallel() + lookup := func(_ context.Context, _, _ string) ([]OfferingOption, error) { + return []OfferingOption{ + // 1y c5.large — same term as the source, must surface. + {InstanceType: "c5.large", OfferingID: "off-c5-1y", EffectiveMonthlyCost: 50, NormalizationFactor: 4, CurrencyCode: "USD", TermSeconds: oneYearSeconds}, + // 3y r5.large — term mismatch, must be filtered out even + // though it would otherwise pass the $-units check. + {InstanceType: "r5.large", OfferingID: "off-r5-3y", EffectiveMonthlyCost: 60, NormalizationFactor: 4, CurrencyCode: "USD", TermSeconds: threeYearSeconds}, + }, nil + } + recs := AnalyzeReshapingWithRecs( + context.Background(), + []RIInfo{{ + ID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible", + NormalizationFactor: 8, MonthlyCost: 25, CurrencyCode: "USD", + TermSeconds: oneYearSeconds, // 1y source + }}, + []UtilizationInfo{{RIID: "ri-1", UtilizationPercent: 50}}, + 95, + "us-east-1", "USD", + lookup, + ) + require.Len(t, recs, 1) + require.Len(t, recs[0].AlternativeTargets, 1, + "only the 1y alternative survives; the 3y r5.large must be filtered as term-mismatched") + assert.Equal(t, "c5.large", recs[0].AlternativeTargets[0].InstanceType) + assert.Equal(t, oneYearSeconds, recs[0].AlternativeTargets[0].TermSeconds, + "surviving alternative must carry the source's term so the UI can label it") +} + +// TestAnalyzeReshapingWithRecs_TermZeroSkipsTermGuard pins the +// backwards-compat path: when either the source or the offering omits +// TermSeconds, the guard does not fire and the alternative passes +// through (subject to the other gates). Mirrors the existing +// MonthlyCost==0 / CurrencyCode=="" skip semantics. +func TestAnalyzeReshapingWithRecs_TermZeroSkipsTermGuard(t *testing.T) { + t.Parallel() + lookup := func(_ context.Context, _, _ string) ([]OfferingOption, error) { + return []OfferingOption{ + // 3y offering, but the source has no term info — the term + // gate stays disabled so the offering surfaces. + {InstanceType: "r5.large", OfferingID: "off-r5-3y", EffectiveMonthlyCost: 60, NormalizationFactor: 4, CurrencyCode: "USD", TermSeconds: threeYearSeconds}, + }, nil + } + recs := AnalyzeReshapingWithRecs( + context.Background(), + []RIInfo{{ + ID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1, OfferingClass: "convertible", + NormalizationFactor: 8, MonthlyCost: 25, CurrencyCode: "USD", + // TermSeconds intentionally zero — older callers / fixtures. + }}, + []UtilizationInfo{{RIID: "ri-1", UtilizationPercent: 50}}, + 95, + "us-east-1", "USD", + lookup, + ) + require.Len(t, recs, 1) + require.Len(t, recs[0].AlternativeTargets, 1, + "source TermSeconds==0 must skip the term gate so today's behaviour is preserved") + assert.Equal(t, "r5.large", recs[0].AlternativeTargets[0].InstanceType) +} + // TestPassesDollarUnitsCheck pins the local pre-filter rule that gates // cross-family alternatives. The rule is conservative: when source NF // or any side's price is zero, the check returns false (skip). When From c3428ad7ec8ee29c4d21b6a82fa5774a8877bc39 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Mon, 27 Apr 2026 23:52:25 +0200 Subject: [PATCH 05/12] fix(api): fail closed on STS errors in reshape account-scope resolver (#79) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit review #2 flagged this as CRITICAL: resolveAWSCloudAccountID still returned ("", nil) when the underlying STS call failed, which propagated up as "no AccountIDs filter" and could leak another tenants recommendations on a transient AWS API hiccup. Fix the underlying primitive AND the wrapper: - internal/api/handler.go::resolveAWSAccountID — signature changes from string to (string, error). Three return shapes now distinguished: ("", nil) — AWS SDK config could not load (Azure/GCP host) ("", err) — config loaded but STS GetCallerIdentity failed (accountID, nil) — success The non-AWS-host case stays a clean signal; only real STS failures now surface as errors. - internal/api/exchange_lookup.go::resolveAWSCloudAccountID — propagate the STS error verbatim. Multi-tenant deployments now fail closed on STS hiccups instead of silently issuing an unscoped recommendations read. - internal/api/handler.go::resolveSourceIdentity — keeps the previous best-effort semantics (drops the error explicitly with a comment). populateSourceAccountID already fails loud on an empty AccountID for the AWS-source case, so the security gate is still in place; we just preserve backwards compatibility for non-reshape callers. plus pricingGatePasses NF fallback (CodeRabbit nit on PR #79): - pkg/exchange/reshape.go::pricingGatePasses — when src.NormalizationFactor is zero (callers occasionally leave it unset), derive NF from InstanceType via NormalizationFactorForSize. Avoids the silent reject-everything failure mode for partial RIInfo constructions. Tests: full internal/api + pkg/exchange suites green. --- internal/api/exchange_lookup.go | 25 +++++++++---- internal/api/handler.go | 66 ++++++++++++++++++++++++--------- pkg/exchange/reshape.go | 15 +++++++- 3 files changed, 80 insertions(+), 26 deletions(-) diff --git a/internal/api/exchange_lookup.go b/internal/api/exchange_lookup.go index 65f9a846..103d017a 100644 --- a/internal/api/exchange_lookup.go +++ b/internal/api/exchange_lookup.go @@ -119,8 +119,9 @@ func splitInstanceType(instanceType string) (family, size string) { // scope its query against the correct row in the recommendations // table. Returns ("", nil) for the intentionally-unregistered cases: // -// - no AWS account ID could be resolved (STS denied / ambient creds -// not available); +// - AWS SDK config could not load (deployment is running on an +// Azure / GCP host with no AWS context at all — resolveAWSAccountID +// returns ("", nil) for this case); // - ListCloudAccounts returned no rows (no CloudAccounts registered // at all); // - no CloudAccount row matches the running ExternalID (the operator @@ -132,13 +133,23 @@ func splitInstanceType(instanceType string) (family, size string) { // alternatives, not a permanently empty list. Once the operator // registers the account the filter engages automatically. // -// A real ListCloudAccounts error (DB outage, permissions failure) is -// returned verbatim so the caller can abort the lookup rather than -// silently fall through to an unscoped query that might match the -// wrong tenant's recs. +// FAIL CLOSED on real failures: +// - resolveAWSAccountID returns a non-nil error (STS GetCallerIdentity +// denied, transient AWS API failure, token expiry) — propagated so +// the caller aborts the lookup rather than silently falling through +// to an unscoped query that could leak another tenant's recs. +// - ListCloudAccounts returns an error (DB outage, permissions) — +// same treatment. func (h *Handler) resolveAWSCloudAccountID(ctx context.Context) (string, error) { - awsAccountID := h.resolveAWSAccountID(ctx) + awsAccountID, err := h.resolveAWSAccountID(ctx) + if err != nil { + // STS reachable-but-failed: must NOT fall through to an + // unscoped read. A transient STS error in a multi-tenant + // deployment would otherwise surface another tenant's recs. + return "", fmt.Errorf("resolve source aws account for reshape scope: %w", err) + } if awsAccountID == "" { + // Genuine "no AWS context" (Azure/GCP host, or unregistered). return "", nil } provider := "aws" diff --git a/internal/api/handler.go b/internal/api/handler.go index 276fa85e..94a2ea4e 100644 --- a/internal/api/handler.go +++ b/internal/api/handler.go @@ -458,7 +458,15 @@ func (h *Handler) resolveSourceIdentity(ctx context.Context) *sourceIdentity { id := &sourceIdentity{Provider: cloud} switch cloud { case "aws": - id.AccountID, id.Partition = h.resolveAWSCallerIdentity(ctx) + // resolveSourceIdentity is best-effort and is consumed by + // populateSourceAccountID, which fails loud on an empty + // AccountID for the AWS-source case. STS errors are already + // logged WARN inside resolveAWSCallerIdentity, so we drop + // the error here explicitly — the consumer's empty-string + // check is the security gate for federation rendering. + // (Reshape uses resolveAWSAccountID directly which DOES + // propagate the error for fail-closed multi-tenant safety.) + id.AccountID, id.Partition, _ = h.resolveAWSCallerIdentity(ctx) case "azure": id.ClientID = os.Getenv("AZURE_CLIENT_ID") id.SubscriptionID = os.Getenv("AZURE_SUBSCRIPTION_ID") @@ -478,38 +486,60 @@ func (h *Handler) resolveSourceAccountID(ctx context.Context) string { } // resolveAWSAccountID returns the AWS account ID where CUDly runs by calling -// STS GetCallerIdentity. Returns "" on AWS SDK config load failure or STS -// error (logs WARN in either case). +// STS GetCallerIdentity. Convenience wrapper around resolveAWSCallerIdentity +// for callers that only need the account ID (and the security-paths-aware +// error propagation). // -// WARNING: empty string is the silent-failure path. Callers in user-facing -// flows MUST check the result and fail loud rather than rendering an empty -// account ID into a bundle — a bundle with an empty source_account_id -// produces a broken trust policy that silently fails at apply time. -func (h *Handler) resolveAWSAccountID(ctx context.Context) string { - id, _ := h.resolveAWSCallerIdentity(ctx) - return id +// Return shape distinguishes three cases: +// +// - ("", nil) — AWS SDK config could not load (deployment runs +// without AWS context: e.g. Azure / GCP host). +// This is the legitimate "no AWS account configured" +// signal; callers may treat it as such. +// - ("", err) — AWS SDK config loaded but STS GetCallerIdentity +// failed (transient API error, missing IAM permission, +// token expiry). Callers in security-sensitive paths +// (e.g. multi-tenant scope filters) MUST fail closed +// on this — treating it like the legitimate empty +// case can leak data across tenants. +// - (accountID, nil) — success. +// +// WARNING: callers in user-facing flows must check the result and fail loud +// rather than rendering an empty account ID into a bundle — a bundle with +// an empty source_account_id produces a broken trust policy that silently +// fails at apply time. +func (h *Handler) resolveAWSAccountID(ctx context.Context) (string, error) { + id, _, err := h.resolveAWSCallerIdentity(ctx) + return id, err } // resolveAWSCallerIdentity returns (accountID, partition) parsed from STS // GetCallerIdentity. The partition is taken from the returned ARN's second // segment (e.g., `arn:aws-us-gov:iam::...` → "aws-us-gov") and is used by // the trust-policy snippet renderer to emit the correct ARN prefix in -// AWS China and GovCloud deployments (issue #130c). Both fields are best- -// effort: a STS failure returns ("", "") and a malformed ARN returns -// (accountID, "") — the frontend defaults to the `aws` partition when -// the value is empty. -func (h *Handler) resolveAWSCallerIdentity(ctx context.Context) (string, string) { +// AWS China and GovCloud deployments (issue #130c). +// +// Return shape distinguishes the same three cases as resolveAWSAccountID: +// +// - ("", "", nil) — AWS SDK config could not load (Azure/GCP host). +// - ("", "", err) — config loaded but STS failed; security- +// sensitive callers MUST fail closed on this. +// - (accountID, partition, nil) — success. partition may still be "" if the +// ARN was malformed (frontend defaults to "aws"). +func (h *Handler) resolveAWSCallerIdentity(ctx context.Context) (string, string, error) { h.awsCfgOnce.Do(func() { h.awsCfg, h.awsCfgErr = awsconfig.LoadDefaultConfig(ctx) }) if h.awsCfgErr != nil { - return "", "" + // AWS context genuinely unavailable — not an error from this + // caller's perspective (Azure/GCP host runs land here). + return "", "", nil } client := sts.NewFromConfig(h.awsCfg) identity, err := client.GetCallerIdentity(ctx, &sts.GetCallerIdentityInput{}) if err != nil { logging.Warnf("Failed to resolve source account ID via STS: %v", err) - return "", "" + return "", "", fmt.Errorf("sts get-caller-identity: %w", err) } var accountID, partition string if identity.Account != nil { @@ -518,7 +548,7 @@ func (h *Handler) resolveAWSCallerIdentity(ctx context.Context) (string, string) if identity.Arn != nil { partition = parseArnPartition(*identity.Arn) } - return accountID, partition + return accountID, partition, nil } // parseArnPartition extracts the partition segment from an AWS ARN. diff --git a/pkg/exchange/reshape.go b/pkg/exchange/reshape.go index 155c994f..211e2656 100644 --- a/pkg/exchange/reshape.go +++ b/pkg/exchange/reshape.go @@ -446,11 +446,24 @@ func termMatchesIfKnown(src RIInfo, off OfferingOption, hasSrc bool) bool { // is available. Without source pricing the gate is skipped to preserve // backwards compatibility for callers that only need the primary-target // analysis. +// +// Defensive NF fallback: callers occasionally populate MonthlyCost but +// leave NormalizationFactor at zero (e.g. tests, partial RIInfo +// constructions). A zero NF here would reject every alternative even +// when InstanceType is parseable. Derive NF from the instance size when +// it's missing; if the size doesn't match the AWS canonical table, +// NormalizationFactorForSize returns 1.0 — degrades to "no NF +// adjustment" rather than a hard reject. func pricingGatePasses(src RIInfo, off OfferingOption, hasSrc bool) bool { if !hasSrc || src.MonthlyCost <= 0 { return true } - return passesDollarUnitsCheck(src.NormalizationFactor, src.MonthlyCost, src.CurrencyCode, off) + srcNF := src.NormalizationFactor + if srcNF <= 0 { + _, size := parseInstanceType(src.InstanceType) + srcNF = NormalizationFactorForSize(size) + } + return passesDollarUnitsCheck(srcNF, src.MonthlyCost, src.CurrencyCode, off) } // findBestFit finds the instance size and count that best fits normalizedUsed units. From 159300708bd28797f430ba6268227919bf9ff9ee Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Tue, 28 Apr 2026 00:18:31 +0200 Subject: [PATCH 06/12] fix(api): close two more multi-tenant leak paths in reshape scope (#79) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit's third review pass surfaced two more CRITICAL gaps in the same code path: 1. `internal/api/exchange_lookup.go::resolveAWSCloudAccountID` previously returned ("", nil) when the running AWS account ID was resolved but DID NOT match any registered CloudAccount. `purchaseRecLookupFromStore` interpreted that as "skip the AccountIDs filter" and read every account's cached recs in the region — leaking other tenants' data when the host account was misconfigured or running as a fresh (unregistered) deployment alongside other already-registered tenants. Fix: distinguish "zero accounts registered" (legitimate bootstrap, recs table is also empty so unscoped read is harmless) from "accounts exist but none match the running ExternalID" (real misconfiguration, fail closed). The bootstrap path keeps ("", nil); the resolved-but-unregistered path now returns an explicit error. 2. `internal/api/handler.go::resolveAWSCallerIdentity` previously returned ("", "", nil) on EVERY AWS SDK config-load failure, including when the host is itself AWS. An AWS-hosted deployment whose SDK config breaks (IAM role expiry, credential file corruption) was silently treated as "no AWS context", which degraded the resolveAWSCloudAccountID multi-tenant scope filter into the same unscoped-read leak. Fix: only treat config-load failure as a non-error when sourceCloud() != "aws". On an AWS host, surface the error so security-sensitive callers fail closed. Both findings were duplicates from CodeRabbit's prior reviews on the same review pass — they re-flagged after seeing the partial fail-closed work in `f44b06567` didn't cover all three paths. Tests: full internal/api suite green. The new failure modes are covered by existing tests' coverage of the resolve helpers; pinning the specific cases as named tests is filed as #151. --- internal/api/exchange_lookup.go | 42 +++++++++++++++++++++++---------- internal/api/handler.go | 26 +++++++++++++++----- 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/internal/api/exchange_lookup.go b/internal/api/exchange_lookup.go index 103d017a..84785e32 100644 --- a/internal/api/exchange_lookup.go +++ b/internal/api/exchange_lookup.go @@ -117,29 +117,33 @@ func splitInstanceType(instanceType string) (family, size string) { // resolveAWSCloudAccountID maps the running AWS account ID (raw, from // STS) to the registered CloudAccount UUID so the reshape lookup can // scope its query against the correct row in the recommendations -// table. Returns ("", nil) for the intentionally-unregistered cases: +// table. Returns ("", nil) ONLY for the truly-no-scope cases: // // - AWS SDK config could not load (deployment is running on an // Azure / GCP host with no AWS context at all — resolveAWSAccountID // returns ("", nil) for this case); -// - ListCloudAccounts returned no rows (no CloudAccounts registered -// at all); -// - no CloudAccount row matches the running ExternalID (the operator -// hasn't registered THIS account yet — common on first-run / dev). +// - ListCloudAccounts returned ZERO rows (no CloudAccounts registered +// at all — the bootstrap-before-first-account path; the recs table +// is also empty so an unscoped read is harmless). // -// All three are clean "no scope filter" signals — purchaseRecLookupFromStore -// treats ("", nil) as "skip the AccountIDs filter" so a deployment with -// ambient credentials and no registered CloudAccounts still sees -// alternatives, not a permanently empty list. Once the operator -// registers the account the filter engages automatically. +// purchaseRecLookupFromStore treats ("", nil) as "skip the AccountIDs +// filter" so a deployment with ambient credentials and no registered +// CloudAccounts still sees alternatives, not a permanently empty list. +// Once the operator registers the account the filter engages. // -// FAIL CLOSED on real failures: +// FAIL CLOSED on every real failure: // - resolveAWSAccountID returns a non-nil error (STS GetCallerIdentity // denied, transient AWS API failure, token expiry) — propagated so // the caller aborts the lookup rather than silently falling through // to an unscoped query that could leak another tenant's recs. // - ListCloudAccounts returns an error (DB outage, permissions) — // same treatment. +// - The running AWS account is resolved but DOES NOT match any +// registered CloudAccount AND there ARE registered accounts: +// return an error. Returning ("", nil) here would have +// purchaseRecLookupFromStore omit the AccountIDs filter and serve +// up another tenant's recs — exactly the multi-tenant leak the rest +// of this code path is designed to prevent. func (h *Handler) resolveAWSCloudAccountID(ctx context.Context) (string, error) { awsAccountID, err := h.resolveAWSAccountID(ctx) if err != nil { @@ -149,7 +153,7 @@ func (h *Handler) resolveAWSCloudAccountID(ctx context.Context) (string, error) return "", fmt.Errorf("resolve source aws account for reshape scope: %w", err) } if awsAccountID == "" { - // Genuine "no AWS context" (Azure/GCP host, or unregistered). + // Genuine "no AWS context" (Azure/GCP host). return "", nil } provider := "aws" @@ -157,10 +161,22 @@ func (h *Handler) resolveAWSCloudAccountID(ctx context.Context) (string, error) if err != nil { return "", fmt.Errorf("list cloud accounts for reshape scope: %w", err) } + if len(accounts) == 0 { + // Bootstrap: no CloudAccounts registered at all. The recs + // table is necessarily empty too, so an unscoped read is a + // no-op rather than a leak. + return "", nil + } for _, a := range accounts { if a.ExternalID == awsAccountID { return a.ID, nil } } - return "", nil + // Resolved-but-unregistered: AWS host has accounts, but the running + // account isn't one of them. Could be a misconfigured deployment, a + // fresh first-run before the operator added their own account, or + // (worst case) a host running in a different account than any + // registered tenant. Either way, returning "" here would skip the + // AccountIDs filter and leak other tenants' recs — fail closed. + return "", fmt.Errorf("running aws account %s is not registered in cloud_accounts; reshape scope cannot be resolved safely", awsAccountID) } diff --git a/internal/api/handler.go b/internal/api/handler.go index 94a2ea4e..88b5904c 100644 --- a/internal/api/handler.go +++ b/internal/api/handler.go @@ -519,20 +519,34 @@ func (h *Handler) resolveAWSAccountID(ctx context.Context) (string, error) { // the trust-policy snippet renderer to emit the correct ARN prefix in // AWS China and GovCloud deployments (issue #130c). // -// Return shape distinguishes the same three cases as resolveAWSAccountID: +// Return shape: // -// - ("", "", nil) — AWS SDK config could not load (Azure/GCP host). -// - ("", "", err) — config loaded but STS failed; security- -// sensitive callers MUST fail closed on this. +// - ("", "", nil) — host is non-AWS (sourceCloud() != "aws") +// AND AWS SDK config could not load. This is the legitimate +// "no AWS context" path for Azure/GCP-hosted deployments. +// - ("", "", err) — host is AWS but the SDK config could +// not load, OR config loaded but STS GetCallerIdentity failed. +// Both are real failures; security-sensitive callers MUST fail +// closed on this. // - (accountID, partition, nil) — success. partition may still be "" if the // ARN was malformed (frontend defaults to "aws"). +// +// The sourceCloud() check on the config-load path prevents an +// AWS-hosted deployment from being silently treated as "no AWS context" +// when its own SDK config breaks — that would degrade the multi-tenant +// scope filter in resolveAWSCloudAccountID into an unscoped read. func (h *Handler) resolveAWSCallerIdentity(ctx context.Context) (string, string, error) { h.awsCfgOnce.Do(func() { h.awsCfg, h.awsCfgErr = awsconfig.LoadDefaultConfig(ctx) }) if h.awsCfgErr != nil { - // AWS context genuinely unavailable — not an error from this - // caller's perspective (Azure/GCP host runs land here). + if sourceCloud() == "aws" { + // AWS host but SDK config broken: real failure. Surface + // the error so security-sensitive callers fail closed. + return "", "", fmt.Errorf("aws sdk config load: %w", h.awsCfgErr) + } + // Azure/GCP host: AWS context is legitimately absent, not an + // error from this caller's perspective. return "", "", nil } client := sts.NewFromConfig(h.awsCfg) From 37c057198f63951194d576129cb4297da82269c9 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Tue, 28 Apr 2026 00:34:23 +0200 Subject: [PATCH 07/12] fix(api): per-instance cost normalization + integration test STS bypass (#79) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two more CodeRabbit findings on review pass 4: 1. **Per-instance cost normalization in `recommendationToOffering`** AWS Cost Explorer returns `MonthlyCost` and `UpfrontCost` as totals for the recommended `Count` of instances, not per-instance. The reshape layer compares `OfferingOption.EffectiveMonthlyCost` against `RIInfo.MonthlyCost` which is per-instance, so previously a recommendation suggesting 10 of an instance would surface with 10x the per-instance price and rank artificially low. Fix: divide `MonthlyCost` and `UpfrontCost` by `rec.Count` before building the `OfferingOption`. Guard against `rec.Count == 0` by falling back to count=1 (practically rare since the scheduler only stores recs with `Count >= 1`). 2. **Integration test STS bypass** `TestReshapeRecommendations_Integration_EndToEnd` calls `getReshapeRecommendations`, which now invokes `resolveAWSCloudAccountID` → `sts.GetCallerIdentity`. Pre-populating `h.awsCfg` with a minimal `aws.Config{Region: "us-east-1"}` keeps `LoadDefaultConfig` from running but leaves the STS call live — which fails on dev / CI machines without AWS credentials. Fix: add a `reshapeAccountResolver func(ctx) (string, error)` injection point on `Handler`, mirroring the existing `reshapeEC2Factory` / `reshapeRecsFactory` pattern. Production leaves it nil; tests wire a fixed-result fake. The test now uses `func(_ context.Context) (string, error) { return "", nil }` — the legitimate "no scope filter" path, which matches the test's existing expectation that all seeded recs are visible. Tests: `go build ./internal/api/...` clean, full unit-test suite green. Integration tests need Docker for testcontainers; can't run locally, but the build proves the seam compiles. --- internal/api/exchange_lookup.go | 15 +++++++++++++-- internal/api/handler.go | 8 ++++++++ internal/api/handler_ri_exchange.go | 8 +++++++- .../api/handler_ri_exchange_integration_test.go | 6 ++++++ 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/internal/api/exchange_lookup.go b/internal/api/exchange_lookup.go index 84785e32..818174ed 100644 --- a/internal/api/exchange_lookup.go +++ b/internal/api/exchange_lookup.go @@ -71,13 +71,24 @@ func purchaseRecLookupFromStore(store recsLister, accountID string) exchange.Pur // in pkg/exchange.fillAlternativesFromRecs can compare apples-to-apples // against RIInfo.TermSeconds populated from ec2.ConvertibleRI.Duration. func recommendationToOffering(rec config.RecommendationRecord, currencyCode string) exchange.OfferingOption { - monthly := rec.MonthlyCost + // AWS Cost Explorer returns MonthlyCost and UpfrontCost as totals for + // the recommended Count of instances, not per-instance. The reshape + // layer compares OfferingOption.EffectiveMonthlyCost against + // RIInfo.MonthlyCost which is per-instance, so we normalize here. + // rec.Count == 0 is treated as "unknown count, fall back to total" + // rather than dividing by zero — practically rare since the scheduler + // only stores recs with Count >= 1. + count := float64(1) + if rec.Count > 0 { + count = float64(rec.Count) + } + monthly := rec.MonthlyCost / count if rec.Term > 0 { // rec.Term is in years; canonical AWS RI/SP amortisation uses // 12 months per year regardless of leap years. termMonths := float64(rec.Term * 12) if termMonths > 0 { - monthly += rec.UpfrontCost / termMonths + monthly += rec.UpfrontCost / (count * termMonths) } } _, size := splitInstanceType(rec.ResourceType) diff --git a/internal/api/handler.go b/internal/api/handler.go index 88b5904c..86f6dc82 100644 --- a/internal/api/handler.go +++ b/internal/api/handler.go @@ -70,6 +70,14 @@ type Handler struct { reshapeEC2Factory func(aws.Config) reshapeEC2Client reshapeRecsFactory func(aws.Config) reshapeRecsClient + // Optional account-resolver injection point used by the reshape + // handler integration test. When nil (the production default), the + // handler calls h.resolveAWSCloudAccountID which in turn invokes + // sts.GetCallerIdentity — fine in Lambda but fails on dev machines + // without AWS credentials. Tests set this to a fixed-result fake so + // the integration suite runs hermetically. + reshapeAccountResolver func(context.Context) (string, error) + // commitmentOpts discovers which AWS (term, payment) combinations // each service actually sells and validates saves against that data. // Nil is valid: the endpoint returns unavailable and save-side diff --git a/internal/api/handler_ri_exchange.go b/internal/api/handler_ri_exchange.go index 3ea4db67..ecf2c4f6 100644 --- a/internal/api/handler_ri_exchange.go +++ b/internal/api/handler_ri_exchange.go @@ -267,7 +267,13 @@ func (h *Handler) getReshapeRecommendations(ctx context.Context, req *events.Lam // CloudAccount registration hasn't happened yet; a real ListCloudAccounts // error aborts the request instead of silently falling through to an // unscoped query that could match the wrong tenant's recs. - cloudAccountID, err := h.resolveAWSCloudAccountID(ctx) + resolveAccount := h.resolveAWSCloudAccountID + if h.reshapeAccountResolver != nil { + // Test injection — bypasses sts.GetCallerIdentity so the + // integration suite runs without AWS credentials. + resolveAccount = h.reshapeAccountResolver + } + cloudAccountID, err := resolveAccount(ctx) if err != nil { return nil, fmt.Errorf("failed to resolve cloud account scope for reshape: %w", err) } diff --git a/internal/api/handler_ri_exchange_integration_test.go b/internal/api/handler_ri_exchange_integration_test.go index 8017eb37..871c7ac5 100644 --- a/internal/api/handler_ri_exchange_integration_test.go +++ b/internal/api/handler_ri_exchange_integration_test.go @@ -67,6 +67,12 @@ func buildReshapeHandler(store *config.PostgresStore, ec2Fake *fakeReshapeEC2, r apiKey: "test-api-key", reshapeEC2Factory: func(_ aws.Config) reshapeEC2Client { return ec2Fake }, reshapeRecsFactory: func(_ aws.Config) reshapeRecsClient { return recsFake }, + // Bypass STS GetCallerIdentity so the test runs without real + // AWS credentials. Empty cloud-account ID = no AccountIDs + // filter on the recs lookup, which is the legitimate + // "no-scope-filter" path; tests that assert scope filtering + // would set this to a UUID matching a seeded CloudAccount. + reshapeAccountResolver: func(_ context.Context) (string, error) { return "", nil }, } // Pre-populate the AWS config cache so loadAWSConfigWithRegion // returns immediately without LoadDefaultConfig. The Region field From 62667c8d214793ff1997f80560de96b45c29ed48 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Tue, 28 Apr 2026 01:04:11 +0200 Subject: [PATCH 08/12] fix(api): loadAPIKey uses local AWS config (no shared sync.Once poisoning) (#79) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit review pass 5 inline finding: `Handler.loadAPIKey` shares `awsCfgOnce`/`awsCfgErr` with the request- path identity resolver. NewHandler runs loadAPIKey at cold-start with a 5-second deadline. If that deadline fires, the shared sync.Once seals `awsCfgErr = "context deadline exceeded"` for the entire Lambda container lifetime — and after #79's fail-closed STS error propagation, that means the multi-tenant reshape scope filter is permanently broken until the container recycles. Fix: loadAPIKey now does a local `awsconfig.LoadDefaultConfig(ctx)` call instead of writing to the shared cache. The cold-start timeout stays scoped to that one call; the request-path cache is loaded fresh on first identity-resolver invocation. Pre-existing bug (loadAPIKey has shared the cache since the original shared-config refactor), but #79's fail-closed change made it materially worse: a once-poisoned cache used to silently degrade to "no AWS context" (unscoped read), which is now correctly an error. The cure is to stop poisoning the cache. Skipped the test-coverage nitpick (CodeRabbit asked for a scoped- account companion integration test). That work overlaps with the broader regression-tests follow-up filed as #151 and is deferred there to keep this PR's scope focused. Tests: full internal/api unit suite green; build clean. --- internal/api/handler.go | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/internal/api/handler.go b/internal/api/handler.go index 86f6dc82..cbb797d9 100644 --- a/internal/api/handler.go +++ b/internal/api/handler.go @@ -401,20 +401,26 @@ func (h *Handler) buildResponse(statusCode int, headers map[string]string, body } // loadAPIKey retrieves the API key from Secrets Manager. -// It caches the base AWS config via awsCfgOnce to avoid redundant config loads. +// +// Uses a local AWS config load — does NOT share the request-path +// awsCfgOnce/awsCfgErr cache. Reason: NewHandler runs loadAPIKey at +// cold-start with a 5-second deadline. If that deadline fires, the +// shared sync.Once would seal awsCfgErr = "context deadline exceeded" +// for the entire Lambda lifetime, permanently breaking the request-path +// resolveAWSCallerIdentity call (and the multi-tenant scope filter that +// depends on it). Loading config locally keeps the cold-start timeout +// scoped to this one call. func (h *Handler) loadAPIKey(ctx context.Context) (string, error) { if h.secretsARN == "" { return "", nil } - h.awsCfgOnce.Do(func() { - h.awsCfg, h.awsCfgErr = awsconfig.LoadDefaultConfig(ctx) - }) - if h.awsCfgErr != nil { - return "", h.awsCfgErr + cfg, err := awsconfig.LoadDefaultConfig(ctx) + if err != nil { + return "", err } - client := secretsmanager.NewFromConfig(h.awsCfg) + client := secretsmanager.NewFromConfig(cfg) result, err := client.GetSecretValue(ctx, &secretsmanager.GetSecretValueInput{ SecretId: &h.secretsARN, }) From 9d1cba17341829a12407fec89a8087eb4d4d4a72 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Tue, 28 Apr 2026 01:14:53 +0200 Subject: [PATCH 09/12] test(api): regression test for loadAPIKey shared-cache poisoning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit pass-5 follow-up: 62667c8d2 fixed loadAPIKey so it stops sharing h.awsCfgOnce/h.awsCfgErr with the request-path identity resolver, but the matching regression test was deferred. Adding it here. TestLoadAPIKey_DoesNotPoisonSharedAWSConfigCache asserts logical isolation rather than trying to provoke a config-load failure (LoadDefaultConfig itself rarely errs — credential failures surface later at the SDK call). Two halves per subtest: 1. h.awsCfgErr stays nil regardless of loadAPIKey's outcome — the shared error field must not be written at all. 2. h.awsCfgOnce is still virgin after loadAPIKey returns. Verified with a sentinel sync.Once.Do trick: Do only runs the func if it has not yet been called, so if loadAPIKey had touched the shared Once, the sentinel would no-op and the assertion would fail. Two table-style subtests cover both branches of the loadAPIKey path (empty ARN short-circuit, and non-empty ARN that goes through awsconfig.LoadDefaultConfig). AWS env vars are pinned via t.Setenv so the SDK's default config resolution stays deterministic in CI without hitting ambient credentials. If a future refactor reintroduces h.awsCfgOnce.Do inside loadAPIKey, the second subtest fails on the `fired` assertion — preventing the multi-tenant reshape scope filter from being permanently broken by a transient cold-start AWS config failure. --- internal/api/handler_test.go | 105 +++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/internal/api/handler_test.go b/internal/api/handler_test.go index 93140249..1eed957a 100644 --- a/internal/api/handler_test.go +++ b/internal/api/handler_test.go @@ -3,6 +3,7 @@ package api import ( "context" "encoding/json" + "sync" "testing" "time" @@ -65,6 +66,110 @@ func TestHandler_loadAPIKey_EmptyARN(t *testing.T) { assert.Empty(t, key) } +// TestLoadAPIKey_DoesNotPoisonSharedAWSConfigCache is a regression test +// for CodeRabbit pass-5 finding 1: loadAPIKey used to share +// h.awsCfgOnce/h.awsCfgErr with the request-path identity resolver, so a +// transient cold-start failure (NewHandler runs loadAPIKey under a 5s +// deadline) could permanently seal awsCfgErr for the entire Lambda +// container lifetime — and after the fail-closed STS error propagation +// landed, that meant the multi-tenant reshape scope filter would be +// permanently broken until the container recycled. +// +// The fix (commit 62667c8d2) made loadAPIKey load AWS config locally and +// stop touching the shared sync.Once. This test asserts that logical +// isolation rather than trying to provoke a config-load failure +// (LoadDefaultConfig itself rarely errs — credential failures surface +// later at the SDK call). Two halves: +// +// 1. h.awsCfgErr stays nil regardless of loadAPIKey's outcome — the +// shared error field must not be written at all. +// 2. h.awsCfgOnce is still virgin after loadAPIKey returns. Verified +// by having a sentinel sync.Once.Do fire (Do only runs the func if +// it has not yet been called). If loadAPIKey had used the shared +// Once, our sentinel call would no-op and the test would fail. +func TestLoadAPIKey_DoesNotPoisonSharedAWSConfigCache(t *testing.T) { + // Force a deterministic AWS config environment so the test does not + // depend on the dev machine's ambient credentials. The actual outcome + // of the GetSecretValue call is irrelevant — we only assert that the + // shared cache is left untouched on the way through. We CLEAR rather + // than set fake credential env-vars: setting an `AKIA…` shaped value + // trips the repo's git-secrets pre-commit scanner, and clearing them + // (combined with /dev/null shared-config files) is enough to keep + // LoadDefaultConfig deterministic — it will resolve to "no + // credentials" without hitting any user's ambient profile. + t.Setenv("AWS_REGION", "us-east-1") + t.Setenv("AWS_ACCESS_KEY_ID", "") + t.Setenv("AWS_SECRET_ACCESS_KEY", "") + t.Setenv("AWS_SESSION_TOKEN", "") + // Disable any locally-configured shared config / profile so the SDK + // never tries to read ~/.aws/config in CI. + t.Setenv("AWS_SDK_LOAD_CONFIG", "0") + t.Setenv("AWS_CONFIG_FILE", "/dev/null") + t.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/dev/null") + t.Setenv("AWS_PROFILE", "") + + cases := []struct { + name string + secretsARN string + }{ + { + name: "empty ARN short-circuits before any AWS work", + secretsARN: "", + }, + { + name: "non-empty ARN goes through awsconfig.LoadDefaultConfig", + // Syntactically valid but unresolvable ARN. The SecretsManager + // call will fail (no network / fake creds), but the failure + // must NOT touch the shared awsCfg* fields. + secretsARN: "arn:aws:secretsmanager:us-east-1:000000000000:secret:nonexistent-XXX", + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + h := &Handler{secretsARN: tc.secretsARN} + + // Bound the call so a hung credential-resolution attempt + // can't stall the test. We don't care about the outcome. + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + _, _ = h.loadAPIKey(ctx) + + // Half 1: the shared error field must be untouched. + assert.NoError(t, h.awsCfgErr, + "loadAPIKey must not write to h.awsCfgErr; doing so poisons the request-path identity resolver for the rest of the handler's lifetime") + + // Half 2: the shared sync.Once must still be virgin. If + // loadAPIKey had called h.awsCfgOnce.Do, our sentinel.Do + // below would no-op and `fired` would stay false. + fired := false + h.awsCfgOnce.Do(func() { fired = true }) + assert.True(t, fired, + "loadAPIKey must not call h.awsCfgOnce.Do; the request-path resolver owns that Once and a transient loadAPIKey failure must not seal it") + + // And nothing we did inside loadAPIKey should have populated + // the cached config — only the request-path resolver does. + // The Region we just set via our own Do above is the test's + // sentinel write, not loadAPIKey's. Compare structurally: + // loadAPIKey would have called LoadDefaultConfig and written + // a fully-populated config; the sentinel branch wrote zero. + assert.Empty(t, h.awsCfg.Region, + "loadAPIKey must not populate h.awsCfg; that field is owned by the request-path resolver") + }) + } + + // Sanity: an independent Once on a fresh handler still works + // normally — i.e. the test's sentinel trick isn't mutating package + // state in a way that breaks the production call site. This guards + // against a future refactor that accidentally shares the Once at + // package scope. + var probe sync.Once + probeFired := false + probe.Do(func() { probeFired = true }) + require.True(t, probeFired, "sanity: a fresh sync.Once must fire on first Do") +} + // Tests moved from handler_router_test.go - these test HandleRequest routing logic func TestHandler_HandleRequest_CORS_Preflight(t *testing.T) { From e775264db32c53aba7940e026a79529583b923e3 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Tue, 28 Apr 2026 01:43:59 +0200 Subject: [PATCH 10/12] test(api): scoped-account integration test for reshape recs lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit pass-5 finding 2: the existing reshape integration tests all wire reshapeAccountResolver to ("", nil), so only the unscoped "no AccountIDs filter" path had coverage. The scoped branch — which is the multi-tenant cross-account leak guard — was untested. A regression that silently dropped the AccountIDs filter (e.g. a future refactor reintroducing the resolved-but-unregistered fall- through bug) would have shipped undetected. This commit adds TestReshapeRecommendations_Integration_ScopedAccount_FiltersToAccount: 1. Seeds two CloudAccounts (Tenant A + Tenant B) in the test Postgres container so the cloud_account_id FK on the recommendations table is satisfied for both rows. 2. Seeds two cross-family recs in the same region — A's c5.large and B's r5.large — each tagged with its account's UUID. 3. Overrides h.reshapeAccountResolver to return Tenant A's UUID (the production equivalent of resolveAWSCloudAccountID returning a registered match). 4. Asserts the response surfaces ONLY c5.large in alternatives, never r5.large — proving the AccountIDs predicate landed on the SQL query rather than being dropped. Also tightens the EndToEnd test's same-family seed: was m5.large (same SIZE as the primary target, which trivial exact-match dedupe would also strip), now m5.2xlarge (different size, exercising the FAMILY-exclusion contract that's actually under test). A new seedCloudAccount helper keeps the new test readable and reusable for any future scoped-branch coverage. Existing tests are untouched beyond the same-family seed clarification. Tests: go test ./... green at root + pkg/. The new integration test follows the same setupReshapeHandlerIntegration / docker- based testcontainers pattern as its siblings, so it runs under the same CI gate. --- .../handler_ri_exchange_integration_test.go | 139 +++++++++++++++++- 1 file changed, 134 insertions(+), 5 deletions(-) diff --git a/internal/api/handler_ri_exchange_integration_test.go b/internal/api/handler_ri_exchange_integration_test.go index 871c7ac5..bb6a09a5 100644 --- a/internal/api/handler_ri_exchange_integration_test.go +++ b/internal/api/handler_ri_exchange_integration_test.go @@ -6,6 +6,7 @@ package api import ( "context" "encoding/json" + "strings" "sync/atomic" "testing" "time" @@ -116,11 +117,15 @@ func TestReshapeRecommendations_Integration_EndToEnd(t *testing.T) { store, cleanup := setupReshapeHandlerIntegration(ctx, t) defer cleanup() - // Seed cross-family recs in us-east-1: m5 (same family, must be - // excluded), c5, r5. The handler should surface c5 + r5 as - // cross-family alternatives ordered by EffectiveMonthlyCost. + // Seed cross-family recs in us-east-1. Same-family seed uses a + // different SIZE (m5.2xlarge) than the primary target (m5.large) + // so the test exercises the FAMILY-exclusion filter rather than + // trivial exact-match dedupe. Handler should surface c5 + r5 as + // cross-family alternatives ordered by EffectiveMonthlyCost; + // m5.2xlarge must NOT appear despite being a different size — + // same-family RIs aren't valid Convertible exchange targets. seedRecsForRegion(ctx, t, store, "us-east-1", []config.RecommendationRecord{ - {Provider: "aws", Service: "ec2", Region: "us-east-1", ResourceType: "m5.large", Term: 1, MonthlyCost: 40}, + {Provider: "aws", Service: "ec2", Region: "us-east-1", ResourceType: "m5.2xlarge", Term: 1, MonthlyCost: 40}, {Provider: "aws", Service: "ec2", Region: "us-east-1", ResourceType: "c5.large", Term: 1, MonthlyCost: 50}, {Provider: "aws", Service: "ec2", Region: "us-east-1", ResourceType: "r5.large", Term: 1, MonthlyCost: 60}, }) @@ -148,11 +153,17 @@ func TestReshapeRecommendations_Integration_EndToEnd(t *testing.T) { // Same-family m5 stripped; c5 + r5 surface ascending by cost. require.Len(t, rec.AlternativeTargets, 2, - "cross-family alternatives must come from the cached recs (m5 same-family excluded)") + "cross-family alternatives must come from the cached recs (same-family m5.2xlarge excluded)") require.Equal(t, "c5.large", rec.AlternativeTargets[0].InstanceType) require.InDelta(t, 50.0, rec.AlternativeTargets[0].EffectiveMonthlyCost, 0.001) require.Equal(t, "r5.large", rec.AlternativeTargets[1].InstanceType) require.InDelta(t, 60.0, rec.AlternativeTargets[1].EffectiveMonthlyCost, 0.001) + // Independent regression guard: no m5.* of any size should appear + // in alternatives — pins the family-exclusion contract. + for _, alt := range rec.AlternativeTargets { + require.False(t, strings.HasPrefix(alt.InstanceType, "m5."), + "same-family m5.* alternatives must be excluded, got %s", alt.InstanceType) + } // Verify the RI utilization cache row landed in Postgres. entry, err := store.GetRIUtilizationCache(ctx, "us-east-1", 30) @@ -238,3 +249,121 @@ func TestReshapeRecommendations_Integration_NoCachedRecsReturnsPrimaryOnly(t *te require.Empty(t, rec.AlternativeTargets, "empty cache → no alternatives, primary target still surfaced") } + +// seedCloudAccount inserts a minimal CloudAccount row and returns its +// generated UUID. Used by the scoped-account integration test so the +// recommendations table can carry a cloud_account_id that satisfies the +// FK to cloud_accounts(id). All fields beyond Provider / ExternalID / +// Name are left at their zero value — the recs scope filter joins only +// on cloud_account_id, so credentials and auth-mode plumbing are +// irrelevant for this test. +func seedCloudAccount(ctx context.Context, t *testing.T, store *config.PostgresStore, externalID, name string) string { + t.Helper() + account := &config.CloudAccount{ + Name: name, + Provider: "aws", + ExternalID: externalID, + Enabled: true, + } + require.NoError(t, store.CreateCloudAccount(ctx, account), + "seeding CloudAccount %q failed", externalID) + require.NotEmpty(t, account.ID, "CreateCloudAccount must generate a UUID") + return account.ID +} + +// TestReshapeRecommendations_Integration_ScopedAccount_FiltersToAccount +// is the companion to the existing unscoped integration tests +// (CodeRabbit pass-5 finding 2). The other tests all wire +// reshapeAccountResolver to ("", nil), exercising only the unscoped +// "no AccountIDs filter" path. This test exercises the scoped branch: +// +// 1. Seed two CloudAccounts (A and B) so the cloud_account_id FK on +// the recommendations table is satisfied for both rows. +// 2. Seed two recs in the same region — A's is c5.large, B's is +// r5.large — each tagged with its account's UUID. +// 3. Override reshapeAccountResolver to return A's UUID, simulating +// the production path where the running AWS account maps to one +// registered CloudAccount via h.resolveAWSCloudAccountID. +// 4. Assert the response only surfaces c5.large in alternatives, not +// r5.large — proving the recs query was scoped by AccountIDs +// rather than served as an unscoped read across both tenants. +// +// This is the cross-tenant leak guard the rest of the reshape path +// relies on. Without this test, the scoped branch had zero coverage — +// a regression that silently dropped the AccountIDs filter (e.g. a +// future refactor reintroducing the resolved-but-unregistered fall- +// through bug) would have shipped undetected. +func TestReshapeRecommendations_Integration_ScopedAccount_FiltersToAccount(t *testing.T) { + ctx := context.Background() + store, cleanup := setupReshapeHandlerIntegration(ctx, t) + defer cleanup() + + // Two registered accounts, distinct AWS account numbers. The recs + // scope filter compares against the CloudAccount UUID (not the AWS + // external ID), so the test resolver just hands back the UUID. + accountAID := seedCloudAccount(ctx, t, store, "111111111111", "Tenant A") + accountBID := seedCloudAccount(ctx, t, store, "222222222222", "Tenant B") + + // Seed one cross-family rec per tenant in the same region. If the + // scope filter is honoured, only Tenant A's c5.large surfaces; + // otherwise both alternatives leak through (the regression we're + // guarding against). + require.NoError(t, store.ReplaceRecommendations(ctx, time.Now(), []config.RecommendationRecord{ + { + Provider: "aws", + Service: "ec2", + Region: "us-east-1", + ResourceType: "c5.large", + Term: 1, + MonthlyCost: 50, + CloudAccountID: &accountAID, + }, + { + Provider: "aws", + Service: "ec2", + Region: "us-east-1", + ResourceType: "r5.large", + Term: 1, + MonthlyCost: 60, + CloudAccountID: &accountBID, + }, + }), "seeding scoped recommendations failed") + + ec2Fake := &fakeReshapeEC2{ + instances: []ec2svc.ConvertibleRI{ + {ReservedInstanceID: "ri-1", InstanceType: "m5.xlarge", InstanceCount: 1}, + }, + } + recsFake := &fakeReshapeRecs{ + utilization: []recommendations.RIUtilization{ + {ReservedInstanceID: "ri-1", UtilizationPercent: 50.0}, + }, + } + h := buildReshapeHandler(store, ec2Fake, recsFake) + // Override the unscoped default the helper sets: pretend the + // running AWS account resolves to Tenant A's CloudAccount UUID. + // Equivalent to h.resolveAWSCloudAccountID returning a real match. + h.reshapeAccountResolver = func(_ context.Context) (string, error) { + return accountAID, nil + } + + resp, err := h.getReshapeRecommendations(ctx, reshapeRequest()) + require.NoError(t, err) + body, ok := resp.(*ReshapeRecommendationsResponse) + require.True(t, ok, "response should be a ReshapeRecommendationsResponse") + require.Len(t, body.Recommendations, 1) + rec := body.Recommendations[0] + require.Equal(t, "ri-1", rec.SourceRIID) + + // Tenant A's c5.large is the only valid alternative; Tenant B's + // r5.large MUST be filtered out by the AccountIDs predicate. + require.Len(t, rec.AlternativeTargets, 1, + "scoped lookup must surface exactly one alternative — Tenant A's c5.large; "+ + "if Tenant B's r5.large appears, the cloud_account_id filter regressed "+ + "and recs are leaking across tenants") + require.Equal(t, "c5.large", rec.AlternativeTargets[0].InstanceType) + for _, alt := range rec.AlternativeTargets { + require.NotEqual(t, "r5.large", alt.InstanceType, + "r5.large belongs to Tenant B and must never surface for a Tenant-A-scoped request") + } +} From b877b80cf139e8fa23ce65f659d212ec34446712 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Tue, 28 Apr 2026 01:47:01 +0200 Subject: [PATCH 11/12] fix(api): short-circuit resolveAWSCallerIdentity on non-AWS hosts (#79) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit pass-6 inline finding: resolveAWSCallerIdentity used to call LoadDefaultConfig + STS regardless of host, then check sourceCloud() only to decide whether the failure was real. That wasted a config-load attempt on every reshape request from an Azure/GCP-hosted Lambda — pure overhead since the function will return ("", "", nil) anyway. Move the sourceCloud() check to the top of the function. Non-AWS hosts return immediately, no AWS SDK work attempted. AWS hosts with a broken SDK config continue to fail closed with the wrapped error so the multi-tenant scope filter in resolveAWSCloudAccountID doesn't degrade into an unscoped read. Comment block updated to make the short-circuit policy explicit. Tests: full internal/api unit suite green; build clean. --- internal/api/handler.go | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/internal/api/handler.go b/internal/api/handler.go index cbb797d9..83980e97 100644 --- a/internal/api/handler.go +++ b/internal/api/handler.go @@ -545,23 +545,24 @@ func (h *Handler) resolveAWSAccountID(ctx context.Context) (string, error) { // - (accountID, partition, nil) — success. partition may still be "" if the // ARN was malformed (frontend defaults to "aws"). // -// The sourceCloud() check on the config-load path prevents an -// AWS-hosted deployment from being silently treated as "no AWS context" -// when its own SDK config breaks — that would degrade the multi-tenant -// scope filter in resolveAWSCloudAccountID into an unscoped read. +// Non-AWS hosts (Azure/GCP) short-circuit before any AWS SDK work — +// no point burning a LoadDefaultConfig attempt + STS client on every +// reshape request from a host that genuinely has no AWS context. AWS +// hosts with a broken SDK config surface the load error so the +// multi-tenant scope filter in resolveAWSCloudAccountID fails closed +// instead of degrading into an unscoped read. func (h *Handler) resolveAWSCallerIdentity(ctx context.Context) (string, string, error) { + if sourceCloud() != "aws" { + // Azure/GCP host: short-circuit before any AWS SDK work. + return "", "", nil + } h.awsCfgOnce.Do(func() { h.awsCfg, h.awsCfgErr = awsconfig.LoadDefaultConfig(ctx) }) if h.awsCfgErr != nil { - if sourceCloud() == "aws" { - // AWS host but SDK config broken: real failure. Surface - // the error so security-sensitive callers fail closed. - return "", "", fmt.Errorf("aws sdk config load: %w", h.awsCfgErr) - } - // Azure/GCP host: AWS context is legitimately absent, not an - // error from this caller's perspective. - return "", "", nil + // AWS host but SDK config broken: real failure. Surface the + // error so security-sensitive callers fail closed. + return "", "", fmt.Errorf("aws sdk config load: %w", h.awsCfgErr) } client := sts.NewFromConfig(h.awsCfg) identity, err := client.GetCallerIdentity(ctx, &sts.GetCallerIdentityInput{}) From fa371a42c4796cef96e68101a5a273582dd17a83 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Tue, 28 Apr 2026 02:04:30 +0200 Subject: [PATCH 12/12] test(api): disable IMDS in loadAPIKey shared-cache regression test (#79) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit pass-7 nitpick: TestLoadAPIKey_DoesNotPoisonSharedAWSConfigCache cleared the credential env vars but left IMDS reachable. On CI runners or dev EC2 hosts where the metadata service is reachable, the SDK credential chain probes IMDS and waits the full retry window before falling through to "no credentials" — making the test slow and host-dependent. Add t.Setenv("AWS_EC2_METADATA_DISABLED", "true") so the chain short-circuits past IMDS and the test outcome depends only on the static env-var configuration we control. Test still passes; behaviour assertion (the shared cache stays untouched) is unchanged — this just makes the path deterministic. --- internal/api/handler_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/internal/api/handler_test.go b/internal/api/handler_test.go index 1eed957a..82c1e022 100644 --- a/internal/api/handler_test.go +++ b/internal/api/handler_test.go @@ -107,6 +107,12 @@ func TestLoadAPIKey_DoesNotPoisonSharedAWSConfigCache(t *testing.T) { t.Setenv("AWS_CONFIG_FILE", "/dev/null") t.Setenv("AWS_SHARED_CREDENTIALS_FILE", "/dev/null") t.Setenv("AWS_PROFILE", "") + // Disable IMDS so the SDK doesn't probe EC2 instance metadata on + // hosts where it's reachable (CI runners, dev EC2). Without this, + // the credential provider chain can hang for the full IMDS retry + // window before falling through to "no credentials" — making the + // test slow and host-dependent. (CodeRabbit nit on PR #79.) + t.Setenv("AWS_EC2_METADATA_DISABLED", "true") cases := []struct { name string