From 8a8bc3582f06086f274d1293388bce8d924cccb3 Mon Sep 17 00:00:00 2001 From: nash Date: Sat, 14 Feb 2026 09:30:36 +0000 Subject: [PATCH] feat(k8s/cluster): add retry logic with exponential backoff to EKS provider Add retry logic to runAWS and runEksctl methods matching the pattern in GKE provider. This improves resilience for transient AWS API errors. Changes: - Add isRetryableError method to detect retryable AWS errors (throttling, timeouts, service unavailable, connection issues) - Add errorHint method to provide helpful guidance for common errors - Update runAWS with retry loop using exponential backoff (200ms, 500ms, 1200ms) - Update runEksctl with same retry pattern - Add AWS CLI presence check before running commands - Add comprehensive tests for both isRetryableError and errorHint Retryable error categories: - Throttling and rate limit errors - Timeout and deadline exceeded errors - Service unavailable and internal errors - Connection reset/refused errors - AWS-specific transient errors (RequestLimitExceeded, etc.) Refs #71 Co-Authored-By: Claude Opus 4.6 --- internal/k8s/cluster/eks.go | 173 +++++++++++++++++++--- internal/k8s/cluster/eks_test.go | 241 +++++++++++++++++++++++++++++++ 2 files changed, 396 insertions(+), 18 deletions(-) diff --git a/internal/k8s/cluster/eks.go b/internal/k8s/cluster/eks.go index 9e90e98..8dc36b1 100644 --- a/internal/k8s/cluster/eks.go +++ b/internal/k8s/cluster/eks.go @@ -1037,36 +1037,173 @@ func (p *EKSProvider) hasEksctl() bool { } func (p *EKSProvider) runEksctl(ctx context.Context, args ...string) (string, error) { - cmd := exec.CommandContext(ctx, "eksctl", args...) - cmd.Env = os.Environ() + backoffs := []time.Duration{200 * time.Millisecond, 500 * time.Millisecond, 1200 * time.Millisecond} + var lastErr error + var lastStderr string - var stdout, stderr bytes.Buffer - cmd.Stdout = &stdout - cmd.Stderr = &stderr + for attempt := 0; attempt <= len(backoffs); attempt++ { + cmd := exec.CommandContext(ctx, "eksctl", args...) + cmd.Env = os.Environ() - err := cmd.Run() - if err != nil { - return "", fmt.Errorf("eksctl command failed: %w, stderr: %s", err, stderr.String()) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + if err == nil { + return stdout.String(), nil + } + + lastErr = err + lastStderr = strings.TrimSpace(stderr.String()) + + if ctx.Err() != nil { + break + } + + if !p.isRetryableError(lastStderr) { + break + } + + if attempt < len(backoffs) { + time.Sleep(backoffs[attempt]) + } + } + + if lastErr == nil { + return "", fmt.Errorf("eksctl command failed") } - return stdout.String(), nil + return "", fmt.Errorf("eksctl command failed: %w, stderr: %s%s", lastErr, lastStderr, p.errorHint(lastStderr)) } func (p *EKSProvider) runAWS(ctx context.Context, args ...string) (string, error) { + if _, err := exec.LookPath("aws"); err != nil { + return "", fmt.Errorf("aws CLI not found in PATH (hint: install AWS CLI v2)") + } + // Add no-cli-pager to prevent interactive output args = append(args, "--no-cli-pager") - cmd := exec.CommandContext(ctx, "aws", args...) - cmd.Env = os.Environ() + backoffs := []time.Duration{200 * time.Millisecond, 500 * time.Millisecond, 1200 * time.Millisecond} + var lastErr error + var lastStderr string - var stdout, stderr bytes.Buffer - cmd.Stdout = &stdout - cmd.Stderr = &stderr + for attempt := 0; attempt <= len(backoffs); attempt++ { + cmd := exec.CommandContext(ctx, "aws", args...) + cmd.Env = os.Environ() - err := cmd.Run() - if err != nil { - return "", fmt.Errorf("aws command failed: %w, stderr: %s", err, stderr.String()) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + if err == nil { + return stdout.String(), nil + } + + lastErr = err + lastStderr = strings.TrimSpace(stderr.String()) + + if ctx.Err() != nil { + break + } + + if !p.isRetryableError(lastStderr) { + break + } + + if attempt < len(backoffs) { + time.Sleep(backoffs[attempt]) + } + } + + if lastErr == nil { + return "", fmt.Errorf("aws command failed") + } + + return "", fmt.Errorf("aws command failed: %w, stderr: %s%s", lastErr, lastStderr, p.errorHint(lastStderr)) +} + +// isRetryableError determines if an AWS CLI error should be retried +func (p *EKSProvider) isRetryableError(stderr string) bool { + lower := strings.ToLower(stderr) + + // Throttling errors + if strings.Contains(lower, "throttling") || strings.Contains(lower, "rate exceeded") { + return true + } + if strings.Contains(lower, "too many requests") || strings.Contains(lower, "request limit exceeded") { + return true } - return stdout.String(), nil + // Timeout errors + if strings.Contains(lower, "timeout") || strings.Contains(lower, "timed out") { + return true + } + if strings.Contains(lower, "deadline exceeded") { + return true + } + + // Transient service errors + if strings.Contains(lower, "service unavailable") || strings.Contains(lower, "internal error") { + return true + } + if strings.Contains(lower, "temporarily unavailable") { + return true + } + if strings.Contains(lower, "connection reset") || strings.Contains(lower, "connection refused") { + return true + } + + // AWS-specific transient errors + if strings.Contains(lower, "requestlimitexceeded") { + return true + } + if strings.Contains(lower, "provisionedthroughputexceeded") { + return true + } + + return false +} + +// errorHint provides helpful guidance for common AWS CLI errors +func (p *EKSProvider) errorHint(stderr string) string { + lower := strings.ToLower(stderr) + switch { + case strings.Contains(lower, "accessdenied") || strings.Contains(lower, "access denied"): + return " (hint: check IAM permissions for EKS operations)" + case strings.Contains(lower, "authorizationerror") || strings.Contains(lower, "not authorized"): + return " (hint: IAM user/role lacks required permissions for this operation)" + case strings.Contains(lower, "invalidparameterexception") || strings.Contains(lower, "invalid parameter"): + return " (hint: check parameter values match AWS requirements)" + case strings.Contains(lower, "resourceinuseexception") || strings.Contains(lower, "resource in use"): + return " (hint: resource is currently in use or being modified)" + case strings.Contains(lower, "resourcelimitexceeded") || strings.Contains(lower, "limit exceeded"): + return " (hint: AWS service quota exceeded, request limit increase)" + case strings.Contains(lower, "clusteralreadyexists"): + return " (hint: cluster with this name already exists in the region)" + case strings.Contains(lower, "unable to locate credentials") || strings.Contains(lower, "no credentials"): + return " (hint: run 'aws configure' or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY)" + case strings.Contains(lower, "expiredtoken") || strings.Contains(lower, "token has expired"): + return " (hint: AWS session token expired, refresh credentials)" + case strings.Contains(lower, "invalid region") || strings.Contains(lower, "could not connect"): + return " (hint: check region name is valid, e.g., us-east-1, eu-west-1)" + // Check specific resource types before general "not found" + case strings.Contains(lower, "vpc") && strings.Contains(lower, "not found"): + return " (hint: VPC does not exist or is not accessible)" + case strings.Contains(lower, "subnet") && strings.Contains(lower, "not found"): + return " (hint: subnet does not exist or is not in the specified VPC)" + case strings.Contains(lower, "security group") && strings.Contains(lower, "not found"): + return " (hint: security group does not exist or is not accessible)" + case strings.Contains(lower, "role") && strings.Contains(lower, "not found"): + return " (hint: IAM role ARN is invalid or role does not exist)" + // General "not found" check should be after specific resource checks + case strings.Contains(lower, "resourcenotfoundexception") || strings.Contains(lower, "not found"): + return " (hint: the specified resource does not exist in this region)" + case strings.Contains(lower, "throttling") || strings.Contains(lower, "rate exceeded"): + return " (hint: API rate limit exceeded, try again shortly)" + default: + return "" + } } diff --git a/internal/k8s/cluster/eks_test.go b/internal/k8s/cluster/eks_test.go index 43f5933..14d9fe9 100644 --- a/internal/k8s/cluster/eks_test.go +++ b/internal/k8s/cluster/eks_test.go @@ -263,3 +263,244 @@ func TestProviderManagerIntegration(t *testing.T) { t.Error("EKS not found in provider list") } } + +func TestEKSProviderIsRetryableError(t *testing.T) { + provider := NewEKSProvider(EKSProviderOptions{ + Region: "us-east-1", + }) + + tests := []struct { + name string + stderr string + retryable bool + }{ + { + name: "throttling error", + stderr: "Throttling: Rate exceeded", + retryable: true, + }, + { + name: "too many requests", + stderr: "TooManyRequestsException: Too many requests", + retryable: true, + }, + { + name: "request limit exceeded", + stderr: "Request limit exceeded for this account", + retryable: true, + }, + { + name: "timeout error", + stderr: "Connection timeout while connecting to endpoint", + retryable: true, + }, + { + name: "timed out", + stderr: "Operation timed out", + retryable: true, + }, + { + name: "deadline exceeded", + stderr: "Deadline exceeded while waiting for response", + retryable: true, + }, + { + name: "service unavailable", + stderr: "Service Unavailable: The service is currently unavailable", + retryable: true, + }, + { + name: "internal error", + stderr: "InternalError: An internal error occurred", + retryable: true, + }, + { + name: "temporarily unavailable", + stderr: "The service is temporarily unavailable", + retryable: true, + }, + { + name: "connection reset", + stderr: "Connection reset by peer", + retryable: true, + }, + { + name: "connection refused", + stderr: "Connection refused", + retryable: true, + }, + { + name: "request limit exceeded exception", + stderr: "RequestLimitExceeded: Request limit exceeded", + retryable: true, + }, + { + name: "provisioned throughput exceeded", + stderr: "ProvisionedThroughputExceededException: Rate exceeded", + retryable: true, + }, + { + name: "access denied not retryable", + stderr: "AccessDenied: User is not authorized", + retryable: false, + }, + { + name: "resource not found not retryable", + stderr: "ResourceNotFoundException: cluster not found", + retryable: false, + }, + { + name: "invalid parameter not retryable", + stderr: "InvalidParameterException: invalid value", + retryable: false, + }, + { + name: "cluster already exists not retryable", + stderr: "ClusterAlreadyExists: cluster test-cluster already exists", + retryable: false, + }, + { + name: "no credentials not retryable", + stderr: "Unable to locate credentials", + retryable: false, + }, + { + name: "unknown error not retryable", + stderr: "Some unknown error occurred", + retryable: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := provider.isRetryableError(tt.stderr) + if result != tt.retryable { + t.Errorf("isRetryableError(%q) = %v, want %v", tt.stderr, result, tt.retryable) + } + }) + } +} + +func TestEKSProviderErrorHint(t *testing.T) { + provider := NewEKSProvider(EKSProviderOptions{ + Region: "us-east-1", + }) + + tests := []struct { + name string + stderr string + contains string + }{ + { + name: "access denied", + stderr: "AccessDenied: User is not authorized", + contains: "IAM permissions", + }, + { + name: "authorization error", + stderr: "AuthorizationError: not authorized to perform operation", + contains: "IAM user/role lacks required permissions", + }, + { + name: "resource not found", + stderr: "ResourceNotFoundException: cluster not found", + contains: "does not exist", + }, + { + name: "invalid parameter", + stderr: "InvalidParameterException: invalid value for region", + contains: "check parameter values", + }, + { + name: "resource in use", + stderr: "ResourceInUseException: cluster is being updated", + contains: "currently in use", + }, + { + name: "limit exceeded", + stderr: "ResourceLimitExceeded: maximum number of clusters reached", + contains: "quota exceeded", + }, + { + name: "cluster already exists", + stderr: "ClusterAlreadyExists: cluster test-cluster already exists", + contains: "already exists", + }, + { + name: "no credentials", + stderr: "Unable to locate credentials", + contains: "aws configure", + }, + { + name: "expired token", + stderr: "ExpiredToken: security token has expired", + contains: "expired", + }, + { + name: "invalid region", + stderr: "Invalid region: us-invalid-1", + contains: "check region name", + }, + { + name: "vpc not found", + stderr: "VPC vpc-123 not found", + contains: "VPC does not exist", + }, + { + name: "subnet not found", + stderr: "Subnet subnet-123 not found", + contains: "subnet does not exist", + }, + { + name: "security group not found", + stderr: "Security group sg-123 not found", + contains: "security group does not exist", + }, + { + name: "role not found", + stderr: "Role arn:aws:iam::123:role/invalid not found", + contains: "IAM role ARN is invalid", + }, + { + name: "throttling", + stderr: "Throttling: Rate exceeded", + contains: "rate limit exceeded", + }, + { + name: "unknown error", + stderr: "Some unknown error occurred", + contains: "", // No hint expected + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + hint := provider.errorHint(tt.stderr) + if tt.contains == "" { + if hint != "" { + t.Errorf("expected no hint, got %q", hint) + } + } else { + if hint == "" { + t.Errorf("expected hint containing %q, got empty", tt.contains) + } else if !containsSubstring(hint, tt.contains) { + t.Errorf("expected hint containing %q, got %q", tt.contains, hint) + } + } + }) + } +} + +// containsSubstring checks if s contains substr (case-insensitive) +func containsSubstring(s, substr string) bool { + return len(s) >= len(substr) && (s == substr || len(substr) == 0 || findInString(s, substr)) +} + +func findInString(s, substr string) bool { + for i := 0; i <= len(s)-len(substr); i++ { + if s[i:i+len(substr)] == substr { + return true + } + } + return false +}