diff --git a/internal/k8s/cluster/eks.go b/internal/k8s/cluster/eks.go index 9e90e98..8dc36b1 100644 --- a/internal/k8s/cluster/eks.go +++ b/internal/k8s/cluster/eks.go @@ -1037,36 +1037,173 @@ func (p *EKSProvider) hasEksctl() bool { } func (p *EKSProvider) runEksctl(ctx context.Context, args ...string) (string, error) { - cmd := exec.CommandContext(ctx, "eksctl", args...) - cmd.Env = os.Environ() + backoffs := []time.Duration{200 * time.Millisecond, 500 * time.Millisecond, 1200 * time.Millisecond} + var lastErr error + var lastStderr string - var stdout, stderr bytes.Buffer - cmd.Stdout = &stdout - cmd.Stderr = &stderr + for attempt := 0; attempt <= len(backoffs); attempt++ { + cmd := exec.CommandContext(ctx, "eksctl", args...) + cmd.Env = os.Environ() - err := cmd.Run() - if err != nil { - return "", fmt.Errorf("eksctl command failed: %w, stderr: %s", err, stderr.String()) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + if err == nil { + return stdout.String(), nil + } + + lastErr = err + lastStderr = strings.TrimSpace(stderr.String()) + + if ctx.Err() != nil { + break + } + + if !p.isRetryableError(lastStderr) { + break + } + + if attempt < len(backoffs) { + time.Sleep(backoffs[attempt]) + } + } + + if lastErr == nil { + return "", fmt.Errorf("eksctl command failed") } - return stdout.String(), nil + return "", fmt.Errorf("eksctl command failed: %w, stderr: %s%s", lastErr, lastStderr, p.errorHint(lastStderr)) } func (p *EKSProvider) runAWS(ctx context.Context, args ...string) (string, error) { + if _, err := exec.LookPath("aws"); err != nil { + return "", fmt.Errorf("aws CLI not found in PATH (hint: install AWS CLI v2)") + } + // Add no-cli-pager to prevent interactive output args = append(args, "--no-cli-pager") - cmd := exec.CommandContext(ctx, "aws", args...) - cmd.Env = os.Environ() + backoffs := []time.Duration{200 * time.Millisecond, 500 * time.Millisecond, 1200 * time.Millisecond} + var lastErr error + var lastStderr string - var stdout, stderr bytes.Buffer - cmd.Stdout = &stdout - cmd.Stderr = &stderr + for attempt := 0; attempt <= len(backoffs); attempt++ { + cmd := exec.CommandContext(ctx, "aws", args...) + cmd.Env = os.Environ() - err := cmd.Run() - if err != nil { - return "", fmt.Errorf("aws command failed: %w, stderr: %s", err, stderr.String()) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + if err == nil { + return stdout.String(), nil + } + + lastErr = err + lastStderr = strings.TrimSpace(stderr.String()) + + if ctx.Err() != nil { + break + } + + if !p.isRetryableError(lastStderr) { + break + } + + if attempt < len(backoffs) { + time.Sleep(backoffs[attempt]) + } + } + + if lastErr == nil { + return "", fmt.Errorf("aws command failed") + } + + return "", fmt.Errorf("aws command failed: %w, stderr: %s%s", lastErr, lastStderr, p.errorHint(lastStderr)) +} + +// isRetryableError determines if an AWS CLI error should be retried +func (p *EKSProvider) isRetryableError(stderr string) bool { + lower := strings.ToLower(stderr) + + // Throttling errors + if strings.Contains(lower, "throttling") || strings.Contains(lower, "rate exceeded") { + return true + } + if strings.Contains(lower, "too many requests") || strings.Contains(lower, "request limit exceeded") { + return true } - return stdout.String(), nil + // Timeout errors + if strings.Contains(lower, "timeout") || strings.Contains(lower, "timed out") { + return true + } + if strings.Contains(lower, "deadline exceeded") { + return true + } + + // Transient service errors + if strings.Contains(lower, "service unavailable") || strings.Contains(lower, "internal error") { + return true + } + if strings.Contains(lower, "temporarily unavailable") { + return true + } + if strings.Contains(lower, "connection reset") || strings.Contains(lower, "connection refused") { + return true + } + + // AWS-specific transient errors + if strings.Contains(lower, "requestlimitexceeded") { + return true + } + if strings.Contains(lower, "provisionedthroughputexceeded") { + return true + } + + return false +} + +// errorHint provides helpful guidance for common AWS CLI errors +func (p *EKSProvider) errorHint(stderr string) string { + lower := strings.ToLower(stderr) + switch { + case strings.Contains(lower, "accessdenied") || strings.Contains(lower, "access denied"): + return " (hint: check IAM permissions for EKS operations)" + case strings.Contains(lower, "authorizationerror") || strings.Contains(lower, "not authorized"): + return " (hint: IAM user/role lacks required permissions for this operation)" + case strings.Contains(lower, "invalidparameterexception") || strings.Contains(lower, "invalid parameter"): + return " (hint: check parameter values match AWS requirements)" + case strings.Contains(lower, "resourceinuseexception") || strings.Contains(lower, "resource in use"): + return " (hint: resource is currently in use or being modified)" + case strings.Contains(lower, "resourcelimitexceeded") || strings.Contains(lower, "limit exceeded"): + return " (hint: AWS service quota exceeded, request limit increase)" + case strings.Contains(lower, "clusteralreadyexists"): + return " (hint: cluster with this name already exists in the region)" + case strings.Contains(lower, "unable to locate credentials") || strings.Contains(lower, "no credentials"): + return " (hint: run 'aws configure' or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY)" + case strings.Contains(lower, "expiredtoken") || strings.Contains(lower, "token has expired"): + return " (hint: AWS session token expired, refresh credentials)" + case strings.Contains(lower, "invalid region") || strings.Contains(lower, "could not connect"): + return " (hint: check region name is valid, e.g., us-east-1, eu-west-1)" + // Check specific resource types before general "not found" + case strings.Contains(lower, "vpc") && strings.Contains(lower, "not found"): + return " (hint: VPC does not exist or is not accessible)" + case strings.Contains(lower, "subnet") && strings.Contains(lower, "not found"): + return " (hint: subnet does not exist or is not in the specified VPC)" + case strings.Contains(lower, "security group") && strings.Contains(lower, "not found"): + return " (hint: security group does not exist or is not accessible)" + case strings.Contains(lower, "role") && strings.Contains(lower, "not found"): + return " (hint: IAM role ARN is invalid or role does not exist)" + // General "not found" check should be after specific resource checks + case strings.Contains(lower, "resourcenotfoundexception") || strings.Contains(lower, "not found"): + return " (hint: the specified resource does not exist in this region)" + case strings.Contains(lower, "throttling") || strings.Contains(lower, "rate exceeded"): + return " (hint: API rate limit exceeded, try again shortly)" + default: + return "" + } } diff --git a/internal/k8s/cluster/eks_test.go b/internal/k8s/cluster/eks_test.go index 43f5933..14d9fe9 100644 --- a/internal/k8s/cluster/eks_test.go +++ b/internal/k8s/cluster/eks_test.go @@ -263,3 +263,244 @@ func TestProviderManagerIntegration(t *testing.T) { t.Error("EKS not found in provider list") } } + +func TestEKSProviderIsRetryableError(t *testing.T) { + provider := NewEKSProvider(EKSProviderOptions{ + Region: "us-east-1", + }) + + tests := []struct { + name string + stderr string + retryable bool + }{ + { + name: "throttling error", + stderr: "Throttling: Rate exceeded", + retryable: true, + }, + { + name: "too many requests", + stderr: "TooManyRequestsException: Too many requests", + retryable: true, + }, + { + name: "request limit exceeded", + stderr: "Request limit exceeded for this account", + retryable: true, + }, + { + name: "timeout error", + stderr: "Connection timeout while connecting to endpoint", + retryable: true, + }, + { + name: "timed out", + stderr: "Operation timed out", + retryable: true, + }, + { + name: "deadline exceeded", + stderr: "Deadline exceeded while waiting for response", + retryable: true, + }, + { + name: "service unavailable", + stderr: "Service Unavailable: The service is currently unavailable", + retryable: true, + }, + { + name: "internal error", + stderr: "InternalError: An internal error occurred", + retryable: true, + }, + { + name: "temporarily unavailable", + stderr: "The service is temporarily unavailable", + retryable: true, + }, + { + name: "connection reset", + stderr: "Connection reset by peer", + retryable: true, + }, + { + name: "connection refused", + stderr: "Connection refused", + retryable: true, + }, + { + name: "request limit exceeded exception", + stderr: "RequestLimitExceeded: Request limit exceeded", + retryable: true, + }, + { + name: "provisioned throughput exceeded", + stderr: "ProvisionedThroughputExceededException: Rate exceeded", + retryable: true, + }, + { + name: "access denied not retryable", + stderr: "AccessDenied: User is not authorized", + retryable: false, + }, + { + name: "resource not found not retryable", + stderr: "ResourceNotFoundException: cluster not found", + retryable: false, + }, + { + name: "invalid parameter not retryable", + stderr: "InvalidParameterException: invalid value", + retryable: false, + }, + { + name: "cluster already exists not retryable", + stderr: "ClusterAlreadyExists: cluster test-cluster already exists", + retryable: false, + }, + { + name: "no credentials not retryable", + stderr: "Unable to locate credentials", + retryable: false, + }, + { + name: "unknown error not retryable", + stderr: "Some unknown error occurred", + retryable: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := provider.isRetryableError(tt.stderr) + if result != tt.retryable { + t.Errorf("isRetryableError(%q) = %v, want %v", tt.stderr, result, tt.retryable) + } + }) + } +} + +func TestEKSProviderErrorHint(t *testing.T) { + provider := NewEKSProvider(EKSProviderOptions{ + Region: "us-east-1", + }) + + tests := []struct { + name string + stderr string + contains string + }{ + { + name: "access denied", + stderr: "AccessDenied: User is not authorized", + contains: "IAM permissions", + }, + { + name: "authorization error", + stderr: "AuthorizationError: not authorized to perform operation", + contains: "IAM user/role lacks required permissions", + }, + { + name: "resource not found", + stderr: "ResourceNotFoundException: cluster not found", + contains: "does not exist", + }, + { + name: "invalid parameter", + stderr: "InvalidParameterException: invalid value for region", + contains: "check parameter values", + }, + { + name: "resource in use", + stderr: "ResourceInUseException: cluster is being updated", + contains: "currently in use", + }, + { + name: "limit exceeded", + stderr: "ResourceLimitExceeded: maximum number of clusters reached", + contains: "quota exceeded", + }, + { + name: "cluster already exists", + stderr: "ClusterAlreadyExists: cluster test-cluster already exists", + contains: "already exists", + }, + { + name: "no credentials", + stderr: "Unable to locate credentials", + contains: "aws configure", + }, + { + name: "expired token", + stderr: "ExpiredToken: security token has expired", + contains: "expired", + }, + { + name: "invalid region", + stderr: "Invalid region: us-invalid-1", + contains: "check region name", + }, + { + name: "vpc not found", + stderr: "VPC vpc-123 not found", + contains: "VPC does not exist", + }, + { + name: "subnet not found", + stderr: "Subnet subnet-123 not found", + contains: "subnet does not exist", + }, + { + name: "security group not found", + stderr: "Security group sg-123 not found", + contains: "security group does not exist", + }, + { + name: "role not found", + stderr: "Role arn:aws:iam::123:role/invalid not found", + contains: "IAM role ARN is invalid", + }, + { + name: "throttling", + stderr: "Throttling: Rate exceeded", + contains: "rate limit exceeded", + }, + { + name: "unknown error", + stderr: "Some unknown error occurred", + contains: "", // No hint expected + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + hint := provider.errorHint(tt.stderr) + if tt.contains == "" { + if hint != "" { + t.Errorf("expected no hint, got %q", hint) + } + } else { + if hint == "" { + t.Errorf("expected hint containing %q, got empty", tt.contains) + } else if !containsSubstring(hint, tt.contains) { + t.Errorf("expected hint containing %q, got %q", tt.contains, hint) + } + } + }) + } +} + +// containsSubstring checks if s contains substr (case-insensitive) +func containsSubstring(s, substr string) bool { + return len(s) >= len(substr) && (s == substr || len(substr) == 0 || findInString(s, substr)) +} + +func findInString(s, substr string) bool { + for i := 0; i <= len(s)-len(substr); i++ { + if s[i:i+len(substr)] == substr { + return true + } + } + return false +}