diff --git a/internal/k8s/cluster/eks.go b/internal/k8s/cluster/eks.go index 9e90e98..a798f28 100644 --- a/internal/k8s/cluster/eks.go +++ b/internal/k8s/cluster/eks.go @@ -693,13 +693,12 @@ func (p *EKSProvider) createNodeGroupWithAWSCLI(ctx context.Context, clusterName } func (p *EKSProvider) waitForNodeGroupActive(ctx context.Context, clusterName, nodeGroupName string) error { - timeout := 15 * time.Minute - deadline := time.Now().Add(timeout) + deadline := time.Now().Add(DefaultNodeGroupCreateTimeout) for time.Now().Before(deadline) { ng, err := p.describeNodeGroup(ctx, clusterName, nodeGroupName) if err != nil { - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) continue } @@ -714,7 +713,7 @@ func (p *EKSProvider) waitForNodeGroupActive(ctx context.Context, clusterName, n return fmt.Errorf("node group creation failed") } - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) } return fmt.Errorf("timeout waiting for node group to become active") @@ -911,14 +910,14 @@ func (p *EKSProvider) deleteNodeGroup(ctx context.Context, clusterName, nodeGrou func (p *EKSProvider) waitForClusterActive(ctx context.Context, clusterName, profile, region string, timeout time.Duration) error { if timeout <= 0 { - timeout = 20 * time.Minute + timeout = DefaultClusterCreateTimeout } deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { cluster, err := p.describeCluster(ctx, clusterName) if err != nil { - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) continue } @@ -933,15 +932,14 @@ func (p *EKSProvider) waitForClusterActive(ctx context.Context, clusterName, pro return fmt.Errorf("cluster creation failed") } - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) } return fmt.Errorf("timeout waiting for cluster to become active") } func (p *EKSProvider) waitForNodeGroupDeleted(ctx context.Context, clusterName, nodeGroupName string) error { - timeout := 10 * time.Minute - deadline := time.Now().Add(timeout) + deadline 
:= time.Now().Add(DefaultNodeGroupDeleteTimeout) for time.Now().Before(deadline) { _, err := p.describeNodeGroup(ctx, clusterName, nodeGroupName) @@ -954,7 +952,7 @@ func (p *EKSProvider) waitForNodeGroupDeleted(ctx context.Context, clusterName, fmt.Printf("[aws] waiting for node group %s deletion\n", nodeGroupName) } - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) } return fmt.Errorf("timeout waiting for node group deletion") diff --git a/internal/k8s/cluster/gke.go b/internal/k8s/cluster/gke.go index d1ff056..8530de2 100644 --- a/internal/k8s/cluster/gke.go +++ b/internal/k8s/cluster/gke.go @@ -579,14 +579,14 @@ func (p *GKEProvider) listNodePools(ctx context.Context, clusterName string) ([] func (p *GKEProvider) waitForClusterRunning(ctx context.Context, clusterName, project, region string, timeout time.Duration) error { if timeout <= 0 { - timeout = 15 * time.Minute + timeout = DefaultClusterCreateTimeout } deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { cluster, err := p.describeCluster(ctx, clusterName) if err != nil { - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) continue } @@ -601,20 +601,19 @@ func (p *GKEProvider) waitForClusterRunning(ctx context.Context, clusterName, pr return fmt.Errorf("cluster creation failed with status: %s", cluster.Status) } - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) } return fmt.Errorf("timeout waiting for cluster to become running") } func (p *GKEProvider) waitForNodePoolRunning(ctx context.Context, clusterName, nodePoolName string) error { - timeout := 10 * time.Minute - deadline := time.Now().Add(timeout) + deadline := time.Now().Add(DefaultNodeGroupCreateTimeout) for time.Now().Before(deadline) { nodePools, err := p.listNodePools(ctx, clusterName) if err != nil { - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) continue } @@ -634,7 +633,7 @@ func (p *GKEProvider) waitForNodePoolRunning(ctx context.Context, clusterName, n } } - 
time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) } return fmt.Errorf("timeout waiting for node pool to become running") diff --git a/internal/k8s/cluster/kubeadm.go b/internal/k8s/cluster/kubeadm.go index 4752300..3c6dc2c 100644 --- a/internal/k8s/cluster/kubeadm.go +++ b/internal/k8s/cluster/kubeadm.go @@ -113,7 +113,7 @@ func (p *KubeadmProvider) Create(ctx context.Context, opts CreateOptions) (*Clus } // Wait for SSH to be available - if err := WaitForSSH(ctx, cpInstance.PublicIP, 22, 5*time.Minute); err != nil { + if err := WaitForSSH(ctx, cpInstance.PublicIP, 22, DefaultSSHConnectTimeout); err != nil { _ = p.terminateInstance(ctx, cpInstance.InstanceID) _ = p.deleteSecurityGroup(ctx, sgID) return nil, fmt.Errorf("control plane SSH not available: %w", err) @@ -209,7 +209,7 @@ func (p *KubeadmProvider) Create(ctx context.Context, opts CreateOptions) (*Clus } // Wait for SSH - if err := WaitForSSH(ctx, workerInstance.PublicIP, 22, 5*time.Minute); err != nil { + if err := WaitForSSH(ctx, workerInstance.PublicIP, 22, DefaultSSHConnectTimeout); err != nil { // Continue anyway, will fail on bootstrap if p.debug { fmt.Printf("[kubeadm] warning: worker %d SSH not available: %v\n", i, err) @@ -279,7 +279,7 @@ func (p *KubeadmProvider) Create(ctx context.Context, opts CreateOptions) (*Clus fmt.Println("[kubeadm] waiting for nodes to be ready...") } - if err := WaitForNodeReady(ctx, ssh, 5*time.Minute); err != nil { + if err := WaitForNodeReady(ctx, ssh, DefaultNodeReadyTimeout); err != nil { if p.debug { fmt.Printf("[kubeadm] warning: not all nodes ready: %v\n", err) } @@ -342,7 +342,7 @@ func (p *KubeadmProvider) Delete(ctx context.Context, clusterName string) error } // Wait for instances to terminate - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) // Delete security group sgID, err := p.findSecurityGroup(ctx, clusterName) @@ -944,7 +944,7 @@ func (p *KubeadmProvider) scaleUp(ctx context.Context, cluster *ClusterInfo, cou } // Wait for
SSH - if err := WaitForSSH(ctx, instance.PublicIP, 22, 5*time.Minute); err != nil { + if err := WaitForSSH(ctx, instance.PublicIP, 22, DefaultSSHConnectTimeout); err != nil { continue } diff --git a/internal/k8s/cluster/provider.go b/internal/k8s/cluster/provider.go index dcaa20c..61543fa 100644 --- a/internal/k8s/cluster/provider.go +++ b/internal/k8s/cluster/provider.go @@ -17,6 +17,30 @@ const ( ClusterTypeExisting ClusterType = "existing" ) +// Standard timeout constants for cluster operations +const ( + // DefaultClusterCreateTimeout is the default timeout for cluster creation + DefaultClusterCreateTimeout = 20 * time.Minute + + // DefaultNodeGroupCreateTimeout is the default timeout for node group creation + DefaultNodeGroupCreateTimeout = 15 * time.Minute + + // DefaultNodeGroupDeleteTimeout is the default timeout for node group deletion + DefaultNodeGroupDeleteTimeout = 10 * time.Minute + + // DefaultSSHConnectTimeout is the default timeout for SSH connection + DefaultSSHConnectTimeout = 5 * time.Minute + + // DefaultNodeReadyTimeout is the default timeout for node to become ready + DefaultNodeReadyTimeout = 5 * time.Minute + + // DefaultPollInterval is the standard polling interval for status checks + DefaultPollInterval = 30 * time.Second + + // DefaultCommandTimeout is the default timeout for CLI command execution + DefaultCommandTimeout = 2 * time.Minute +) + // NodeInfo contains information about a cluster node type NodeInfo struct { Name string `json:"name"` diff --git a/internal/k8s/cluster/provider_test.go b/internal/k8s/cluster/provider_test.go new file mode 100644 index 0000000..b48fb76 --- /dev/null +++ b/internal/k8s/cluster/provider_test.go @@ -0,0 +1,268 @@ +package cluster + +import ( + "testing" + "time" +) + +func TestTimeoutConstants(t *testing.T) { + // Verify timeout constants are defined with sensible values + + tests := []struct { + name string + timeout time.Duration + minValue time.Duration + maxValue time.Duration + }{ + { + name: 
"DefaultClusterCreateTimeout", + timeout: DefaultClusterCreateTimeout, + minValue: 10 * time.Minute, + maxValue: 60 * time.Minute, + }, + { + name: "DefaultNodeGroupCreateTimeout", + timeout: DefaultNodeGroupCreateTimeout, + minValue: 5 * time.Minute, + maxValue: 30 * time.Minute, + }, + { + name: "DefaultNodeGroupDeleteTimeout", + timeout: DefaultNodeGroupDeleteTimeout, + minValue: 5 * time.Minute, + maxValue: 30 * time.Minute, + }, + { + name: "DefaultSSHConnectTimeout", + timeout: DefaultSSHConnectTimeout, + minValue: 1 * time.Minute, + maxValue: 15 * time.Minute, + }, + { + name: "DefaultNodeReadyTimeout", + timeout: DefaultNodeReadyTimeout, + minValue: 1 * time.Minute, + maxValue: 15 * time.Minute, + }, + { + name: "DefaultPollInterval", + timeout: DefaultPollInterval, + minValue: 5 * time.Second, + maxValue: 60 * time.Second, + }, + { + name: "DefaultCommandTimeout", + timeout: DefaultCommandTimeout, + minValue: 30 * time.Second, + maxValue: 10 * time.Minute, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.timeout < tt.minValue { + t.Errorf("%s = %v, want >= %v", tt.name, tt.timeout, tt.minValue) + } + if tt.timeout > tt.maxValue { + t.Errorf("%s = %v, want <= %v", tt.name, tt.timeout, tt.maxValue) + } + }) + } +} + +func TestTimeoutConstantsRelationships(t *testing.T) { + // Verify logical relationships between timeouts + + // Cluster creation should take longer than node group creation + if DefaultClusterCreateTimeout < DefaultNodeGroupCreateTimeout { + t.Errorf("DefaultClusterCreateTimeout (%v) should be >= DefaultNodeGroupCreateTimeout (%v)", + DefaultClusterCreateTimeout, DefaultNodeGroupCreateTimeout) + } + + // Node group creation should take longer than deletion + if DefaultNodeGroupCreateTimeout < DefaultNodeGroupDeleteTimeout { + t.Errorf("DefaultNodeGroupCreateTimeout (%v) should be >= DefaultNodeGroupDeleteTimeout (%v)", + DefaultNodeGroupCreateTimeout, DefaultNodeGroupDeleteTimeout) + } + + // Poll interval 
should be much shorter than any operation timeout + if DefaultPollInterval >= DefaultNodeGroupDeleteTimeout { + t.Errorf("DefaultPollInterval (%v) should be < DefaultNodeGroupDeleteTimeout (%v)", + DefaultPollInterval, DefaultNodeGroupDeleteTimeout) + } +} + +func TestClusterTypeConstants(t *testing.T) { + // Verify cluster type constants + tests := []struct { + clusterType ClusterType + expected string + }{ + {ClusterTypeEKS, "eks"}, + {ClusterTypeGKE, "gke"}, + {ClusterTypeKubeadm, "kubeadm"}, + {ClusterTypeKops, "kops"}, + {ClusterTypeK3s, "k3s"}, + {ClusterTypeExisting, "existing"}, + } + + for _, tt := range tests { + t.Run(string(tt.clusterType), func(t *testing.T) { + if string(tt.clusterType) != tt.expected { + t.Errorf("ClusterType = %q, want %q", tt.clusterType, tt.expected) + } + }) + } +} + +func TestManagerCreation(t *testing.T) { + // Test manager creation and provider registration + manager := NewManager(true) + if manager == nil { + t.Fatal("NewManager returned nil") + } + + if len(manager.ListProviders()) != 0 { + t.Error("new manager should have no providers") + } +} + +func TestErrProviderNotFound(t *testing.T) { + err := &ErrProviderNotFound{ClusterType: ClusterTypeEKS} + expected := "cluster provider not found: eks" + if err.Error() != expected { + t.Errorf("ErrProviderNotFound.Error() = %q, want %q", err.Error(), expected) + } +} + +func TestErrClusterNotFound(t *testing.T) { + err := &ErrClusterNotFound{ClusterName: "test-cluster"} + expected := "cluster not found: test-cluster" + if err.Error() != expected { + t.Errorf("ErrClusterNotFound.Error() = %q, want %q", err.Error(), expected) + } +} + +func TestErrClusterExists(t *testing.T) { + err := &ErrClusterExists{ClusterName: "test-cluster"} + expected := "cluster already exists: test-cluster" + if err.Error() != expected { + t.Errorf("ErrClusterExists.Error() = %q, want %q", err.Error(), expected) + } +} + +func TestErrInvalidConfiguration(t *testing.T) { + err := 
&ErrInvalidConfiguration{Message: "missing required field"} + expected := "invalid cluster configuration: missing required field" + if err.Error() != expected { + t.Errorf("ErrInvalidConfiguration.Error() = %q, want %q", err.Error(), expected) + } +} + +func TestCreateOptions(t *testing.T) { + opts := CreateOptions{ + Name: "test-cluster", + Region: "us-east-1", + KubernetesVersion: "1.28", + WorkerCount: 3, + WorkerMinCount: 1, + WorkerMaxCount: 5, + Tags: map[string]string{ + "env": "test", + }, + } + + if opts.Name != "test-cluster" { + t.Errorf("expected name 'test-cluster', got %q", opts.Name) + } + + if opts.WorkerCount != 3 { + t.Errorf("expected worker count 3, got %d", opts.WorkerCount) + } + + if opts.Tags["env"] != "test" { + t.Errorf("expected tag 'env=test', got %q", opts.Tags["env"]) + } +} + +func TestScaleOptions(t *testing.T) { + opts := ScaleOptions{ + NodeGroupName: "workers", + DesiredCount: 5, + MinCount: 2, + MaxCount: 10, + } + + if opts.NodeGroupName != "workers" { + t.Errorf("expected node group name 'workers', got %q", opts.NodeGroupName) + } + + if opts.DesiredCount != 5 { + t.Errorf("expected desired count 5, got %d", opts.DesiredCount) + } +} + +func TestNodeInfo(t *testing.T) { + node := NodeInfo{ + Name: "worker-1", + Role: "worker", + Status: "Ready", + InternalIP: "10.0.0.1", + ExternalIP: "1.2.3.4", + InstanceID: "i-12345", + Labels: map[string]string{ + "node.kubernetes.io/instance-type": "t3.medium", + }, + } + + if node.Name != "worker-1" { + t.Errorf("expected name 'worker-1', got %q", node.Name) + } + + if node.Status != "Ready" { + t.Errorf("expected status 'Ready', got %q", node.Status) + } +} + +func TestClusterInfo(t *testing.T) { + info := ClusterInfo{ + Name: "test-cluster", + Type: ClusterTypeEKS, + Status: "ACTIVE", + KubernetesVersion: "1.28", + Endpoint: "https://example.eks.amazonaws.com", + Region: "us-east-1", + VPCID: "vpc-123", + } + + if info.Name != "test-cluster" { + t.Errorf("expected name 'test-cluster', 
got %q", info.Name) + } + + if info.Type != ClusterTypeEKS { + t.Errorf("expected type EKS, got %s", info.Type) + } +} + +func TestHealthStatus(t *testing.T) { + status := HealthStatus{ + Healthy: true, + Message: "cluster healthy", + Components: map[string]string{ + "cluster": "ACTIVE", + "nodegroup-main": "ACTIVE", + }, + NodeStatus: map[string]string{ + "worker-1": "Ready", + "worker-2": "Ready", + }, + } + + if !status.Healthy { + t.Error("expected healthy to be true") + } + + if status.Components["cluster"] != "ACTIVE" { + t.Errorf("expected cluster component ACTIVE, got %q", status.Components["cluster"]) + } +}