From 11956de6cc51637cbc9ec3c1a83cfbd45fbc8112 Mon Sep 17 00:00:00 2001 From: nash Date: Sat, 14 Feb 2026 09:34:09 +0000 Subject: [PATCH] feat(k8s/cluster): standardize timeout configuration across providers Add standard timeout constants to provider.go and apply them consistently across EKS, GKE, and kubeadm providers. This improves maintainability and ensures consistent behavior across all cluster providers. New constants: - DefaultClusterCreateTimeout (20 min) - DefaultNodeGroupCreateTimeout (15 min) - DefaultNodeGroupDeleteTimeout (10 min) - DefaultSSHConnectTimeout (5 min) - DefaultNodeReadyTimeout (5 min) - DefaultPollInterval (30 sec) - DefaultCommandTimeout (2 min) Files updated: - provider.go: Add timeout constants - eks.go: Use constants in wait functions - gke.go: Use constants in wait functions - kubeadm.go: Use constants for SSH and polling Also adds comprehensive tests for timeout constants and provider types. Refs #72 Co-Authored-By: Claude Opus 4.6 --- internal/k8s/cluster/eks.go | 18 +- internal/k8s/cluster/gke.go | 13 +- internal/k8s/cluster/kubeadm.go | 10 +- internal/k8s/cluster/provider.go | 24 +++ internal/k8s/cluster/provider_test.go | 268 ++++++++++++++++++++++++++ 5 files changed, 311 insertions(+), 22 deletions(-) create mode 100644 internal/k8s/cluster/provider_test.go diff --git a/internal/k8s/cluster/eks.go b/internal/k8s/cluster/eks.go index 9e90e98..a798f28 100644 --- a/internal/k8s/cluster/eks.go +++ b/internal/k8s/cluster/eks.go @@ -693,13 +693,12 @@ func (p *EKSProvider) createNodeGroupWithAWSCLI(ctx context.Context, clusterName } func (p *EKSProvider) waitForNodeGroupActive(ctx context.Context, clusterName, nodeGroupName string) error { - timeout := 15 * time.Minute - deadline := time.Now().Add(timeout) + deadline := time.Now().Add(DefaultNodeGroupCreateTimeout) for time.Now().Before(deadline) { ng, err := p.describeNodeGroup(ctx, clusterName, nodeGroupName) if err != nil { - time.Sleep(30 * time.Second) + 
time.Sleep(DefaultPollInterval) continue } @@ -714,7 +713,7 @@ func (p *EKSProvider) waitForNodeGroupActive(ctx context.Context, clusterName, n return fmt.Errorf("node group creation failed") } - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) } return fmt.Errorf("timeout waiting for node group to become active") @@ -911,14 +910,14 @@ func (p *EKSProvider) deleteNodeGroup(ctx context.Context, clusterName, nodeGrou func (p *EKSProvider) waitForClusterActive(ctx context.Context, clusterName, profile, region string, timeout time.Duration) error { if timeout <= 0 { - timeout = 20 * time.Minute + timeout = DefaultClusterCreateTimeout } deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { cluster, err := p.describeCluster(ctx, clusterName) if err != nil { - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) continue } @@ -933,15 +932,14 @@ func (p *EKSProvider) waitForClusterActive(ctx context.Context, clusterName, pro return fmt.Errorf("cluster creation failed") } - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) } return fmt.Errorf("timeout waiting for cluster to become active") } func (p *EKSProvider) waitForNodeGroupDeleted(ctx context.Context, clusterName, nodeGroupName string) error { - timeout := 10 * time.Minute - deadline := time.Now().Add(timeout) + deadline := time.Now().Add(DefaultNodeGroupDeleteTimeout) for time.Now().Before(deadline) { _, err := p.describeNodeGroup(ctx, clusterName, nodeGroupName) @@ -954,7 +952,7 @@ func (p *EKSProvider) waitForNodeGroupDeleted(ctx context.Context, clusterName, fmt.Printf("[aws] waiting for node group %s deletion\n", nodeGroupName) } - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) } return fmt.Errorf("timeout waiting for node group deletion") diff --git a/internal/k8s/cluster/gke.go b/internal/k8s/cluster/gke.go index d1ff056..8530de2 100644 --- a/internal/k8s/cluster/gke.go +++ b/internal/k8s/cluster/gke.go @@ -579,14 +579,14 @@ func (p 
*GKEProvider) listNodePools(ctx context.Context, clusterName string) ([] func (p *GKEProvider) waitForClusterRunning(ctx context.Context, clusterName, project, region string, timeout time.Duration) error { if timeout <= 0 { - timeout = 15 * time.Minute + timeout = DefaultClusterCreateTimeout } deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { cluster, err := p.describeCluster(ctx, clusterName) if err != nil { - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) continue } @@ -601,20 +601,19 @@ func (p *GKEProvider) waitForClusterRunning(ctx context.Context, clusterName, pr return fmt.Errorf("cluster creation failed with status: %s", cluster.Status) } - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) } return fmt.Errorf("timeout waiting for cluster to become running") } func (p *GKEProvider) waitForNodePoolRunning(ctx context.Context, clusterName, nodePoolName string) error { - timeout := 10 * time.Minute - deadline := time.Now().Add(timeout) + deadline := time.Now().Add(DefaultNodeGroupCreateTimeout) for time.Now().Before(deadline) { nodePools, err := p.listNodePools(ctx, clusterName) if err != nil { - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) continue } @@ -634,7 +633,7 @@ func (p *GKEProvider) waitForNodePoolRunning(ctx context.Context, clusterName, n } } - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) } return fmt.Errorf("timeout waiting for node pool to become running") diff --git a/internal/k8s/cluster/kubeadm.go b/internal/k8s/cluster/kubeadm.go index 4752300..3c6dc2c 100644 --- a/internal/k8s/cluster/kubeadm.go +++ b/internal/k8s/cluster/kubeadm.go @@ -113,7 +113,7 @@ func (p *KubeadmProvider) Create(ctx context.Context, opts CreateOptions) (*Clus } // Wait for SSH to be available - if err := WaitForSSH(ctx, cpInstance.PublicIP, 22, 5*time.Minute); err != nil { + if err := WaitForSSH(ctx, cpInstance.PublicIP, 22, DefaultSSHConnectTimeout); err != nil { _ = 
p.terminateInstance(ctx, cpInstance.InstanceID) _ = p.deleteSecurityGroup(ctx, sgID) return nil, fmt.Errorf("control plane SSH not available: %w", err) @@ -209,7 +209,7 @@ func (p *KubeadmProvider) Create(ctx context.Context, opts CreateOptions) (*Clus } // Wait for SSH - if err := WaitForSSH(ctx, workerInstance.PublicIP, 22, 5*time.Minute); err != nil { + if err := WaitForSSH(ctx, workerInstance.PublicIP, 22, DefaultSSHConnectTimeout); err != nil { // Continue anyway, will fail on bootstrap if p.debug { fmt.Printf("[kubeadm] warning: worker %d SSH not available: %v\n", i, err) @@ -279,7 +279,7 @@ func (p *KubeadmProvider) Create(ctx context.Context, opts CreateOptions) (*Clus fmt.Println("[kubeadm] waiting for nodes to be ready...") } - if err := WaitForNodeReady(ctx, ssh, 5*time.Minute); err != nil { + if err := WaitForNodeReady(ctx, ssh, DefaultNodeReadyTimeout); err != nil { if p.debug { fmt.Printf("[kubeadm] warning: not all nodes ready: %v\n", err) } @@ -342,7 +342,7 @@ func (p *KubeadmProvider) Delete(ctx context.Context, clusterName string) error } // Wait for instances to terminate - time.Sleep(30 * time.Second) + time.Sleep(DefaultPollInterval) // Delete security group sgID, err := p.findSecurityGroup(ctx, clusterName) @@ -944,7 +944,7 @@ func (p *KubeadmProvider) scaleUp(ctx context.Context, cluster *ClusterInfo, cou } // Wait for SSH - if err := WaitForSSH(ctx, instance.PublicIP, 22, 5*time.Minute); err != nil { + if err := WaitForSSH(ctx, instance.PublicIP, 22, DefaultSSHConnectTimeout); err != nil { continue } diff --git a/internal/k8s/cluster/provider.go b/internal/k8s/cluster/provider.go index dcaa20c..61543fa 100644 --- a/internal/k8s/cluster/provider.go +++ b/internal/k8s/cluster/provider.go @@ -17,6 +17,30 @@ const ( ClusterTypeExisting ClusterType = "existing" ) +// Standard timeout constants for cluster operations +const ( + // DefaultClusterCreateTimeout is the default timeout for cluster creation + DefaultClusterCreateTimeout = 20 * 
time.Minute + + // DefaultNodeGroupCreateTimeout is the default timeout for node group creation + DefaultNodeGroupCreateTimeout = 15 * time.Minute + + // DefaultNodeGroupDeleteTimeout is the default timeout for node group deletion + DefaultNodeGroupDeleteTimeout = 10 * time.Minute + + // DefaultSSHConnectTimeout is the default timeout for SSH connection + DefaultSSHConnectTimeout = 5 * time.Minute + + // DefaultNodeReadyTimeout is the default timeout for node to become ready + DefaultNodeReadyTimeout = 5 * time.Minute + + // DefaultPollInterval is the standard polling interval for status checks + DefaultPollInterval = 30 * time.Second + + // DefaultCommandTimeout is the default timeout for CLI command execution + DefaultCommandTimeout = 2 * time.Minute +) + // NodeInfo contains information about a cluster node type NodeInfo struct { Name string `json:"name"` diff --git a/internal/k8s/cluster/provider_test.go b/internal/k8s/cluster/provider_test.go new file mode 100644 index 0000000..b48fb76 --- /dev/null +++ b/internal/k8s/cluster/provider_test.go @@ -0,0 +1,268 @@ +package cluster + +import ( + "testing" + "time" +) + +func TestTimeoutConstants(t *testing.T) { + // Verify timeout constants are defined with sensible values + + tests := []struct { + name string + timeout time.Duration + minValue time.Duration + maxValue time.Duration + }{ + { + name: "DefaultClusterCreateTimeout", + timeout: DefaultClusterCreateTimeout, + minValue: 10 * time.Minute, + maxValue: 60 * time.Minute, + }, + { + name: "DefaultNodeGroupCreateTimeout", + timeout: DefaultNodeGroupCreateTimeout, + minValue: 5 * time.Minute, + maxValue: 30 * time.Minute, + }, + { + name: "DefaultNodeGroupDeleteTimeout", + timeout: DefaultNodeGroupDeleteTimeout, + minValue: 5 * time.Minute, + maxValue: 30 * time.Minute, + }, + { + name: "DefaultSSHConnectTimeout", + timeout: DefaultSSHConnectTimeout, + minValue: 1 * time.Minute, + maxValue: 15 * time.Minute, + }, + { + name: "DefaultNodeReadyTimeout", + 
timeout: DefaultNodeReadyTimeout, + minValue: 1 * time.Minute, + maxValue: 15 * time.Minute, + }, + { + name: "DefaultPollInterval", + timeout: DefaultPollInterval, + minValue: 5 * time.Second, + maxValue: 60 * time.Second, + }, + { + name: "DefaultCommandTimeout", + timeout: DefaultCommandTimeout, + minValue: 30 * time.Second, + maxValue: 10 * time.Minute, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.timeout < tt.minValue { + t.Errorf("%s = %v, want >= %v", tt.name, tt.timeout, tt.minValue) + } + if tt.timeout > tt.maxValue { + t.Errorf("%s = %v, want <= %v", tt.name, tt.timeout, tt.maxValue) + } + }) + } +} + +func TestTimeoutConstantsRelationships(t *testing.T) { + // Verify logical relationships between timeouts + + // Cluster creation should take longer than node group creation + if DefaultClusterCreateTimeout < DefaultNodeGroupCreateTimeout { + t.Errorf("DefaultClusterCreateTimeout (%v) should be >= DefaultNodeGroupCreateTimeout (%v)", + DefaultClusterCreateTimeout, DefaultNodeGroupCreateTimeout) + } + + // Node group creation should take longer than deletion + if DefaultNodeGroupCreateTimeout < DefaultNodeGroupDeleteTimeout { + t.Errorf("DefaultNodeGroupCreateTimeout (%v) should be >= DefaultNodeGroupDeleteTimeout (%v)", + DefaultNodeGroupCreateTimeout, DefaultNodeGroupDeleteTimeout) + } + + // Poll interval should be much shorter than any operation timeout + if DefaultPollInterval >= DefaultNodeGroupDeleteTimeout { + t.Errorf("DefaultPollInterval (%v) should be < DefaultNodeGroupDeleteTimeout (%v)", + DefaultPollInterval, DefaultNodeGroupDeleteTimeout) + } +} + +func TestClusterTypeConstants(t *testing.T) { + // Verify cluster type constants + tests := []struct { + clusterType ClusterType + expected string + }{ + {ClusterTypeEKS, "eks"}, + {ClusterTypeGKE, "gke"}, + {ClusterTypeKubeadm, "kubeadm"}, + {ClusterTypeKops, "kops"}, + {ClusterTypeK3s, "k3s"}, + {ClusterTypeExisting, "existing"}, + } + + for _, tt := range 
tests { + t.Run(string(tt.clusterType), func(t *testing.T) { + if string(tt.clusterType) != tt.expected { + t.Errorf("ClusterType = %q, want %q", tt.clusterType, tt.expected) + } + }) + } +} + +func TestManagerCreation(t *testing.T) { + // Test manager creation and provider registration + manager := NewManager(true) + if manager == nil { + t.Fatal("NewManager returned nil") + } + + if len(manager.ListProviders()) != 0 { + t.Error("new manager should have no providers") + } +} + +func TestErrProviderNotFound(t *testing.T) { + err := &ErrProviderNotFound{ClusterType: ClusterTypeEKS} + expected := "cluster provider not found: eks" + if err.Error() != expected { + t.Errorf("ErrProviderNotFound.Error() = %q, want %q", err.Error(), expected) + } +} + +func TestErrClusterNotFound(t *testing.T) { + err := &ErrClusterNotFound{ClusterName: "test-cluster"} + expected := "cluster not found: test-cluster" + if err.Error() != expected { + t.Errorf("ErrClusterNotFound.Error() = %q, want %q", err.Error(), expected) + } +} + +func TestErrClusterExists(t *testing.T) { + err := &ErrClusterExists{ClusterName: "test-cluster"} + expected := "cluster already exists: test-cluster" + if err.Error() != expected { + t.Errorf("ErrClusterExists.Error() = %q, want %q", err.Error(), expected) + } +} + +func TestErrInvalidConfiguration(t *testing.T) { + err := &ErrInvalidConfiguration{Message: "missing required field"} + expected := "invalid cluster configuration: missing required field" + if err.Error() != expected { + t.Errorf("ErrInvalidConfiguration.Error() = %q, want %q", err.Error(), expected) + } +} + +func TestCreateOptions(t *testing.T) { + opts := CreateOptions{ + Name: "test-cluster", + Region: "us-east-1", + KubernetesVersion: "1.28", + WorkerCount: 3, + WorkerMinCount: 1, + WorkerMaxCount: 5, + Tags: map[string]string{ + "env": "test", + }, + } + + if opts.Name != "test-cluster" { + t.Errorf("expected name 'test-cluster', got %q", opts.Name) + } + + if opts.WorkerCount != 3 { + 
t.Errorf("expected worker count 3, got %d", opts.WorkerCount) + } + + if opts.Tags["env"] != "test" { + t.Errorf("expected tag 'env=test', got %q", opts.Tags["env"]) + } +} + +func TestScaleOptions(t *testing.T) { + opts := ScaleOptions{ + NodeGroupName: "workers", + DesiredCount: 5, + MinCount: 2, + MaxCount: 10, + } + + if opts.NodeGroupName != "workers" { + t.Errorf("expected node group name 'workers', got %q", opts.NodeGroupName) + } + + if opts.DesiredCount != 5 { + t.Errorf("expected desired count 5, got %d", opts.DesiredCount) + } +} + +func TestNodeInfo(t *testing.T) { + node := NodeInfo{ + Name: "worker-1", + Role: "worker", + Status: "Ready", + InternalIP: "10.0.0.1", + ExternalIP: "1.2.3.4", + InstanceID: "i-12345", + Labels: map[string]string{ + "node.kubernetes.io/instance-type": "t3.medium", + }, + } + + if node.Name != "worker-1" { + t.Errorf("expected name 'worker-1', got %q", node.Name) + } + + if node.Status != "Ready" { + t.Errorf("expected status 'Ready', got %q", node.Status) + } +} + +func TestClusterInfo(t *testing.T) { + info := ClusterInfo{ + Name: "test-cluster", + Type: ClusterTypeEKS, + Status: "ACTIVE", + KubernetesVersion: "1.28", + Endpoint: "https://example.eks.amazonaws.com", + Region: "us-east-1", + VPCID: "vpc-123", + } + + if info.Name != "test-cluster" { + t.Errorf("expected name 'test-cluster', got %q", info.Name) + } + + if info.Type != ClusterTypeEKS { + t.Errorf("expected type EKS, got %s", info.Type) + } +} + +func TestHealthStatus(t *testing.T) { + status := HealthStatus{ + Healthy: true, + Message: "cluster healthy", + Components: map[string]string{ + "cluster": "ACTIVE", + "nodegroup-main": "ACTIVE", + }, + NodeStatus: map[string]string{ + "worker-1": "Ready", + "worker-2": "Ready", + }, + } + + if !status.Healthy { + t.Error("expected healthy to be true") + } + + if status.Components["cluster"] != "ACTIVE" { + t.Errorf("expected cluster component ACTIVE, got %q", status.Components["cluster"]) + } +}