diff --git a/cmd/main.go b/cmd/main.go
index fed106fb..519ded67 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -149,6 +149,15 @@ func main() {
 	// reconciling before enforcing readiness checks.
 	healthManager.SuppressReadiness(2 * time.Minute)
 
+	// When leader election is enabled, mark this pod as standby until it wins
+	// the lease. Standby pods return 200 on /readyz so Kubernetes does not
+	// repeatedly mark them unhealthy while they wait. The flag is cleared
+	// (inside the goroutine below) the moment this pod is elected leader, at
+	// which point a fresh 2-minute readiness grace period is started.
+	if enableLeaderElection {
+		healthManager.SetStandby(true)
+	}
+
 	// No need to add the standard controller with kubebuilder:scaffold:builder
 	// The env-based controller doesn't rely on CRDs
 
@@ -204,8 +213,26 @@ func main() {
 		os.Exit(1)
 	}
 
+	ctx := ctrl.SetupSignalHandler()
+
+	// Clear standby the moment this pod wins leader election. The readiness
+	// grace period armed at startup is an absolute deadline and has usually
+	// expired by the time a standby is promoted, so re-arm it before clearing
+	// the flag; otherwise /readyz would fail until components report healthy.
+	if enableLeaderElection {
+		go func() {
+			select {
+			case <-mgr.Elected():
+				healthManager.SuppressReadiness(2 * time.Minute)
+				healthManager.SetStandby(false)
+			case <-ctx.Done():
+				// Shutting down before ever being elected; nothing to clear.
+			}
+		}()
+	}
+
 	setupLog.Info("starting manager")
-	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
+	if err := mgr.Start(ctx); err != nil {
 		setupLog.Error(err, "problem running manager")
 		os.Exit(1)
 	}
diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml
index 316bcda5..05e4e9bf 100644
--- a/config/manager/manager.yaml
+++ b/config/manager/manager.yaml
@@ -20,7 +20,7 @@ spec:
   selector:
     matchLabels:
       control-plane: controller-manager
-  replicas: 1
+  replicas: 2
   template:
     metadata:
       annotations:
diff --git a/dist/backend-install.yaml b/dist/backend-install.yaml
index 2d54a3c4..90cb4154 100644
--- a/dist/backend-install.yaml
+++ b/dist/backend-install.yaml
@@ -1480,7 +1480,7 @@ metadata:
   name: devzero-zxporter-controller-manager
   namespace: devzero-zxporter
 spec:
-  replicas: 1
+  replicas: 2
   selector:
     matchLabels:
       control-plane: controller-manager
diff --git a/dist/install.yaml b/dist/install.yaml
index 6de4b7c4..a32a65fc 100644
--- a/dist/install.yaml
+++ b/dist/install.yaml
@@ -1485,7 +1485,7 @@ metadata:
   name: devzero-zxporter-controller-manager
   namespace: devzero-zxporter
 spec:
-  replicas: 1
+  replicas: 2
   selector:
     matchLabels:
       control-plane: controller-manager
diff --git a/dist/installer_updater.yaml b/dist/installer_updater.yaml
index 080b0911..eceaec01 100644
--- a/dist/installer_updater.yaml
+++ b/dist/installer_updater.yaml
@@ -1424,7 +1424,7 @@ metadata:
   name: devzero-zxporter-controller-manager
   namespace: devzero-zxporter
 spec:
-  replicas: 1
+  replicas: 2
   selector:
     matchLabels:
       control-plane: controller-manager
diff --git a/dist/zxporter.yaml b/dist/zxporter.yaml
index a720ddba..245cfb15 100644
--- a/dist/zxporter.yaml
+++ b/dist/zxporter.yaml
@@ -664,7 +664,7 @@ metadata:
   name: devzero-zxporter-controller-manager
   namespace: devzero-zxporter
 spec:
-  replicas: 1
+  replicas: 2
   selector:
     matchLabels:
       control-plane: controller-manager
diff --git a/helm-chart/zxporter/templates/deployment.yaml b/helm-chart/zxporter/templates/deployment.yaml
index 811a3879..b775568b 100644
--- a/helm-chart/zxporter/templates/deployment.yaml
+++ b/helm-chart/zxporter/templates/deployment.yaml
@@ -9,7 +9,7 @@ metadata:
   name: devzero-zxporter-controller-manager
   namespace: {{ .Release.Namespace }}
 spec:
-  replicas: 1
+  replicas: {{ if .Values.highAvailability.enabled }}2{{ else }}1{{ end }}
   selector:
     matchLabels:
       control-plane: controller-manager
diff --git a/helm-chart/zxporter/values.yaml b/helm-chart/zxporter/values.yaml
index 6d9763ef..bdf44c9e 100644
--- a/helm-chart/zxporter/values.yaml
+++ b/helm-chart/zxporter/values.yaml
@@ -140,11 +140,10 @@ affinity: {}
 
 # High Availability Configuration
 highAvailability:
-  enabled: false
+  enabled: true
   podDisruptionBudget:
     enabled: true
     minAvailable: 1
-    # Alternative: maxUnavailable: 1
 
 # MPA Server Configuration
 mpaServer:
diff --git a/internal/health/manager.go b/internal/health/manager.go
index c961c5a2..45ce2a7d 100644
--- a/internal/health/manager.go
+++ b/internal/health/manager.go
@@ -29,6 +29,7 @@ type HealthManager struct {
 	components          map[string]*ComponentStatus
 	livenessGraceUntil  time.Time // LivenessCheck always passes before this deadline
 	readinessGraceUntil time.Time // ReadinessCheck always passes before this deadline
+	standby             bool      // standby=true when not leader; readiness passes unconditionally
 }
 
 // NewHealthManager creates a new HealthManager
@@ -118,6 +119,16 @@ func (hm *HealthManager) BuildReport() map[string]ComponentStatus {
 	return report
 }
 
+// SetStandby marks the pod as a standby (non-leader) replica. While in standby,
+// ReadinessCheck passes unconditionally — the pod is healthy and ready to take
+// over leadership, it just isn't running collectors yet. Call with false when
+// leader election is won so normal readiness checks resume.
+func (hm *HealthManager) SetStandby(standby bool) {
+	hm.mu.Lock()
+	defer hm.mu.Unlock()
+	hm.standby = standby
+}
+
 // SuppressLiveness makes LivenessCheck pass unconditionally for the given
 // duration. Use this before a planned collector restart so that the transient
 // Unhealthy window does not trigger a pod kill. The grace period is cleared
@@ -192,6 +203,9 @@ func (hm *HealthManager) ReadinessCheck() error {
 
 // readinessCheckLocked performs the readiness check while the caller holds mu.
 func (hm *HealthManager) readinessCheckLocked() error {
+	if hm.standby {
+		return nil // standby replica: healthy and ready to become leader
+	}
 	if !hm.readinessGraceUntil.IsZero() && time.Now().Before(hm.readinessGraceUntil) {
 		return nil
 	}
diff --git a/internal/health/manager_test.go b/internal/health/manager_test.go
index 492aaf4a..f33e70f6 100644
--- a/internal/health/manager_test.go
+++ b/internal/health/manager_test.go
@@ -325,3 +325,34 @@ func TestLivenessCheck_FullRestartCycle(t *testing.T) {
 	hm.UpdateStatus(ComponentCollectorManager, HealthStatusHealthy, "restarted", nil)
 	assert.NoError(t, hm.LivenessCheck()) // passes normally
 }
+
+// TestReadinessCheck_StandbyPassesWhenComponentsUnspecified verifies that a
+// standby (non-leader) pod passes readiness even though its components are
+// unspecified — it is healthy and ready to take over leadership.
+func TestReadinessCheck_StandbyPassesWhenComponentsUnspecified(t *testing.T) {
+	hm := NewHealthManager()
+	hm.Register(ComponentCollectorManager)
+	hm.Register(ComponentDakrTransport)
+	// Components remain Unspecified (collectors never started on non-leader)
+
+	hm.SetStandby(true)
+	assert.NoError(t, hm.ReadinessCheck())
+}
+
+// TestReadinessCheck_StandbyClearedEnforcesNormalChecks verifies that after
+// winning leader election (SetStandby(false)), normal readiness rules apply.
+func TestReadinessCheck_StandbyClearedEnforcesNormalChecks(t *testing.T) {
+	hm := NewHealthManager()
+	hm.Register(ComponentCollectorManager)
+	hm.Register(ComponentDakrTransport)
+
+	hm.SetStandby(true)
+	assert.NoError(t, hm.ReadinessCheck()) // standby: passes
+
+	hm.SetStandby(false)
+	assert.Error(t, hm.ReadinessCheck()) // components still Unspecified → fails
+
+	hm.UpdateStatus(ComponentCollectorManager, HealthStatusHealthy, "ok", nil)
+	hm.UpdateStatus(ComponentDakrTransport, HealthStatusHealthy, "ok", nil)
+	assert.NoError(t, hm.ReadinessCheck()) // now passes
+}
diff --git a/internal/transport/telemetry_sender.go b/internal/transport/telemetry_sender.go
index 0c453571..0e04ebc2 100644
--- a/internal/transport/telemetry_sender.go
+++ b/internal/transport/telemetry_sender.go
@@ -74,6 +74,11 @@ func (s *TelemetrySender) Start(ctx context.Context) error {
 	s.logger.Info("Starting telemetry sender", "interval", s.interval)
 	s.isRunning = true
 
+	// Mark transport as healthy optimistically so the readiness probe does not
+	// return 503 while waiting for the first successful send. The circuit
+	// breaker will downgrade the status if actual sends fail.
+	s.updateHealthStatus(health.HealthStatusHealthy, "Transport starting", nil)
+
 	go s.run(ctx)
 	return nil
 }