Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,15 @@ func main() {
// reconciling before enforcing readiness checks.
healthManager.SuppressReadiness(2 * time.Minute)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Edge Case: Readiness grace period expires while pod is still in standby

The 2-minute readiness grace period (SuppressReadiness(2 * time.Minute)) is set once at startup (line 136), before SetStandby(true) is called. If the standby pod takes longer than 2 minutes to win the leader election (e.g., the current leader is slow to release the lease), the grace period will have already expired by the time SetStandby(false) is called. At that point, readiness checks immediately enforce normal component health requirements, but the collectors haven't had time to start and register as healthy yet — causing the readiness probe to fail and the pod to receive no traffic/be restarted.

This is especially likely in scenarios where the existing leader is being drained (the whole point of this PR) and holds the lease until its graceful shutdown timeout expires.

Suggested fix:

Re-apply the readiness grace period when clearing standby:

```go
go func() {
    select {
    case <-mgr.Elected():
        healthManager.SetStandby(false)
        healthManager.SuppressReadiness(2 * time.Minute)
    case <-ctx.Done():
    }
}()
```

Was this helpful? React with 👍 / 👎 | Reply gitar fix to apply this suggestion


// When leader election is enabled, mark this pod as standby until it wins
// the lease. Standby pods return 200 on /readyz so Kubernetes does not
// repeatedly mark them unhealthy while they wait. The flag is cleared
// (inside the goroutine below) the moment this pod is elected leader, at
// which point the normal 2-minute readiness grace period takes over.
if enableLeaderElection {
healthManager.SetStandby(true)
}

// No need to add the standard controller with kubebuilder:scaffold:builder
// The env-based controller doesn't rely on CRDs

Expand Down Expand Up @@ -204,8 +213,22 @@ func main() {
os.Exit(1)
}

ctx := ctrl.SetupSignalHandler()

// Clear standby the moment this pod wins leader election so that normal
// readiness checks (with the 2-minute grace period) take over.
if enableLeaderElection {
go func() {
select {
case <-mgr.Elected():
healthManager.SetStandby(false)
case <-ctx.Done():
}
}()
}

setupLog.Info("starting manager")
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
if err := mgr.Start(ctx); err != nil {
setupLog.Error(err, "problem running manager")
os.Exit(1)
}
Expand Down
2 changes: 1 addition & 1 deletion config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ spec:
selector:
matchLabels:
control-plane: controller-manager
replicas: 1
replicas: 2
template:
metadata:
annotations:
Expand Down
2 changes: 1 addition & 1 deletion dist/backend-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1480,7 +1480,7 @@ metadata:
name: devzero-zxporter-controller-manager
namespace: devzero-zxporter
spec:
replicas: 1
replicas: 2
selector:
matchLabels:
control-plane: controller-manager
Expand Down
2 changes: 1 addition & 1 deletion dist/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1485,7 +1485,7 @@ metadata:
name: devzero-zxporter-controller-manager
namespace: devzero-zxporter
spec:
replicas: 1
replicas: 2
selector:
matchLabels:
control-plane: controller-manager
Expand Down
2 changes: 1 addition & 1 deletion dist/installer_updater.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1424,7 +1424,7 @@ metadata:
name: devzero-zxporter-controller-manager
namespace: devzero-zxporter
spec:
replicas: 1
replicas: 2
selector:
matchLabels:
control-plane: controller-manager
Expand Down
2 changes: 1 addition & 1 deletion dist/zxporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ metadata:
name: devzero-zxporter-controller-manager
namespace: devzero-zxporter
spec:
replicas: 1
replicas: 2
selector:
matchLabels:
control-plane: controller-manager
Expand Down
2 changes: 1 addition & 1 deletion helm-chart/zxporter/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
name: devzero-zxporter-controller-manager
namespace: {{ .Release.Namespace }}
spec:
replicas: 1
replicas: {{ if .Values.highAvailability.enabled }}2{{ else }}1{{ end }}
selector:
matchLabels:
control-plane: controller-manager
Expand Down
3 changes: 1 addition & 2 deletions helm-chart/zxporter/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,10 @@ affinity: {}

# High Availability Configuration
highAvailability:
enabled: false
enabled: true
podDisruptionBudget:
enabled: true
minAvailable: 1
# Alternative: maxUnavailable: 1

# MPA Server Configuration
mpaServer:
Expand Down
14 changes: 14 additions & 0 deletions internal/health/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ type HealthManager struct {
components map[string]*ComponentStatus
livenessGraceUntil time.Time // LivenessCheck always passes before this deadline
readinessGraceUntil time.Time // ReadinessCheck always passes before this deadline
standby bool // standby=true when not leader; readiness passes unconditionally
}

// NewHealthManager creates a new HealthManager
Expand Down Expand Up @@ -118,6 +119,16 @@ func (hm *HealthManager) BuildReport() map[string]ComponentStatus {
return report
}

// SetStandby flips the standby (non-leader) flag for this replica. A standby
// pod passes ReadinessCheck unconditionally: it is healthy and able to assume
// leadership, even though no collectors are running yet. Once leader election
// is won, call SetStandby(false) to re-enable the normal readiness rules.
func (hm *HealthManager) SetStandby(standby bool) {
	hm.mu.Lock()
	hm.standby = standby
	hm.mu.Unlock()
}

// SuppressLiveness makes LivenessCheck pass unconditionally for the given
// duration. Use this before a planned collector restart so that the transient
// Unhealthy window does not trigger a pod kill. The grace period is cleared
Expand Down Expand Up @@ -192,6 +203,9 @@ func (hm *HealthManager) ReadinessCheck() error {

// readinessCheckLocked performs the readiness check while the caller holds mu.
func (hm *HealthManager) readinessCheckLocked() error {
if hm.standby {
return nil // standby replica: healthy and ready to become leader
}
if !hm.readinessGraceUntil.IsZero() && time.Now().Before(hm.readinessGraceUntil) {
return nil
}
Expand Down
31 changes: 31 additions & 0 deletions internal/health/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -325,3 +325,34 @@ func TestLivenessCheck_FullRestartCycle(t *testing.T) {
hm.UpdateStatus(ComponentCollectorManager, HealthStatusHealthy, "restarted", nil)
assert.NoError(t, hm.LivenessCheck()) // passes normally
}

// TestReadinessCheck_StandbyPassesWhenComponentsUnspecified verifies that a
// standby (non-leader) pod reports ready even though none of its registered
// components has reported any status yet.
func TestReadinessCheck_StandbyPassesWhenComponentsUnspecified(t *testing.T) {
	hm := NewHealthManager()
	hm.Register(ComponentCollectorManager)
	hm.Register(ComponentDakrTransport)

	// No UpdateStatus calls: collectors never start on a non-leader, so both
	// components stay Unspecified.
	hm.SetStandby(true)

	err := hm.ReadinessCheck()
	assert.NoError(t, err)
}

// TestReadinessCheck_StandbyClearedEnforcesNormalChecks verifies that once
// standby is cleared (i.e. this pod won leader election), the usual readiness
// rules are back in force.
func TestReadinessCheck_StandbyClearedEnforcesNormalChecks(t *testing.T) {
	hm := NewHealthManager()
	hm.Register(ComponentCollectorManager)
	hm.Register(ComponentDakrTransport)

	// While in standby, readiness passes regardless of component state.
	hm.SetStandby(true)
	assert.NoError(t, hm.ReadinessCheck())

	// Leadership won: both components are still Unspecified, so the normal
	// readiness check must now fail.
	hm.SetStandby(false)
	assert.Error(t, hm.ReadinessCheck())

	// After every component reports healthy, readiness passes again.
	hm.UpdateStatus(ComponentCollectorManager, HealthStatusHealthy, "ok", nil)
	hm.UpdateStatus(ComponentDakrTransport, HealthStatusHealthy, "ok", nil)
	assert.NoError(t, hm.ReadinessCheck())
}
5 changes: 5 additions & 0 deletions internal/transport/telemetry_sender.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ func (s *TelemetrySender) Start(ctx context.Context) error {
s.logger.Info("Starting telemetry sender", "interval", s.interval)
s.isRunning = true

// Mark transport as healthy optimistically so the readiness probe does not
// return 503 while waiting for the first successful send. The circuit
// breaker will downgrade the status if actual sends fail.
s.updateHealthStatus(health.HealthStatusHealthy, "Transport starting", nil)

go s.run(ctx)
return nil
}
Expand Down
Loading