From 40cd15df9626c1ae0de71daff6c696468546bc59 Mon Sep 17 00:00:00 2001 From: Antonio Nesic Date: Wed, 25 Feb 2026 15:50:57 +0100 Subject: [PATCH] fix: start health server before manager init to prevent 503 on upgrades Move HealthManager creation and health server startup to before ctrl.NewManager() so K8s probes are answered immediately, preventing connection refused and 503 errors during slow initialization. --- cmd/main.go | 41 ++++++++++++++++++++++------------- internal/controller/custom.go | 10 +-------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index c3e4ff58..f439b343 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -101,6 +101,31 @@ func main() { tlsOpts = append(tlsOpts, disableHTTP2) } + // Create HealthManager and start health server early so probes are + // answered immediately, before the (potentially slow) manager and + // controller initialisation. + healthManager := health.NewHealthManager() + healthManager.Register(health.ComponentCollectorManager) + healthManager.Register(health.ComponentBufferQueue) + healthManager.Register(health.ComponentDakrTransport) + healthManager.Register(health.ComponentMpaServer) + healthManager.Register(health.ComponentPrometheus) + + healthServer := health.NewHealthServer(healthManager, probeAddr) + if err := healthServer.Start(); err != nil { + setupLog.Error(err, "unable to start health server") + os.Exit(1) + } + defer func() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := healthServer.Stop(ctx); err != nil { + setupLog.Error(err, "error stopping health server") + } + }() + + setupLog.Info("health server started, initializing manager") + webhookServer := webhook.NewServer(webhook.Options{ TLSOpts: tlsOpts, }) @@ -148,7 +173,7 @@ func main() { } // Setup the env-based controller instead of the standard controller - envController, err := controller.NewEnvBasedController(mgr, reconcileInterval, mpaServerPort) + envController, err := controller.NewEnvBasedController(mgr, healthManager, reconcileInterval, mpaServerPort) if err != nil { setupLog.Error(err, "unable to create environment-based controller") os.Exit(1) @@ -164,20 +189,6 @@ func main() { // No need to add the standard controller with kubebuilder:scaffold:builder // The env-based controller doesn't rely on CRDs - // New health server from health package - healthServer := health.NewHealthServer(envController.Reconciler.HealthManager, probeAddr) - if err := healthServer.Start(); err != nil { - setupLog.Error(err, "unable to start health server") - os.Exit(1) - } - defer func() { - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - if err := healthServer.Stop(ctx); err != nil { - setupLog.Error(err, "error stopping health server") - } - }() - setupLog.Info("starting manager") if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { setupLog.Error(err, "problem running manager") diff --git a/internal/controller/custom.go b/internal/controller/custom.go index 4c5c3c3d..89a185b6 100644 --- a/internal/controller/custom.go +++ b/internal/controller/custom.go @@ -62,7 +62,7 @@ type EnvBasedController struct { } // NewEnvBasedController creates a new environment-based controller -func NewEnvBasedController(mgr ctrl.Manager, reconcileInterval time.Duration, mpaServerPort int) (*EnvBasedController, error) { +func NewEnvBasedController(mgr ctrl.Manager, healthManager *health.HealthManager, reconcileInterval time.Duration, mpaServerPort int) (*EnvBasedController, error) { // Set up basic components logger := util.NewLogger("env-controller") zapLogger, err := zap.NewProduction() @@ -97,14 +97,6 @@ func NewEnvBasedController(mgr ctrl.Manager, reconcileInterval time.Duration, mp return nil, fmt.Errorf("failed to create apiextensions client: %w", err) } - // Initialize HealthManager and register components - healthManager := health.NewHealthManager() - healthManager.Register(health.ComponentCollectorManager) - healthManager.Register(health.ComponentBufferQueue) - healthManager.Register(health.ComponentDakrTransport) - healthManager.Register(health.ComponentMpaServer) - healthManager.Register(health.ComponentPrometheus) - // Create a shared Telemetry metrics instance sharedTelemetryMetrics := collector.NewTelemetryMetrics()