From 77dc2da420f0295d4c60384fba072d8359f32431 Mon Sep 17 00:00:00 2001 From: Miciah Masters Date: Tue, 14 May 2019 15:30:24 -0400 Subject: [PATCH] Add shell script for graceful shutdown Add a shutdown-haproxy shell script, which can be used as the pre-stop handler for a router deployment in order to provide graceful shutdown. Whereas a hard shutdown simply sends a SIGTERM signal to all haproxy processes to make them immediately shut down, a graceful shutdown does the following: 1. Create a marker file that tells the router's /healthz health check to return false. 2. Sleep for 90 seconds to allow time for external load-balancers to observe the failing health check and drain connections to the router. 3. Create a marker file that tells the reload-haproxy script to exit immediately. 4. Send a SIGUSR1 signal to all HAProxy processes to make them stop accepting new connections. 5. Wait up to 30 seconds for all HAProxy processes to terminate. 6. Perform a hard shutdown of any remaining HAProxy processes. This commit is related to bug 1709958. https://bugzilla.redhat.com/show_bug.cgi?id=1709958 * images/router/haproxy/reload-haproxy: Exit immediately if the file /var/lib/haproxy/run/terminating is present. * images/router/haproxy/shutdown-haproxy: New file. Perform a graceful shutdown as described above. * pkg/cmd/infra/router/template.go (Run): Configure a listener for readiness checks specifically intended for external load balancers. In particular, these checks include the new IsDraining check. * pkg/router/metrics/health.go (IsDraining): New function. Return a healthz check that fails if the "draining" marker that the shutdown script creates exists. * pkg/router/metrics/metrics.go (Listener): Rename LiveChecks to PodLiveChecks. Rename ReadyChecks to PodReadyChecks. Add a new field, LBReadyChecks, for readiness checks specific to external load balancers. (handler): Register LBReadyChecks as /healthz, PodLiveChecks as /healthz/live, and PodReadyChecks as /healthz/ready. 
--- images/router/haproxy/reload-haproxy | 4 +++ images/router/haproxy/shutdown-haproxy | 49 ++++++++++++++++++++++++++ pkg/cmd/infra/router/template.go | 15 +++++--- pkg/router/metrics/health.go | 15 ++++++++ pkg/router/metrics/metrics.go | 10 +++--- 5 files changed, 85 insertions(+), 8 deletions(-) create mode 100755 images/router/haproxy/shutdown-haproxy diff --git a/images/router/haproxy/reload-haproxy b/images/router/haproxy/reload-haproxy index 7e6855156..fed5478f8 100755 --- a/images/router/haproxy/reload-haproxy +++ b/images/router/haproxy/reload-haproxy @@ -66,6 +66,10 @@ function haproxyHealthCheck() { } +if [[ -e ${TERMINATE_MARKER:-'/var/lib/haproxy/run/terminating'} ]]; then + exit 0 +fi + old_pids=$(pidof haproxy) reload_status=0 diff --git a/images/router/haproxy/shutdown-haproxy b/images/router/haproxy/shutdown-haproxy new file mode 100755 index 000000000..38c292be3 --- /dev/null +++ b/images/router/haproxy/shutdown-haproxy @@ -0,0 +1,49 @@ +#!/bin/bash + +set -euo pipefail + +# DRAIN_MARKER is the path to a file that indicates that proxies or load +# balancers should be draining connections to the router. The router's health +# check for load balancers checks for this file, and if the file is present, the +# health check fails in order to cause load balancers to drain connections to +# the router. +: ${DRAIN_MARKER:='/var/lib/haproxy/run/draining'} + +# DRAIN_PERIOD is the number of seconds that this script will wait for +# connections to drain before it sends the SIGUSR1 signal to terminate HAProxy +# processes. +: ${DRAIN_PERIOD:=90} + +# TERMINATE_MARKER is the path to a file that indicates that this router is +# terminating. The reload-haproxy script exits immediately if this marker is +# present. 
+: ${TERMINATE_MARKER:='/var/lib/haproxy/run/terminating'} + +# GRACE_PERIOD is the number of seconds that this script will wait for HAProxy +# processes to terminate after it sends the SIGUSR1 signal, before it sends +# SIGTERM to any remaining HAProxy processes. +: ${GRACE_PERIOD:=30} + +echo " - Setting drain marker..." +: > "$DRAIN_MARKER" + +echo " - Sleeping $DRAIN_PERIOD seconds to let connections drain..." +sleep "$DRAIN_PERIOD" + +echo " - Sending SIGUSR1 to HAProxy processes and waiting up to $GRACE_PERIOD seconds for processes to terminate..." +stop=$((SECONDS + GRACE_PERIOD)) +while pkill -USR1 haproxy; rc=$?; [[ $rc -ne 1 ]] && ((SECONDS < stop)); do + sleep 1 +done + +if [[ "$rc" -eq 1 ]]; then + echo ' - Done. All processes have exited.' + exit 0 +fi + +: > "$TERMINATE_MARKER" +echo ' - Sending SIGTERM to HAProxy processes...' +while pkill -TERM haproxy; [[ $? -ne 1 ]]; do + sleep 1 +done +echo ' - Done. All processes have been terminated.' diff --git a/pkg/cmd/infra/router/template.go b/pkg/cmd/infra/router/template.go index 6882c5fd4..b5f3f3857 100644 --- a/pkg/cmd/infra/router/template.go +++ b/pkg/cmd/infra/router/template.go @@ -374,10 +374,16 @@ func (o *TemplateRouterOptions) Run() error { return err } checkController := metrics.ControllerLive() - liveChecks := []healthz.HealthzChecker{checkController} + podLiveChecks := []healthz.HealthzChecker{checkController} if !(isTrue(env("ROUTER_BIND_PORTS_BEFORE_SYNC", ""))) { - liveChecks = append(liveChecks, checkBackend) + podLiveChecks = append(podLiveChecks, checkBackend) } + podReadyChecks := []healthz.HealthzChecker{ + checkBackend, + checkSync, + } + checkDraining := metrics.IsDraining() + lbReadyChecks := append(podReadyChecks, checkDraining) kubeconfig, _, err := o.Config.KubeConfig() if err != nil { @@ -420,8 +426,9 @@ func (o *TemplateRouterOptions) Run() error { Resource: "routers", Name: o.RouterName, }, - LiveChecks: liveChecks, - ReadyChecks: []healthz.HealthzChecker{checkBackend, 
checkSync}, + LBReadyChecks: lbReadyChecks, + PodLiveChecks: podLiveChecks, + PodReadyChecks: podReadyChecks, } if certFile := env("ROUTER_METRICS_TLS_CERT_FILE", ""); len(certFile) > 0 { certificate, err := tls.LoadX509KeyPair(certFile, env("ROUTER_METRICS_TLS_KEY_FILE", "")) diff --git a/pkg/router/metrics/health.go b/pkg/router/metrics/health.go index 9f5f28cff..ce215d1f4 100644 --- a/pkg/router/metrics/health.go +++ b/pkg/router/metrics/health.go @@ -8,6 +8,7 @@ import ( "net" "net/http" "net/url" + "os" "time" "github.com/golang/glog" @@ -53,6 +54,20 @@ func HasSynced(routerPtr **templateplugin.TemplatePlugin) (healthz.HealthzChecke }), nil } +// IsDraining returns a healthz check that fails if the "draining" marker is +// present, indicating that load balancers should be draining connections to the +// router. +func IsDraining() healthz.HealthzChecker { + const drainMarker string = "/var/lib/haproxy/run/draining" + + return healthz.NamedCheck("is-terminating", func(r *http.Request) error { + if _, err := os.Stat(drainMarker); os.IsNotExist(err) { + return nil + } + return fmt.Errorf("Terminating") + }) +} + func ControllerLive() healthz.HealthzChecker { return healthz.NamedCheck("controller", func(r *http.Request) error { return nil diff --git a/pkg/router/metrics/metrics.go b/pkg/router/metrics/metrics.go index f9b72e79d..918859866 100644 --- a/pkg/router/metrics/metrics.go +++ b/pkg/router/metrics/metrics.go @@ -31,14 +31,16 @@ type Listener struct { Authorizer authorizer.Authorizer Record authorizer.AttributesRecord - LiveChecks []healthz.HealthzChecker - ReadyChecks []healthz.HealthzChecker + LBReadyChecks []healthz.HealthzChecker + PodLiveChecks []healthz.HealthzChecker + PodReadyChecks []healthz.HealthzChecker } func (l Listener) handler() http.Handler { mux := http.NewServeMux() - healthz.InstallHandler(mux, l.LiveChecks...) - healthz.InstallPathHandler(mux, "/healthz/ready", l.ReadyChecks...) + healthz.InstallHandler(mux, l.LBReadyChecks...) 
+ healthz.InstallPathHandler(mux, "/healthz/live", l.PodLiveChecks...) + healthz.InstallPathHandler(mux, "/healthz/ready", l.PodReadyChecks...) if l.Authenticator != nil { protected := http.NewServeMux()