From 018e66ff4ce705e72641b6273da6487468f1a6c6 Mon Sep 17 00:00:00 2001 From: Joseph Callen Date: Mon, 21 Sep 2020 10:31:05 -0400 Subject: [PATCH] vSphere: don't failover api vip if lb is responding Ports #1893 to the vSphere platform. vSphere CI is experiencing the issues described in the above PR. --- .../vsphere/files/vsphere-keepalived.yaml | 54 ++++++++++++++++--- .../files/vsphere-haproxy-haproxy.yaml | 2 +- .../vsphere/files/vsphere-haproxy.yaml | 2 +- .../files/vsphere-keepalived-keepalived.yaml | 51 ++++++++++++++++-- .../files/vsphere-keepalived-script-both.yaml | 6 +++ .../files/vsphere-keepalived-script.yaml | 6 +-- 6 files changed, 104 insertions(+), 17 deletions(-) create mode 100644 templates/master/00-master/vsphere/files/vsphere-keepalived-script-both.yaml diff --git a/templates/common/vsphere/files/vsphere-keepalived.yaml b/templates/common/vsphere/files/vsphere-keepalived.yaml index b9653c08fc..077e0b70e2 100644 --- a/templates/common/vsphere/files/vsphere-keepalived.yaml +++ b/templates/common/vsphere/files/vsphere-keepalived.yaml @@ -21,14 +21,46 @@ contents: - name: resource-dir hostPath: path: "/etc/kubernetes/static-pod-resources/keepalived" + - name: script-dir + hostPath: + path: "/etc/kubernetes/static-pod-resources/keepalived/scripts" - name: kubeconfig hostPath: - path: "/etc/kubernetes/kubeconfig" + path: "/etc/kubernetes" + - name: kubeconfigvarlib + hostPath: + path: "/var/lib/kubelet" - name: conf-dir hostPath: path: "/etc/keepalived" - name: run-dir empty-dir: {} + - name: chroot-host + hostPath: + path: "/" + initContainers: + - name: render-config-keepalived + image: {{ .Images.baremetalRuntimeCfgImage }} + command: + - runtimecfg + - render + - "/etc/kubernetes/kubeconfig" + - "--api-vip" + - "{{ .Infra.Status.PlatformStatus.VSphere.APIServerInternalIP }}" + - "--ingress-vip" + - "{{ .Infra.Status.PlatformStatus.VSphere.IngressIP }}" + - "/config" + - "--out-dir" + - "/etc/keepalived" + resources: {} + volumeMounts: + - name: kubeconfig + mountPath: "/etc/kubernetes" + - name: script-dir + mountPath: "/config" + - name: conf-dir + mountPath: "/etc/keepalived" + imagePullPolicy: IfNotPresent containers: - name: keepalived securityContext: @@ -84,19 +116,25 @@ contents: livenessProbe: exec: command: - - /bin/sh + - /bin/bash - -c - | - [[ -s /etc/keepalived/keepalived.conf ]] || \ kill -s SIGUSR1 "$(pgrep -o keepalived)" && ! grep -q "State = FAULT" /tmp/keepalived.data - initialDelaySeconds: 10 + initialDelaySeconds: 20 terminationMessagePolicy: FallbackToLogsOnError imagePullPolicy: IfNotPresent - name: keepalived-monitor + securityContext: + privileged: true image: {{ .Images.baremetalRuntimeCfgImage }} + env: + - name: ENABLE_UNICAST + value: "no" + - name: IS_BOOTSTRAP + value: "no" command: - dynkeepalived - - "/etc/kubernetes/kubeconfig" + - "/var/lib/kubelet/kubeconfig" - "/config/keepalived.conf.tmpl" - "/etc/keepalived/keepalived.conf" - "--api-vip" @@ -110,12 +148,14 @@ contents: volumeMounts: - name: resource-dir mountPath: "/config" - - name: kubeconfig - mountPath: "/etc/kubernetes/kubeconfig" + - name: kubeconfigvarlib + mountPath: "/var/lib/kubelet" - name: conf-dir mountPath: "/etc/keepalived" - name: run-dir mountPath: "/var/run/keepalived" + - name: chroot-host + mountPath: "/host" imagePullPolicy: IfNotPresent hostNetwork: true tolerations: diff --git a/templates/master/00-master/vsphere/files/vsphere-haproxy-haproxy.yaml b/templates/master/00-master/vsphere/files/vsphere-haproxy-haproxy.yaml index 8d8beeff09..f431d7f185 100644 --- a/templates/master/00-master/vsphere/files/vsphere-haproxy-haproxy.yaml +++ b/templates/master/00-master/vsphere/files/vsphere-haproxy-haproxy.yaml @@ -25,7 +25,7 @@ contents: listen health_check_http_url bind :::50936 v4v6 mode http - monitor-uri /readyz + monitor-uri /haproxy_ready option dontlognull listen stats bind localhost:{{`{{ .LBConfig.StatPort }}`}} diff --git a/templates/master/00-master/vsphere/files/vsphere-haproxy.yaml b/templates/master/00-master/vsphere/files/vsphere-haproxy.yaml index 9c4529a723..efdbbe2f53 100644 --- a/templates/master/00-master/vsphere/files/vsphere-haproxy.yaml +++ b/templates/master/00-master/vsphere/files/vsphere-haproxy.yaml @@ -105,7 +105,7 @@ contents: livenessProbe: initialDelaySeconds: 10 httpGet: - path: /readyz + path: /haproxy_ready port: 50936 terminationMessagePolicy: FallbackToLogsOnError imagePullPolicy: IfNotPresent diff --git a/templates/master/00-master/vsphere/files/vsphere-keepalived-keepalived.yaml b/templates/master/00-master/vsphere/files/vsphere-keepalived-keepalived.yaml index 7d7f94a59e..313eebe867 100644 --- a/templates/master/00-master/vsphere/files/vsphere-keepalived-keepalived.yaml +++ b/templates/master/00-master/vsphere/files/vsphere-keepalived-keepalived.yaml @@ -12,10 +12,31 @@ contents: script_user root } - vrrp_script chk_ocp { - script "/usr/bin/timeout 0.9 /etc/keepalived/chk_ocp_script.sh" - interval 1 - weight 50 + # These are separate checks to provide the following behavior: + # If the loadbalanced endpoint is responding then all is well regardless + # of what the local api status is. Both checks will return success and + # we'll have the maximum priority. This means as long as there is a node + # with a functional loadbalancer it will get the VIP. + # If all of the loadbalancers go down but the local api is still running, + # the _both check will still succeed and allow any node with a functional + # api to take the VIP. This isn't preferred because it means all api + # traffic will go through one node, but at least it keeps the api available. + vrrp_script chk_ocp_lb { + script "/usr/bin/timeout 1.9 /etc/keepalived/chk_ocp_script.sh" + interval 2 + weight 20 + rise 3 + fall 2 + } + + vrrp_script chk_ocp_both { + script "/usr/bin/timeout 1.9 /etc/keepalived/chk_ocp_script_both.sh" + interval 2 + # Use a smaller weight for this check so it won't trigger the move from + # bootstrap to master by itself. + weight 5 + rise 3 + fall 2 } # TODO: Improve this check. The port is assumed to be alive. @@ -26,12 +47,23 @@ contents: weight 50 } + {{`{{$nonVirtualIP := .NonVirtualIP}}`}} + vrrp_instance {{`{{ .Cluster.Name }}`}}_API { state BACKUP interface {{`{{ .VRRPInterface }}`}} virtual_router_id {{`{{ .Cluster.APIVirtualRouterID }}`}} priority 40 advert_int 1 + {{`{{if .EnableUnicast}}`}} + unicast_src_ip {{`{{.NonVirtualIP}}`}} + unicast_peer { + {{`{{ .BootstrapIP }}`}} + {{`{{range .LBConfig.Backends}} + {{if ne $nonVirtualIP .Address}}{{.Address}}{{end}} + {{end}}`}} + } + {{`{{end}}`}} authentication { auth_type PASS auth_pass {{`{{ .Cluster.Name }}`}}_api_vip @@ -40,7 +72,8 @@ contents: {{`{{ .Cluster.APIVIP }}`}}/{{`{{ .Cluster.VIPNetmask }}`}} } track_script { - chk_ocp + chk_ocp_lb + chk_ocp_both } } @@ -50,6 +83,14 @@ contents: virtual_router_id {{`{{ .Cluster.IngressVirtualRouterID }}`}} priority 40 advert_int 1 + {{`{{if .EnableUnicast}}`}} + unicast_src_ip {{`{{.NonVirtualIP}}`}} + unicast_peer { + {{`{{range .IngressConfig.Peers}} + {{if ne $nonVirtualIP .}}{{.}}{{end}} + {{end}}`}} + } + {{`{{end}}`}} authentication { auth_type PASS auth_pass {{`{{ .Cluster.Name }}`}}_ingress_vip diff --git a/templates/master/00-master/vsphere/files/vsphere-keepalived-script-both.yaml b/templates/master/00-master/vsphere/files/vsphere-keepalived-script-both.yaml new file mode 100644 index 0000000000..1b4e597711 --- /dev/null +++ b/templates/master/00-master/vsphere/files/vsphere-keepalived-script-both.yaml @@ -0,0 +1,6 @@ +mode: 0755 +path: "/etc/kubernetes/static-pod-resources/keepalived/scripts/chk_ocp_script_both.sh.tmpl" +contents: + inline: | + #!/bin/bash + /usr/bin/curl -o /dev/null -kLfs https://localhost:{{`{{ .LBConfig.LbPort }}`}}/readyz && [ -e /var/run/keepalived/iptables-rule-exists ] || /usr/bin/curl -kLfs https://localhost:{{`{{ .LBConfig.ApiPort }}`}}/readyz diff --git a/templates/master/00-master/vsphere/files/vsphere-keepalived-script.yaml b/templates/master/00-master/vsphere/files/vsphere-keepalived-script.yaml index e0d45b34bf..c500a60d2f 100644 --- a/templates/master/00-master/vsphere/files/vsphere-keepalived-script.yaml +++ b/templates/master/00-master/vsphere/files/vsphere-keepalived-script.yaml @@ -1,6 +1,6 @@ mode: 0755 -path: "/etc/keepalived/chk_ocp_script.sh" +path: "/etc/kubernetes/static-pod-resources/keepalived/scripts/chk_ocp_script.sh.tmpl" contents: inline: | - #!/bin/bash - /usr/bin/curl -o /dev/null -kLfs https://localhost:6443/readyz && /usr/bin/curl -o /dev/null -kLfs http://localhost:50936/readyz + #!/bin/bash + /usr/bin/curl -o /dev/null -kLfs https://localhost:{{`{{ .LBConfig.LbPort }}`}}/readyz && [ -e /var/run/keepalived/iptables-rule-exists ]