From 5acd73c7836f947d532f92b53afa848879fd3aad Mon Sep 17 00:00:00 2001 From: Danil-Grigorev Date: Wed, 27 May 2020 15:41:15 +0200 Subject: [PATCH 1/2] Add healthchecks for Machine, MachineSet and MHC controllers OCPCLOUD-785 - health checks for all machine API controllers This introduces support for readinessProbe and livenessProbe [1] for the owned machine controllers deployment and its machineSet, MHC and machine controller containers. This lets the kubelet better track the lifecycle of these containers, which in turn lets us more robustly signal operator degradation on the clusterOperator status. This PR needs [2], [3] and [4] to work and pass CI so the probes included in the container spec here can get a 200 from the machine controllers. Additionally, [5], [6] and [7] must do the same, or the probes included in the container spec here will fail and result in the containers getting restarted. This also reverts an accidental rebase change and puts back the syncPeriod which was dropped by [8]. 
[1] https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes [2] openshift/cluster-api-provider-aws#329 [3] openshift/cluster-api-provider-azure#139 [4] openshift/cluster-api-provider-gcp#96 [5] https://github.com/openshift/cluster-api-provider-openstack [6] https://github.com/openshift/cluster-api-provider-ovirt [7] https://github.com/metal3-io/cluster-api-provider-metal3 [8] https://github.com/openshift/machine-api-operator/pull/590/files#diff-7417e4bc31a1bacc1a431704bee56978L41 --- cmd/machine-healthcheck/main.go | 33 +++++++++++++++-- cmd/machineset/main.go | 22 +++++++++-- cmd/vsphere/main.go | 18 ++++++++- pkg/operator/sync.go | 66 +++++++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 8 deletions(-) diff --git a/cmd/machine-healthcheck/main.go b/cmd/machine-healthcheck/main.go index 60a478a60a..cb8ae2be96 100644 --- a/cmd/machine-healthcheck/main.go +++ b/cmd/machine-healthcheck/main.go @@ -12,7 +12,9 @@ import ( "github.com/openshift/machine-api-operator/pkg/controller" sdkVersion "github.com/operator-framework/operator-sdk/version" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" + "k8s.io/klog" "sigs.k8s.io/controller-runtime/pkg/client/config" + "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/runtime/signals" ) @@ -24,8 +26,24 @@ func printVersion() { } func main() { - watchNamespace := flag.String("namespace", "", "Namespace that the controller watches to reconcile machine-api objects. If unspecified, the controller watches for machine-api objects across all namespaces.") - metricsAddress := flag.String("metrics-bind-address", metrics.DefaultHealthCheckMetricsAddress, "Address for hosting metrics") + watchNamespace := flag.String( + "namespace", + "", + "Namespace that the controller watches to reconcile machine-api objects. 
If unspecified, the controller watches for machine-api objects across all namespaces.", + ) + + metricsAddress := flag.String( + "metrics-bind-address", + metrics.DefaultHealthCheckMetricsAddress, + "Address for hosting metrics", + ) + + healthAddr := flag.String( + "health-addr", + ":9442", + "The address for health checking.", + ) + flag.Parse() printVersion() @@ -36,7 +54,8 @@ func main() { } opts := manager.Options{ - MetricsBindAddress: *metricsAddress, + MetricsBindAddress: *metricsAddress, + HealthProbeBindAddress: *healthAddr, } if *watchNamespace != "" { opts.Namespace = *watchNamespace @@ -63,6 +82,14 @@ func main() { glog.Fatal(err) } + if err := mgr.AddReadyzCheck("ping", healthz.Ping); err != nil { + klog.Fatal(err) + } + + if err := mgr.AddHealthzCheck("ping", healthz.Ping); err != nil { + klog.Fatal(err) + } + glog.Info("Starting the Cmd.") // Start the Cmd diff --git a/cmd/machineset/main.go b/cmd/machineset/main.go index 91d3d57546..0a2b531e4f 100644 --- a/cmd/machineset/main.go +++ b/cmd/machineset/main.go @@ -28,6 +28,7 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" "k8s.io/klog" "sigs.k8s.io/controller-runtime/pkg/client/config" + "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/runtime/signals" "sigs.k8s.io/controller-runtime/pkg/webhook" @@ -54,6 +55,12 @@ func main() { webhookCertdir := flag.String("webhook-cert-dir", defaultWebhookCertdir, "Webhook cert dir, only used when webhook-enabled is true.") + healthAddr := flag.String( + "health-addr", + ":9441", + "The address for health checking.", + ) + flag.Parse() if *watchNamespace != "" { log.Printf("Watching cluster-api objects only in namespace %q for reconciliation.", *watchNamespace) @@ -69,9 +76,10 @@ func main() { // Create a new Cmd to provide shared dependencies and start components syncPeriod := 10 * time.Minute opts := manager.Options{ - MetricsBindAddress: *metricsAddress, - SyncPeriod: 
&syncPeriod, - Namespace: *watchNamespace, + MetricsBindAddress: *metricsAddress, + SyncPeriod: &syncPeriod, + Namespace: *watchNamespace, + HealthProbeBindAddress: *healthAddr, } mgr, err := manager.New(cfg, opts) @@ -109,6 +117,14 @@ func main() { log.Fatal(err) } + if err := mgr.AddReadyzCheck("ping", healthz.Ping); err != nil { + klog.Fatal(err) + } + + if err := mgr.AddHealthzCheck("ping", healthz.Ping); err != nil { + klog.Fatal(err) + } + log.Printf("Starting the Cmd.") // Start the Cmd diff --git a/cmd/vsphere/main.go b/cmd/vsphere/main.go index 9b2b79b04d..2e2679b764 100644 --- a/cmd/vsphere/main.go +++ b/cmd/vsphere/main.go @@ -14,6 +14,7 @@ import ( "github.com/openshift/machine-api-operator/pkg/version" "k8s.io/klog" "sigs.k8s.io/controller-runtime/pkg/client/config" + "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/runtime/signals" ) @@ -26,6 +27,11 @@ func main() { watchNamespace := flag.String("namespace", "", "Namespace that the controller watches to reconcile machine-api objects. 
If unspecified, the controller watches for machine-api objects across all namespaces.") metricsAddress := flag.String("metrics-bind-address", metrics.DefaultMachineMetricsAddress, "Address for hosting metrics") flag.Set("logtostderr", "true") + healthAddr := flag.String( + "health-addr", + ":9440", + "The address for health checking.", + ) flag.Parse() if printVersion { @@ -34,9 +40,9 @@ func main() { } cfg := config.GetConfigOrDie() - opts := manager.Options{ - MetricsBindAddress: *metricsAddress, + MetricsBindAddress: *metricsAddress, + HealthProbeBindAddress: *healthAddr, } if *watchNamespace != "" { opts.Namespace = *watchNamespace @@ -70,6 +76,14 @@ func main() { capimachine.AddWithActuator(mgr, machineActuator) + if err := mgr.AddReadyzCheck("ping", healthz.Ping); err != nil { + klog.Fatal(err) + } + + if err := mgr.AddHealthzCheck("ping", healthz.Ping); err != nil { + klog.Fatal(err) + } + if err := mgr.Start(signals.SetupSignalHandler()); err != nil { klog.Fatalf("Failed to run manager: %v", err) } diff --git a/pkg/operator/sync.go b/pkg/operator/sync.go index 0e4298b15b..4dc6e67fd7 100644 --- a/pkg/operator/sync.go +++ b/pkg/operator/sync.go @@ -16,6 +16,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/utils/pointer" ) @@ -30,6 +31,9 @@ const ( machineExposeMetricsPort = 8441 machineSetExposeMetricsPort = 8442 machineHealthCheckExposeMetricsPort = 8444 + defaultMachineHealthPort = 9440 + defaultMachineSetHealthPort = 9441 + defaultMachineHealthCheckHealthPort = 9442 kubeRBACConfigName = "config" certStoreName = "machine-api-controllers-tls" ) @@ -379,6 +383,26 @@ func newContainers(config *OperatorConfig, features map[string]bool) []corev1.Co Name: "webhook-server", ContainerPort: 8443, }, + { + Name: "healthz", + ContainerPort: defaultMachineSetHealthPort, + }, + }, + 
ReadinessProbe: &corev1.Probe{ + Handler: corev1.Handler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/healthz", + Port: intstr.Parse("healthz"), + }, + }, + }, + LivenessProbe: &corev1.Probe{ + Handler: corev1.Handler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/readyz", + Port: intstr.Parse("healthz"), + }, + }, }, VolumeMounts: []corev1.VolumeMount{ { @@ -404,6 +428,26 @@ func newContainers(config *OperatorConfig, features map[string]bool) []corev1.Co }, }, }, + Ports: []corev1.ContainerPort{{ + Name: "healthz", + ContainerPort: defaultMachineHealthPort, + }}, + ReadinessProbe: &corev1.Probe{ + Handler: corev1.Handler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/healthz", + Port: intstr.Parse("healthz"), + }, + }, + }, + LivenessProbe: &corev1.Probe{ + Handler: corev1.Handler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/readyz", + Port: intstr.Parse("healthz"), + }, + }, + }, }, { Name: "nodelink-controller", @@ -418,6 +462,28 @@ func newContainers(config *OperatorConfig, features map[string]bool) []corev1.Co Command: []string{"/machine-healthcheck"}, Args: args, Resources: resources, + Ports: []corev1.ContainerPort{ + { + Name: "healthz", + ContainerPort: defaultMachineHealthCheckHealthPort, + }, + }, + ReadinessProbe: &corev1.Probe{ + Handler: corev1.Handler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/healthz", + Port: intstr.Parse("healthz"), + }, + }, + }, + LivenessProbe: &corev1.Probe{ + Handler: corev1.Handler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/readyz", + Port: intstr.Parse("healthz"), + }, + }, + }, }, } return containers From 64261c8acfcc75df44f11a351913ea1943c6e27a Mon Sep 17 00:00:00 2001 From: Alexander Demichev Date: Mon, 1 Jun 2020 12:11:17 +0200 Subject: [PATCH 2/2] [vSphere] Reduce sync period to 10 minutes - Reintroducing a fix dropped in 4c9abf94ef93ee1b495a8a9443ca1bda18d7171e --- cmd/vsphere/main.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmd/vsphere/main.go b/cmd/vsphere/main.go index 2e2679b764..1b44c4a6e3 100644 --- 
a/cmd/vsphere/main.go +++ b/cmd/vsphere/main.go @@ -4,6 +4,7 @@ import ( "flag" "fmt" "os" + "time" configv1 "github.com/openshift/api/config/v1" "github.com/openshift/machine-api-operator/pkg/apis/machine/v1beta1" @@ -40,9 +41,12 @@ func main() { } cfg := config.GetConfigOrDie() + syncPeriod := 10 * time.Minute + opts := manager.Options{ MetricsBindAddress: *metricsAddress, HealthProbeBindAddress: *healthAddr, + SyncPeriod: &syncPeriod, } if *watchNamespace != "" { opts.Namespace = *watchNamespace