From 3d3b575168713c08cc803e41b2b60a6848e41ae9 Mon Sep 17 00:00:00 2001 From: Danil-Grigorev Date: Wed, 12 Aug 2020 15:10:47 +0200 Subject: [PATCH 1/5] Slow the default lease retry and renew rate for MHC controller Prevent machine controllers from writing in etcd at idle too often by setting 30s retry and 90s deadline on all renewals. BZ 1858403 --- cmd/machine-healthcheck/main.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmd/machine-healthcheck/main.go b/cmd/machine-healthcheck/main.go index 812f8b61fb..0873dd7c38 100644 --- a/cmd/machine-healthcheck/main.go +++ b/cmd/machine-healthcheck/main.go @@ -20,6 +20,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/runtime/signals" ) +// The default durations for the leader electrion operations. +var ( + retryPeriod = 30 * time.Second + renewDealine = 90 * time.Second +) + func printVersion() { glog.Infof("Go Version: %s", runtime.Version()) glog.Infof("Go OS/Arch: %s/%s", runtime.GOOS, runtime.GOARCH) @@ -79,6 +85,9 @@ func main() { LeaderElectionNamespace: *leaderElectResourceNamespace, LeaderElectionID: "cluster-api-provider-healthcheck-leader", LeaseDuration: leaderElectLeaseDuration, + // Slow the default retry and renew election rate to reduce etcd writes at idle: BZ 1858400 + RetryPeriod: &retryPeriod, + RenewDeadline: &renewDealine, } if *watchNamespace != "" { From c939c9208557c35a446d4217e5a4d3ea7de9caf7 Mon Sep 17 00:00:00 2001 From: Danil-Grigorev Date: Wed, 12 Aug 2020 16:19:37 +0200 Subject: [PATCH 2/5] Slow the default lease retry and renew rate for MachineSet controller Prevent machine controllers from writing in etcd at idle too often by setting 30s retry and 90s deadline on all renewals. 
BZ 1858403 --- cmd/machineset/main.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmd/machineset/main.go b/cmd/machineset/main.go index 109b306bbb..90eaffcf59 100644 --- a/cmd/machineset/main.go +++ b/cmd/machineset/main.go @@ -39,6 +39,12 @@ const ( defaultWebhookCertdir = "/etc/machine-api-operator/tls" ) +// The default durations for the leader electrion operations. +var ( + retryPeriod = 30 * time.Second + renewDealine = 90 * time.Second +) + func main() { flag.Set("logtostderr", "true") klog.InitFlags(nil) @@ -102,6 +108,9 @@ func main() { LeaderElectionNamespace: *leaderElectResourceNamespace, LeaderElectionID: "cluster-api-provider-machineset-leader", LeaseDuration: leaderElectLeaseDuration, + // Slow the default retry and renew election rate to reduce etcd writes at idle: BZ 1858400 + RetryPeriod: &retryPeriod, + RenewDeadline: &renewDealine, } mgr, err := manager.New(cfg, opts) From 5fdd6efbccb119d4805c373c959330242bbd6e67 Mon Sep 17 00:00:00 2001 From: Danil-Grigorev Date: Wed, 12 Aug 2020 16:20:07 +0200 Subject: [PATCH 3/5] Slow the default lease retry and renew rate for nodelink controller Prevent machine controllers from writing in etcd at idle too often by setting 30s retry and 90s deadline on all renewals. BZ 1858403 --- cmd/nodelink-controller/main.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmd/nodelink-controller/main.go b/cmd/nodelink-controller/main.go index 99572c5a54..047c1f3984 100644 --- a/cmd/nodelink-controller/main.go +++ b/cmd/nodelink-controller/main.go @@ -15,6 +15,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/runtime/signals" ) +// The default durations for the leader electrion operations. 
+var ( + retryPeriod = 30 * time.Second + renewDealine = 90 * time.Second +) + func printVersion() { klog.Infof("Go Version: %s", runtime.Version()) klog.Infof("Go OS/Arch: %s/%s", runtime.GOOS, runtime.GOARCH) @@ -65,6 +71,9 @@ func main() { LeaderElectionNamespace: *leaderElectResourceNamespace, LeaderElectionID: "cluster-api-provider-nodelink-leader", LeaseDuration: leaderElectLeaseDuration, + // Slow the default retry and renew election rate to reduce etcd writes at idle: BZ 1858400 + RetryPeriod: &retryPeriod, + RenewDeadline: &renewDealine, } if *watchNamespace != "" { opts.Namespace = *watchNamespace From 48d9cce5d878be8b45f1d4b6df0345cee0d7593b Mon Sep 17 00:00:00 2001 From: Danil-Grigorev Date: Wed, 12 Aug 2020 16:20:25 +0200 Subject: [PATCH 4/5] Slow the default lease retry and renew rate for vsphere controller Prevent machine controllers from writing in etcd at idle too often by setting 30s retry and 90s deadline on all renewals. BZ 1858403 --- cmd/vsphere/main.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmd/vsphere/main.go b/cmd/vsphere/main.go index 89d3e767f6..61e7e95202 100644 --- a/cmd/vsphere/main.go +++ b/cmd/vsphere/main.go @@ -20,6 +20,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/runtime/signals" ) +// The default durations for the leader electrion operations. 
+var ( + retryPeriod = 30 * time.Second + renewDealine = 90 * time.Second +) + func main() { var printVersion bool flag.BoolVar(&printVersion, "version", false, "print version and exit") @@ -79,6 +85,9 @@ func main() { LeaderElectionNamespace: *leaderElectResourceNamespace, LeaderElectionID: "cluster-api-provider-vsphere-leader", LeaseDuration: leaderElectLeaseDuration, + // Slow the default retry and renew election rate to reduce etcd writes at idle: BZ 1858400 + RetryPeriod: &retryPeriod, + RenewDeadline: &renewDealine, } if *watchNamespace != "" { From fe7cbdc445c681292859752904012bebdd2c2a98 Mon Sep 17 00:00:00 2001 From: Danil-Grigorev Date: Thu, 13 Aug 2020 15:28:01 +0200 Subject: [PATCH 5/5] Reconsider the default lease durations: 120s lease, 110s renew deadline, 90s retry period --- cmd/machine-healthcheck/main.go | 7 ++++--- cmd/machineset/main.go | 7 ++++--- cmd/nodelink-controller/main.go | 7 ++++--- cmd/vsphere/main.go | 7 ++++--- pkg/operator/sync.go | 2 +- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/cmd/machine-healthcheck/main.go b/cmd/machine-healthcheck/main.go index 0873dd7c38..ded2554d13 100644 --- a/cmd/machine-healthcheck/main.go +++ b/cmd/machine-healthcheck/main.go @@ -22,8 +22,9 @@ import ( // The default durations for the leader electrion operations. var ( - retryPeriod = 30 * time.Second - renewDealine = 90 * time.Second + leaseDuration = 120 * time.Second + renewDealine = 110 * time.Second + retryPeriod = 90 * time.Second ) func printVersion() { @@ -65,7 +66,7 @@ func main() { leaderElectLeaseDuration := flag.Duration( "leader-elect-lease-duration", - 90*time.Second, + leaseDuration, "The duration that non-leader candidates will wait after observing a leadership renewal until attempting to acquire leadership of a led but unrenewed leader slot. This is effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. 
This is only applicable if leader election is enabled.", ) diff --git a/cmd/machineset/main.go b/cmd/machineset/main.go index 90eaffcf59..a975fb2474 100644 --- a/cmd/machineset/main.go +++ b/cmd/machineset/main.go @@ -41,8 +41,9 @@ const ( // The default durations for the leader electrion operations. var ( - retryPeriod = 30 * time.Second - renewDealine = 90 * time.Second + leaseDuration = 120 * time.Second + renewDealine = 110 * time.Second + retryPeriod = 90 * time.Second ) func main() { @@ -81,7 +82,7 @@ func main() { leaderElectLeaseDuration := flag.Duration( "leader-elect-lease-duration", - 90*time.Second, + leaseDuration, "The duration that non-leader candidates will wait after observing a leadership renewal until attempting to acquire leadership of a led but unrenewed leader slot. This is effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. This is only applicable if leader election is enabled.", ) diff --git a/cmd/nodelink-controller/main.go b/cmd/nodelink-controller/main.go index 047c1f3984..28367dc825 100644 --- a/cmd/nodelink-controller/main.go +++ b/cmd/nodelink-controller/main.go @@ -17,8 +17,9 @@ import ( // The default durations for the leader electrion operations. var ( - retryPeriod = 30 * time.Second - renewDealine = 90 * time.Second + leaseDuration = 120 * time.Second + renewDealine = 110 * time.Second + retryPeriod = 90 * time.Second ) func printVersion() { @@ -50,7 +51,7 @@ func main() { leaderElectLeaseDuration := flag.Duration( "leader-elect-lease-duration", - 90*time.Second, + leaseDuration, "The duration that non-leader candidates will wait after observing a leadership renewal until attempting to acquire leadership of a led but unrenewed leader slot. This is effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. 
This is only applicable if leader election is enabled.", ) diff --git a/cmd/vsphere/main.go b/cmd/vsphere/main.go index 61e7e95202..f44807572a 100644 --- a/cmd/vsphere/main.go +++ b/cmd/vsphere/main.go @@ -22,8 +22,9 @@ import ( // The default durations for the leader electrion operations. var ( - retryPeriod = 30 * time.Second - renewDealine = 90 * time.Second + leaseDuration = 120 * time.Second + renewDealine = 110 * time.Second + retryPeriod = 90 * time.Second ) func main() { @@ -51,7 +52,7 @@ func main() { leaderElectLeaseDuration := flag.Duration( "leader-elect-lease-duration", - 90*time.Second, + leaseDuration, "The duration that non-leader candidates will wait after observing a leadership renewal until attempting to acquire leadership of a led but unrenewed leader slot. This is effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. This is only applicable if leader election is enabled.", ) diff --git a/pkg/operator/sync.go b/pkg/operator/sync.go index 02b5d75c88..b4c455dcbd 100644 --- a/pkg/operator/sync.go +++ b/pkg/operator/sync.go @@ -446,7 +446,7 @@ func newContainers(config *OperatorConfig, features map[string]bool) []corev1.Co "--logtostderr=true", "--v=3", "--leader-elect=true", - "--leader-elect-lease-duration=90s", + "--leader-elect-lease-duration=120s", fmt.Sprintf("--namespace=%s", config.TargetNamespace), }