From f95dbea7c390d1b3f21b031708c279295a2288e7 Mon Sep 17 00:00:00 2001 From: Anil Vishnoi Date: Wed, 29 Apr 2020 09:49:45 -0700 Subject: [PATCH 1/4] Expose raft (nb-db/sb-db) election-timer and ovn-controller inactivity-probe. These timers are currently set to fixed values based on my current observations from scale tests. We might have to increase these values in the future based on the scale we will be supporting for the upcoming release. Currently, election-timer values are also limited by the raft jsonrpc inactivity-probe time of 5 seconds. To further increase the election-timer value, we need to disable the jsonrpc inactivity-probe. Signed-off-by: Anil Vishnoi --- .gitignore | 2 + .../ovn-kubernetes/ovnkube-master.yaml | 122 +++++++++++++++++- .../network/ovn-kubernetes/ovnkube-node.yaml | 3 + ...luster-network-operator_03_deployment.yaml | 8 +- pkg/network/ovn_kubernetes.go | 3 + 5 files changed, 136 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 1fd1730a08..4373669dcf 100644 --- a/.gitignore +++ b/.gitignore @@ -106,5 +106,7 @@ Session.vim !.vscode/extensions.json .history +### GoLand files ### +.idea # End of https://www.gitignore.io/api/go,vim,emacs,visualstudiocode diff --git a/bindata/network/ovn-kubernetes/ovnkube-master.yaml b/bindata/network/ovn-kubernetes/ovnkube-master.yaml index 90899ec2b7..87bc719384 100644 --- a/bindata/network/ovn-kubernetes/ovnkube-master.yaml +++ b/bindata/network/ovn-kubernetes/ovnkube-master.yaml @@ -149,8 +149,11 @@ spec: - /bin/bash - -c - | + set -x MASTER_IP="{{.OVN_MASTER_IP}}" if [[ "${K8S_NODE_IP}" == "${MASTER_IP}" ]]; then + + # set the connection and disable inactivity probe retries=0 while ! ovn-nbctl --no-leader-only -t 5 set-connection pssl:{{.OVN_NB_PORT}}{{.LISTEN_DUAL_STACK}} -- set connection . 
inactivity_probe=60000; do (( retries += 1 )) @@ -161,6 +164,61 @@ spec: sleep 2 done fi + + election_timer="${OVN_NB_RAFT_ELECTION_TIMER}" + echo "Setting nb-db raft election timer to ${election_timer} ms" + retries=0 + while current_election_timer=$(ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound 2>/dev/null \ + | grep "Election" | sed "s/.*:[[:space:]]//"); do + if [[ -z "${current_election_timer}" ]]; then + (( retries += 1 )) + if [[ "${retries}" -gt 10 ]]; then + echo "Failed to get current nb-db raft election timer value after multiple attempts. Exiting..." + exit 1 + fi + sleep 2 + else + break + fi + done + + retries=0 + while is_candidate=$(ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound 2>/dev/null \ + | grep "Role: candidate" ); do + if [[ ! -z "${is_candidate}" ]]; then + (( retries += 1 )) + if [[ "${retries}" -gt 10 ]]; then + echo "Cluster node (nb-db raft) is in candidate role for prolonged time. Continuing..." + fi + sleep 2 + else + break + fi + done + + is_leader=$(ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound 2>/dev/null \ + | grep "Role: leader") + if [[ ! -z "${is_leader}" ]]; then + while [[ ${current_election_timer} != ${election_timer} ]]; do + max_electinon_timer=$((${current_election_timer} * 2)) + if [[ ${election_timer} -le ${max_electinon_timer} ]]; then + ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound ${election_timer} + if [[ $? != 0 ]]; then + echo "Failed to set nb-db raft election timer ${election_timer}. Exiting..." + exit 2 + fi + current_election_timer=${electinon_timer} + else + ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound ${max_electinon_timer} + if [[ $? != 0 ]]; then + echo "Failed to set nb-db raft election timer ${max_election_timer}. Exiting..." 
+ exit 2 + fi + current_election_timer=${max_electinon_timer} + fi + done + fi + readinessProbe: initialDelaySeconds: 30 exec: @@ -173,6 +231,8 @@ spec: env: - name: OVN_LOG_LEVEL value: info + - name: OVN_NB_RAFT_ELECTION_TIMER + value: "{{.OVN_NB_RAFT_ELECTION_TIMER}}" - name: K8S_NODE_IP valueFrom: fieldRef: @@ -212,7 +272,7 @@ spec: - /bin/bash - -c - | - set -xe + set -x if [[ -f /env/_master ]]; then set -o allexport source /env/_master @@ -256,8 +316,11 @@ spec: - /bin/bash - -c - | + set -x MASTER_IP="{{.OVN_MASTER_IP}}" if [[ "${K8S_NODE_IP}" == "${MASTER_IP}" ]]; then + + # set the connection and disable inactivity probe retries=0 while ! ovn-sbctl --no-leader-only -t 5 set-connection pssl:{{.OVN_SB_PORT}}{{.LISTEN_DUAL_STACK}} -- set connection . inactivity_probe=60000; do (( retries += 1 )) @@ -268,6 +331,61 @@ spec: sleep 2 done fi + + election_timer="${OVN_SB_RAFT_ELECTION_TIMER}" + echo "Setting sb-db raft election timer to ${election_timer} ms" + retries=0 + while current_election_timer=$(ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound 2>/dev/null \ + | grep "Election" | sed "s/.*:[[:space:]]//"); do + if [[ -z "${current_election_timer}" ]]; then + (( retries += 1 )) + if [[ "${retries}" -gt 10 ]]; then + echo "Failed to get current sb-db raft election timer value after multiple attempts. Exiting..." + exit 1 + fi + sleep 2 + else + break + fi + done + + retries=0 + while is_candidate=$(ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound 2>/dev/null \ + | grep "Role: candidate" ); do + if [[ ! -z "${is_candidate}" ]]; then + (( retries += 1 )) + if [[ "${retries}" -gt 10 ]]; then + echo "Cluster node (sb-db raft) is in candidate role for prolonged time. Continuing..." + fi + sleep 2 + else + break + fi + done + + is_leader=$(ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound 2>/dev/null \ + | grep "Role: leader") + if [[ ! 
-z "${is_leader}" ]]; then + while [[ ${current_election_timer} != ${election_timer} ]]; do + max_electinon_timer=$((${current_election_timer} * 2)) + if [[ ${election_timer} -le ${max_electinon_timer} ]]; then + ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${election_timer} + if [[ $? != 0 ]]; then + echo "Failed to set sb-db raft election timer ${election_timer}. Exiting..." + exit 2 + fi + current_election_timer=${electinon_timer} + else + ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${max_electinon_timer} + if [[ $? != 0 ]]; then + echo "Failed to set sb-db raft election timer ${max_election_timer}. Exiting..." + exit 2 + fi + current_election_timer=${max_electinon_timer} + fi + done + fi + readinessProbe: initialDelaySeconds: 30 exec: @@ -280,6 +398,8 @@ spec: env: - name: OVN_LOG_LEVEL value: info + - name: OVN_SB_RAFT_ELECTION_TIMER + value: "{{.OVN_SB_RAFT_ELECTION_TIMER}}" - name: K8S_NODE_IP valueFrom: fieldRef: diff --git a/bindata/network/ovn-kubernetes/ovnkube-node.yaml b/bindata/network/ovn-kubernetes/ovnkube-node.yaml index 821712d502..4fc53ce46a 100644 --- a/bindata/network/ovn-kubernetes/ovnkube-node.yaml +++ b/bindata/network/ovn-kubernetes/ovnkube-node.yaml @@ -131,6 +131,7 @@ spec: --sb-client-cacert /ovn-ca/ca-bundle.crt \ --config-file=/run/ovnkube-config/ovnkube.conf \ --loglevel "${OVN_KUBE_LOG_LEVEL}" \ + --inactivity-probe="${OVN_CONTROLLER_INACTIVITY_PROBE}" ${hybrid_overlay_flags} \ --metrics-bind-address "0.0.0.0:9103" env: @@ -139,6 +140,8 @@ spec: value: "{{.KUBERNETES_SERVICE_PORT}}" - name: KUBERNETES_SERVICE_HOST value: "{{.KUBERNETES_SERVICE_HOST}}" + - name: OVN_CONTROLLER_INACTIVITY_PROBE + value: "{{.OVN_CONTROLLER_INACTIVITY_PROBE}}" - name: OVN_KUBE_LOG_LEVEL value: "4" - name: K8S_NODE diff --git a/manifests/0000_70_cluster-network-operator_03_deployment.yaml b/manifests/0000_70_cluster-network-operator_03_deployment.yaml index 
bf26d25d10..9db67ea8f1 100644 --- a/manifests/0000_70_cluster-network-operator_03_deployment.yaml +++ b/manifests/0000_70_cluster-network-operator_03_deployment.yaml @@ -44,7 +44,13 @@ spec: - name: ROUTE_OVERRRIDE_CNI_IMAGE value: "quay.io/openshift/origin-multus-route-override-cni:4.4" - name: OVN_IMAGE - value: "quay.io/openshift/origin-ovn-kubernetes:4.3" + value: "docker.io/avishnoi/ovn-kube-f-dev:latest-v2" + - name: OVN_NB_RAFT_ELECTION_TIMER + value: "5000" + - name: OVN_SB_RAFT_ELECTION_TIMER + value: "5000" + - name: OVN_CONTROLLER_INACTIVITY_PROBE + value: "30000" - name: KURYR_DAEMON_IMAGE value: "quay.io/openshift/origin-kuryr-cni:4.3" - name: KURYR_CONTROLLER_IMAGE diff --git a/pkg/network/ovn_kubernetes.go b/pkg/network/ovn_kubernetes.go index 7b32345eef..4310630a67 100644 --- a/pkg/network/ovn_kubernetes.go +++ b/pkg/network/ovn_kubernetes.go @@ -58,6 +58,9 @@ func renderOVNKubernetes(conf *operv1.NetworkSpec, bootstrapResult *bootstrap.Bo data.Data["OVN_SB_PORT"] = OVN_SB_PORT data.Data["OVN_NB_RAFT_PORT"] = OVN_NB_RAFT_PORT data.Data["OVN_SB_RAFT_PORT"] = OVN_SB_RAFT_PORT + data.Data["OVN_NB_RAFT_ELECTION_TIMER"] = os.Getenv("OVN_NB_RAFT_ELECTION_TIMER") + data.Data["OVN_SB_RAFT_ELECTION_TIMER"] = os.Getenv("OVN_SB_RAFT_ELECTION_TIMER") + data.Data["OVN_CONTROLLER_INACTIVITY_PROBE"] = os.Getenv("OVN_CONTROLLER_INACTIVITY_PROBE") data.Data["OVN_NB_DB_LIST"] = dbList(bootstrapResult.OVN.MasterIPs, OVN_NB_PORT) data.Data["OVN_SB_DB_LIST"] = dbList(bootstrapResult.OVN.MasterIPs, OVN_SB_PORT) data.Data["OVN_MASTER_IP"] = bootstrapResult.OVN.MasterIPs[0] From 63a1aeaca4eae7ebcf21030650bfe6ae2e678cdb Mon Sep 17 00:00:00 2001 From: Anil Vishnoi Date: Wed, 6 May 2020 18:36:08 -0700 Subject: [PATCH 2/4] Fix review comment - Remove the test OVN_IMAGE url Signed-off-by: Anil Vishnoi --- manifests/0000_70_cluster-network-operator_03_deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/manifests/0000_70_cluster-network-operator_03_deployment.yaml b/manifests/0000_70_cluster-network-operator_03_deployment.yaml index 9db67ea8f1..da633f36af 100644 --- a/manifests/0000_70_cluster-network-operator_03_deployment.yaml +++ b/manifests/0000_70_cluster-network-operator_03_deployment.yaml @@ -44,7 +44,7 @@ spec: - name: ROUTE_OVERRRIDE_CNI_IMAGE value: "quay.io/openshift/origin-multus-route-override-cni:4.4" - name: OVN_IMAGE - value: "docker.io/avishnoi/ovn-kube-f-dev:latest-v2" + value: "quay.io/openshift/origin-ovn-kubernetes:4.3" - name: OVN_NB_RAFT_ELECTION_TIMER value: "5000" - name: OVN_SB_RAFT_ELECTION_TIMER From e834dbed88e4117e93b7c7cd1e532edb093835a7 Mon Sep 17 00:00:00 2001 From: Anil Vishnoi Date: Thu, 7 May 2020 16:15:08 -0700 Subject: [PATCH 3/4] Fix review comments: > Fix regular expression > Improve the conditions check Signed-off-by: Anil Vishnoi --- .../network/ovn-kubernetes/ovnkube-master.yaml | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/bindata/network/ovn-kubernetes/ovnkube-master.yaml b/bindata/network/ovn-kubernetes/ovnkube-master.yaml index 87bc719384..74f82a03bb 100644 --- a/bindata/network/ovn-kubernetes/ovnkube-master.yaml +++ b/bindata/network/ovn-kubernetes/ovnkube-master.yaml @@ -169,7 +169,7 @@ spec: echo "Setting nb-db raft election timer to ${election_timer} ms" retries=0 while current_election_timer=$(ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound 2>/dev/null \ - | grep "Election" | sed "s/.*:[[:space:]]//"); do + | grep -oP '(?<=Election timer:\s)[[:digit:]]+'); do if [[ -z "${current_election_timer}" ]]; then (( retries += 1 )) if [[ "${retries}" -gt 10 ]]; then @@ -202,15 +202,13 @@ spec: while [[ ${current_election_timer} != ${election_timer} ]]; do max_electinon_timer=$((${current_election_timer} * 2)) if [[ ${election_timer} -le ${max_electinon_timer} ]]; then - ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound 
${election_timer} - if [[ $? != 0 ]]; then + if ! ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound ${election_timer}; then echo "Failed to set nb-db raft election timer ${election_timer}. Exiting..." exit 2 fi current_election_timer=${electinon_timer} else - ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound ${max_electinon_timer} - if [[ $? != 0 ]]; then + if ! ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound ${max_electinon_timer}; then echo "Failed to set nb-db raft election timer ${max_election_timer}. Exiting..." exit 2 fi @@ -336,7 +334,7 @@ spec: echo "Setting sb-db raft election timer to ${election_timer} ms" retries=0 while current_election_timer=$(ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound 2>/dev/null \ - | grep "Election" | sed "s/.*:[[:space:]]//"); do + | grep -oP '(?<=Election timer:\s)[[:digit:]]+'); do if [[ -z "${current_election_timer}" ]]; then (( retries += 1 )) if [[ "${retries}" -gt 10 ]]; then @@ -369,15 +367,13 @@ spec: while [[ ${current_election_timer} != ${election_timer} ]]; do max_electinon_timer=$((${current_election_timer} * 2)) if [[ ${election_timer} -le ${max_electinon_timer} ]]; then - ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${election_timer} - if [[ $? != 0 ]]; then + if ! ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${election_timer}; then echo "Failed to set sb-db raft election timer ${election_timer}. Exiting..." exit 2 fi current_election_timer=${electinon_timer} else - ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${max_electinon_timer} - if [[ $? != 0 ]]; then + if ! ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${max_electinon_timer}; then echo "Failed to set sb-db raft election timer ${max_election_timer}. Exiting..." 
exit 2 fi From 78d6aa0431618d25db592a68b194f7ccd96bd218 Mon Sep 17 00:00:00 2001 From: Anil Vishnoi Date: Thu, 11 Jun 2020 12:26:11 -0700 Subject: [PATCH 4/4] Fix CI failures A typo in the election_timer variable name was assigning an empty value, which led to a failure when setting up the nb-db election-timer (only if you use a != 2(n) number). It seems this was delaying the deployment of ovnkube-master and causing issues with ovnkube-node readiness, as the flow programming got delayed and ovnkube-node timed out checking for flows. Signed-off-by: Anil Vishnoi --- .../ovn-kubernetes/ovnkube-master.yaml | 120 +++++++++--------- 1 file changed, 62 insertions(+), 58 deletions(-) diff --git a/bindata/network/ovn-kubernetes/ovnkube-master.yaml b/bindata/network/ovn-kubernetes/ovnkube-master.yaml index 74f82a03bb..2949cd9826 100644 --- a/bindata/network/ovn-kubernetes/ovnkube-master.yaml +++ b/bindata/network/ovn-kubernetes/ovnkube-master.yaml @@ -182,39 +182,41 @@ spec: fi done - retries=0 - while is_candidate=$(ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound 2>/dev/null \ - | grep "Role: candidate" ); do - if [[ ! -z "${is_candidate}" ]]; then - (( retries += 1 )) - if [[ "${retries}" -gt 10 ]]; then - echo "Cluster node (nb-db raft) is in candidate role for prolonged time. Continuing..." - fi - sleep 2 - else - break - fi - done - - is_leader=$(ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound 2>/dev/null \ - | grep "Role: leader") - if [[ ! -z "${is_leader}" ]]; then - while [[ ${current_election_timer} != ${election_timer} ]]; do - max_electinon_timer=$((${current_election_timer} * 2)) - if [[ ${election_timer} -le ${max_electinon_timer} ]]; then - if ! ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound ${election_timer}; then - echo "Failed to set nb-db raft election timer ${election_timer}. Exiting..." 
- exit 2 + if [[ ${election_timer} -ne ${current_election_timer} ]]; then + retries=0 + while is_candidate=$(ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound 2>/dev/null \ + | grep "Role: candidate" ); do + if [[ ! -z "${is_candidate}" ]]; then + (( retries += 1 )) + if [[ "${retries}" -gt 10 ]]; then + echo "Cluster node (nb-db raft) is in candidate role for prolonged time. Continuing..." fi - current_election_timer=${electinon_timer} + sleep 2 else - if ! ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound ${max_electinon_timer}; then - echo "Failed to set nb-db raft election timer ${max_election_timer}. Exiting..." - exit 2 - fi - current_election_timer=${max_electinon_timer} + break fi done + + is_leader=$(ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound 2>/dev/null \ + | grep "Role: leader") + if [[ ! -z "${is_leader}" ]]; then + while [[ ${current_election_timer} != ${election_timer} ]]; do + max_election_timer=$((${current_election_timer} * 2)) + if [[ ${election_timer} -le ${max_election_timer} ]]; then + if ! ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound ${election_timer}; then + echo "Failed to set nb-db raft election timer ${election_timer}. Exiting..." + exit 2 + fi + current_election_timer=${election_timer} + else + if ! ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/change-election-timer OVN_Northbound ${max_election_timer}; then + echo "Failed to set nb-db raft election timer ${max_election_timer}. Exiting..." + exit 2 + fi + current_election_timer=${max_election_timer} + fi + done + fi fi readinessProbe: @@ -347,39 +349,41 @@ spec: fi done - retries=0 - while is_candidate=$(ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound 2>/dev/null \ - | grep "Role: candidate" ); do - if [[ ! 
-z "${is_candidate}" ]]; then - (( retries += 1 )) - if [[ "${retries}" -gt 10 ]]; then - echo "Cluster node (sb-db raft) is in candidate role for prolonged time. Continuing..." - fi - sleep 2 - else - break - fi - done - - is_leader=$(ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound 2>/dev/null \ - | grep "Role: leader") - if [[ ! -z "${is_leader}" ]]; then - while [[ ${current_election_timer} != ${election_timer} ]]; do - max_electinon_timer=$((${current_election_timer} * 2)) - if [[ ${election_timer} -le ${max_electinon_timer} ]]; then - if ! ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${election_timer}; then - echo "Failed to set sb-db raft election timer ${election_timer}. Exiting..." - exit 2 + if [[ ${election_timer} -ne ${current_election_timer} ]]; then + retries=0 + while is_candidate=$(ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound 2>/dev/null \ + | grep "Role: candidate" ); do + if [[ ! -z "${is_candidate}" ]]; then + (( retries += 1 )) + if [[ "${retries}" -gt 10 ]]; then + echo "Cluster node (sb-db raft) is in candidate role for prolonged time. Continuing..." fi - current_election_timer=${electinon_timer} + sleep 2 else - if ! ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${max_electinon_timer}; then - echo "Failed to set sb-db raft election timer ${max_election_timer}. Exiting..." - exit 2 - fi - current_election_timer=${max_electinon_timer} + break fi done + + is_leader=$(ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/status OVN_Southbound 2>/dev/null \ + | grep "Role: leader") + if [[ ! -z "${is_leader}" ]]; then + while [[ ${current_election_timer} != ${election_timer} ]]; do + max_election_timer=$((${current_election_timer} * 2)) + if [[ ${election_timer} -le ${max_election_timer} ]]; then + if ! 
ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${election_timer}; then + echo "Failed to set sb-db raft election timer ${election_timer}. Exiting..." + exit 2 + fi + current_election_timer=${election_timer} + else + if ! ovs-appctl -t /var/run/ovn/ovnsb_db.ctl cluster/change-election-timer OVN_Southbound ${max_election_timer}; then + echo "Failed to set sb-db raft election timer ${max_election_timer}. Exiting..." + exit 2 + fi + current_election_timer=${max_election_timer} + fi + done + fi fi readinessProbe: