From 0442094858cb3fe3a1d094b28e956a69bd2f1cac Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 6 Aug 2020 12:56:57 -0700 Subject: [PATCH] lib/resourcebuilder/apps: Only error on Deployment Available=False *and* Progressing=False Available=True, Progressing=False is the happy, steady state. Available=True, Progressing=True is a happy update. Available=False, Progressing=True is an acceptable outage, e.g. during an update with the Recreate strategy [1]: $ curl -s https://storage.googleapis.com/origin-ci-test/logs/release-openshift-origin-installer-e2e-gcp-upgrade-4.6/1291426211527921664/artifacts/e2e-gcp-upgrade/container-logs/test.log | grep MinimumReplicasUnavailable | head -n1 Aug 6 17:56:00.674: INFO: deployment status: v1.DeploymentStatus{ObservedGeneration:1, Replicas:1, UpdatedReplicas:1, ReadyReplicas:0, AvailableReplicas:0, UnavailableReplicas:1, Conditions:[]v1.DeploymentCondition{v1.DeploymentCondition{Type:"Available", Status:"False", LastUpdateTime:v1.Time{Time:time.Time{wall:0x0, ext:63732333358, loc:(*time.Location)(0x9e74040)}}, LastTransitionTime:v1.Time{Time:time.Time{wall:0x0, ext:63732333358, loc:(*time.Location)(0x9e74040)}}, Reason:"MinimumReplicasUnavailable", Message:"Deployment does not have minimum availability."}, v1.DeploymentCondition{Type:"Progressing", Status:"True", LastUpdateTime:v1.Time{Time:time.Time{wall:0x0, ext:63732333358, loc:(*time.Location)(0x9e74040)}}, LastTransitionTime:v1.Time{Time:time.Time{wall:0x0, ext:63732333358, loc:(*time.Location)(0x9e74040)}}, Reason:"ReplicaSetUpdated", Message:"ReplicaSet \"dp-7f9df745ff\" is progressing."}}, CollisionCount:(*int32)(nil)} Available=False, Progressing=False is the Deployment controller saying "I cannot deliver my expected service level for this Deployment", so that's when we should be complaining.
Fixes noise like: Aug 6 18:03:00.500: INFO: cluster upgrade is Failing: Multiple errors are preventing progress: * Could not update namespace "openshift-service-ca-operator" (467 of 608) * deployment openshift-cluster-machine-approver/machine-approver is not available MinimumReplicasUnavailable: Deployment does not have minimum availability. * deployment openshift-ingress-operator/ingress-operator is not available MinimumReplicasUnavailable: Deployment does not have minimum availability. (the namespace part of that message is a separate issue). [1]: https://prow.ci.openshift.org/view/gcs/origin-ci-test/logs/release-openshift-origin-installer-e2e-gcp-upgrade-4.6/1291426211527921664 --- lib/resourcebuilder/apps.go | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/lib/resourcebuilder/apps.go b/lib/resourcebuilder/apps.go index 76eda94580..68cfe41ae2 100644 --- a/lib/resourcebuilder/apps.go +++ b/lib/resourcebuilder/apps.go @@ -114,20 +114,11 @@ func (b *builder) checkDeploymentHealth(ctx context.Context, deployment *appsv1. 
} } - if availableCondition != nil && availableCondition.Status == corev1.ConditionFalse { + if availableCondition != nil && availableCondition.Status == corev1.ConditionFalse && progressingCondition != nil && progressingCondition.Status == corev1.ConditionFalse { return &payload.UpdateError{ - Nested: fmt.Errorf("deployment %s is not available; updated replicas=%d of %d, available replicas=%d of %d", iden, d.Status.UpdatedReplicas, d.Status.Replicas, d.Status.AvailableReplicas, d.Status.Replicas), + Nested: fmt.Errorf("deployment %s is not available and not progressing; updated replicas=%d of %d, available replicas=%d of %d", iden, d.Status.UpdatedReplicas, d.Status.Replicas, d.Status.AvailableReplicas, d.Status.Replicas), Reason: "WorkloadNotAvailable", - Message: fmt.Sprintf("deployment %s is not available %s: %s", iden, availableCondition.Reason, availableCondition.Message), - Name: iden, - } - } - - if progressingCondition != nil && progressingCondition.Status == corev1.ConditionFalse { - return &payload.UpdateError{ - Nested: fmt.Errorf("deployment %s is not progressing; updated replicas=%d of %d, available replicas=%d of %d", iden, d.Status.UpdatedReplicas, d.Status.Replicas, d.Status.AvailableReplicas, d.Status.Replicas), - Reason: "WorkloadNotAvailable", - Message: fmt.Sprintf("deployment %s is not progressing %s: %s", iden, progressingCondition.Reason, progressingCondition.Message), + Message: fmt.Sprintf("deployment %s is not available %s (%s) or progressing %s (%s)", iden, availableCondition.Reason, availableCondition.Message, progressingCondition.Reason, progressingCondition.Message), Name: iden, } }