From 8fc06315d9a026095e4c5638e1d60e42ace74a9b Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 14 Aug 2019 13:56:32 -0700 Subject: [PATCH] pkg/cvo/metrics: Report ClusterVersion conditions with reasons This will allow us to discover upgrade and other failure reasons without having to resort to a must-gather or similar [1]. And also to look at any other version conditions in Telemetry. Stick this in cluster_operator_conditions, since we already have a 'reason' slot there. And ClusterVersion.Status.Conditions is pretty much the same thing as ClusterOperator.Status.Conditions; we'll want to see all of those. I don't see a reason to add a new metric to separate cluster-version operator failures from second-level operator failures; the name should be sufficient for that. [1]: https://bugzilla.redhat.com/show_bug.cgi?id=1741645 --- pkg/cvo/metrics.go | 13 +++++++++++++ pkg/cvo/metrics_test.go | 37 +++++++++++++++++++++---------------- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index 58bfe752b..12f91a436 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -230,6 +230,19 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { g.Set(float64(len(cv.Status.AvailableUpdates))) ch <- g } + + for _, condition := range cv.Status.Conditions { + if condition.Status == configv1.ConditionUnknown { + continue + } + g := m.clusterOperatorConditions.WithLabelValues("version", string(condition.Type), string(condition.Reason)) + if condition.Status == configv1.ConditionTrue { + g.Set(1) + } else { + g.Set(0) + } + ch <- g + } } g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version) diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index e41594725..c6190ac56 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -270,7 +270,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { }, Status: configv1.ClusterVersionStatus{ Conditions: []configv1.ClusterOperatorStatusCondition{ - {Type: configv1.RetrievedUpdates, Status: configv1.ConditionTrue}, + {Type: configv1.RetrievedUpdates, Status: configv1.ConditionTrue, Reason: "Because stuff"}, }, }, }, @@ -278,14 +278,16 @@ func Test_operatorMetrics_Collect(t *testing.T) { }, }, wants: func(t *testing.T, metrics []prometheus.Metric) { - if len(metrics) != 5 { + if len(metrics) != 6 { t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) } expectMetric(t, metrics[0], 2, map[string]string{"type": "initial", "version": "", "image": "", "from_version": ""}) expectMetric(t, metrics[1], 2, map[string]string{"type": "cluster", "version": "", "image": "", "from_version": ""}) + expectMetric(t, metrics[2], 0, map[string]string{"upstream": "", "channel": ""}) - expectMetric(t, metrics[3], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""}) - expectMetric(t, metrics[4], 1, map[string]string{"type": ""}) + expectMetric(t, metrics[3], 1, map[string]string{"name": "version", "condition": "RetrievedUpdates", "reason": "Because stuff"}) + expectMetric(t, metrics[4], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""}) + expectMetric(t, metrics[5], 1, map[string]string{"type": ""}) }, }, { @@ -306,7 +308,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { }, Status: configv1.ClusterVersionStatus{ Conditions: []configv1.ClusterOperatorStatusCondition{ - {Type: configv1.OperatorAvailable, Status: configv1.ConditionTrue, LastTransitionTime: metav1.Time{Time: time.Unix(5, 0)}}, + {Type: configv1.OperatorAvailable, Status: configv1.ConditionTrue, LastTransitionTime: metav1.Time{Time: time.Unix(5, 0)}, Reason: "Because stuff"}, }, }, }, @@ -314,14 +316,15 @@ func Test_operatorMetrics_Collect(t *testing.T) { }, }, wants: func(t *testing.T, metrics []prometheus.Metric) { - if len(metrics) != 5 { + if len(metrics) != 6 { t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) } expectMetric(t, metrics[0], 2, map[string]string{"type": "initial", "version": "", "image": "", "from_version": ""}) expectMetric(t, metrics[1], 2, map[string]string{"type": "cluster", "version": "0.0.2", "image": "test/image:1", "from_version": ""}) expectMetric(t, metrics[2], 5, map[string]string{"type": "desired", "version": "1.0.0", "image": "test/image:2", "from_version": ""}) - expectMetric(t, metrics[3], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""}) - expectMetric(t, metrics[4], 1, map[string]string{"type": ""}) + expectMetric(t, metrics[3], 1, map[string]string{"name": "version", "condition": "Available", "reason": "Because stuff"}) + expectMetric(t, metrics[4], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""}) + expectMetric(t, metrics[5], 1, map[string]string{"type": ""}) }, }, { @@ -343,7 +346,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { }, Status: configv1.ClusterVersionStatus{ Conditions: []configv1.ClusterOperatorStatusCondition{ - {Type: ClusterStatusFailing, Status: configv1.ConditionTrue, LastTransitionTime: metav1.Time{Time: time.Unix(4, 0)}}, + {Type: ClusterStatusFailing, Status: configv1.ConditionTrue, LastTransitionTime: metav1.Time{Time: time.Unix(4, 0)}, Reason: "Because stuff"}, }, }, }, @@ -351,7 +354,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { }, }, wants: func(t *testing.T, metrics []prometheus.Metric) { - if len(metrics) != 7 { + if len(metrics) != 8 { t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) } expectMetric(t, metrics[0], 5, map[string]string{"type": "initial", "version": "", "image": "", "from_version": ""}) @@ -359,8 +362,9 @@ func Test_operatorMetrics_Collect(t *testing.T) { expectMetric(t, metrics[2], 5, map[string]string{"type": "desired", "version": "1.0.0", "image": "test/image:2", "from_version": ""}) expectMetric(t, metrics[3], 4, map[string]string{"type": "failure", "version": "1.0.0", "image": "test/image:2", "from_version": ""}) expectMetric(t, metrics[4], 4, map[string]string{"type": "failure", "version": "0.0.2", "image": "test/image:1", "from_version": ""}) - expectMetric(t, metrics[5], 6, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""}) - expectMetric(t, metrics[6], 1, map[string]string{"type": ""}) + expectMetric(t, metrics[5], 1, map[string]string{"name": "version", "condition": "Failing", "reason": "Because stuff"}) + expectMetric(t, metrics[6], 6, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""}) + expectMetric(t, metrics[7], 1, map[string]string{"type": ""}) }, }, { @@ -378,7 +382,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { }, Status: configv1.ClusterVersionStatus{ Conditions: []configv1.ClusterOperatorStatusCondition{ - {Type: ClusterStatusFailing, Status: configv1.ConditionTrue}, + {Type: ClusterStatusFailing, Status: configv1.ConditionTrue, Reason: "Because stuff"}, }, }, }, @@ -386,14 +390,15 @@ func Test_operatorMetrics_Collect(t *testing.T) { }, }, wants: func(t *testing.T, metrics []prometheus.Metric) { - if len(metrics) != 5 { + if len(metrics) != 6 { t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) } expectMetric(t, metrics[0], 2, map[string]string{"type": "initial", "version": "", "image": "", "from_version": ""}) expectMetric(t, metrics[1], 2, map[string]string{"type": "cluster", "version": "0.0.2", "image": "test/image:1", "from_version": ""}) expectMetric(t, metrics[2], 0, map[string]string{"type": "failure", "version": "0.0.2", "image": "test/image:1", "from_version": ""}) - expectMetric(t, metrics[3], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""}) - expectMetric(t, metrics[4], 1, map[string]string{"type": ""}) + expectMetric(t, metrics[3], 1, map[string]string{"name": "version", "condition": "Failing", "reason": "Because stuff"}) + expectMetric(t, metrics[4], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""}) + expectMetric(t, metrics[5], 1, map[string]string{"type": ""}) }, }, {