From 5884308b90c526d76d8f003bbdd7212a59a8e7a6 Mon Sep 17 00:00:00 2001 From: awgreene Date: Wed, 30 Oct 2019 12:47:00 -0400 Subject: [PATCH] feat(metrics) Limit Cardinality of CSV metrics This commit introduces a change that limits the number of metrics that an OLM cluster reports at any given time for a CSV. The first metric introduced is called csv_succeeded, which tracks CSVs that have reached the succeeded phase. The following information is provided about the CSV via labels: namespace, name, version. The value of this metric will always be 0 or 1. The second metric introduced is called csv_abnormal, which is reported whenever the CSV is updated and has not reached the succeeded phase. The following information is provided about the CSV via labels: namespace, name, version, phase, reason. Whenever a CSV is updated, the existing timeseries is deleted and replaced by an updated version. --- pkg/controller/operators/olm/operator.go | 4 +- pkg/metrics/metrics.go | 51 +++++++++++++++++++----- test/e2e/metrics_e2e_test.go | 4 +- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/pkg/controller/operators/olm/operator.go b/pkg/controller/operators/olm/operator.go index d1c0b43dab..7165bb5b3a 100644 --- a/pkg/controller/operators/olm/operator.go +++ b/pkg/controller/operators/olm/operator.go @@ -930,8 +930,6 @@ func (a *Operator) syncClusterServiceVersion(obj interface{}) (syncError error) }) logger.Debug("syncing CSV") - metrics.EmitCSVMetric(clusterServiceVersion) - if a.csvNotification != nil { a.csvNotification.OnAddOrUpdate(clusterServiceVersion) } @@ -964,6 +962,8 @@ func (a *Operator) syncClusterServiceVersion(obj interface{}) (syncError error) } else { syncError = fmt.Errorf("error transitioning ClusterServiceVersion: %s and error updating CSV status: %s", syncError, updateErr) } + } else { + metrics.EmitCSVMetric(clusterServiceVersion, outCSV) } } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index e401cf7c75..bce8058b59 100644 --- 
a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -5,17 +5,17 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + olmv1alpha1 "github.com/operator-framework/operator-lifecycle-manager/pkg/api/apis/operators/v1alpha1" "github.com/operator-framework/operator-lifecycle-manager/pkg/api/client/clientset/versioned" v1alpha1 "github.com/operator-framework/operator-lifecycle-manager/pkg/api/client/listers/operators/v1alpha1" - olmv1alpha1 "github.com/operator-framework/operator-lifecycle-manager/pkg/api/apis/operators/v1alpha1" - ) const ( NAME_LABEL = "name" INSTALLED_LABEL = "installed" + NAMESPACE_LABEL = "namespace" VERSION_LABEL = "version" - PHASE_LABEL = "phase" + PHASE_LABEL = "phase" REASON_LABEL = "reason" ) @@ -151,18 +151,27 @@ var ( []string{NAME_LABEL, INSTALLED_LABEL}, ) - csvSyncCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "csv_sync_total", - Help: "Monotonic count of CSV syncs", + csvSucceeded = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "csv_succeeded", + Help: "Successful CSV install", + }, + []string{NAMESPACE_LABEL, NAME_LABEL, VERSION_LABEL}, + ) + + csvAbnormal = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "csv_abnormal", + Help: "CSV is not installed", }, - []string{NAME_LABEL, VERSION_LABEL, PHASE_LABEL, REASON_LABEL}, + []string{NAMESPACE_LABEL, NAME_LABEL, VERSION_LABEL, PHASE_LABEL, REASON_LABEL}, ) ) func RegisterOLM() { prometheus.MustRegister(csvCount) - prometheus.MustRegister(csvSyncCounter) + prometheus.MustRegister(csvSucceeded) + prometheus.MustRegister(csvAbnormal) prometheus.MustRegister(CSVUpgradeCount) } @@ -177,6 +186,26 @@ func CounterForSubscription(name, installedCSV string) prometheus.Counter { return SubscriptionSyncCount.WithLabelValues(name, installedCSV) } -func EmitCSVMetric(csv *olmv1alpha1.ClusterServiceVersion){ - csvSyncCounter.WithLabelValues(csv.Name, csv.Spec.Version.String(), string(csv.Status.Phase), 
string(csv.Status.Reason)).Inc() +func EmitCSVMetric(oldCSV *olmv1alpha1.ClusterServiceVersion, newCSV *olmv1alpha1.ClusterServiceVersion) { + if oldCSV == nil || newCSV == nil { + return + } + + // Don't update the metric for copies + if newCSV.Status.Reason == olmv1alpha1.CSVReasonCopied { + return + } + + // Delete the old CSV metrics + csvAbnormal.DeleteLabelValues(oldCSV.Namespace, oldCSV.Name, oldCSV.Spec.Version.String(), string(oldCSV.Status.Phase), string(oldCSV.Status.Reason)) + + // Get the phase of the new CSV + newCSVPhase := string(newCSV.Status.Phase) + csvSucceededGauge := csvSucceeded.WithLabelValues(newCSV.Namespace, newCSV.Name, newCSV.Spec.Version.String()) + if newCSVPhase == string(olmv1alpha1.CSVPhaseSucceeded) { + csvSucceededGauge.Set(1) + } else { + csvSucceededGauge.Set(0) + csvAbnormal.WithLabelValues(newCSV.Namespace, newCSV.Name, newCSV.Spec.Version.String(), string(newCSV.Status.Phase), string(newCSV.Status.Reason)).Set(1) + } } diff --git a/test/e2e/metrics_e2e_test.go b/test/e2e/metrics_e2e_test.go index 36444ee793..bd77107819 100644 --- a/test/e2e/metrics_e2e_test.go +++ b/test/e2e/metrics_e2e_test.go @@ -49,11 +49,13 @@ func TestMetricsEndpoint(t *testing.T) { } // Verify metrics have been emitted for packageserver csv - require.Contains(t, rawOutput, "csv_sync_total") + require.Contains(t, rawOutput, "csv_abnormal") require.Contains(t, rawOutput, "name=\""+failingCSV.Name+"\"") require.Contains(t, rawOutput, "phase=\"Failed\"") require.Contains(t, rawOutput, "reason=\"UnsupportedOperatorGroup\"") require.Contains(t, rawOutput, "version=\"0.0.0\"") + + require.Contains(t, rawOutput, "csv_succeeded") log.Info(rawOutput) }