From 7b3352c0dcb429f0579f7e21c4f102a8e9566c28 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Tue, 11 Feb 2020 23:49:09 -0500 Subject: [PATCH 1/6] add per-tenant alertmanager metrics Signed-off-by: Jacob Lisi --- pkg/alertmanager/alertmanager.go | 9 +- pkg/alertmanager/alertmanager_metrics.go | 210 ++++++++ pkg/alertmanager/alertmanager_metrics_test.go | 484 ++++++++++++++++++ pkg/alertmanager/multitenant.go | 16 +- pkg/cortex/modules.go | 2 +- pkg/util/metrics_helper.go | 30 ++ 6 files changed, 741 insertions(+), 10 deletions(-) create mode 100644 pkg/alertmanager/alertmanager_metrics.go create mode 100644 pkg/alertmanager/alertmanager_metrics_test.go diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 6371de0b3bb..e1ae4830bb5 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -84,17 +84,14 @@ func init() { } // New creates a new Alertmanager. -func New(cfg *Config) (*Alertmanager, error) { +func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am := &Alertmanager{ cfg: cfg, logger: log.With(cfg.Logger, "user", cfg.UserID), stop: make(chan struct{}), } - // TODO(cortex): Build a registry that can merge metrics from multiple users. - // For now, these metrics are ignored, as we can't register the same - // metric twice with a single registry. 
- am.registry = prometheus.NewRegistry() + am.registry = reg am.wg.Add(1) nflogID := fmt.Sprintf("nflog:%s", cfg.UserID) @@ -110,7 +107,7 @@ func New(cfg *Config) (*Alertmanager, error) { return nil, fmt.Errorf("failed to create notification log: %v", err) } if cfg.Peer != nil { - c := cfg.Peer.AddState("nfl:"+cfg.UserID, am.nflog, am.registry) + c := cfg.Peer.AddState("nfl:"+cfg.UserID, am.nflog, am.registry) // TODO add gossip metrics am.nflog.SetBroadcast(c.Broadcast) } diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go new file mode 100644 index 00000000000..3c079649ce7 --- /dev/null +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -0,0 +1,210 @@ +package alertmanager + +import ( + "sync" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/cortexproject/cortex/pkg/util" +) + +// This struct aggregates metrics exported by Alertmanager +// and re-exports those aggregates as Cortex metrics. +type alertmanagerMetrics struct { + // Maps userID -> registry + regsMu sync.Mutex + regs map[string]*prometheus.Registry + + // exported metrics, gathered from Alertmanager API + alertsReceived *prometheus.Desc + alertsInvalid *prometheus.Desc + + // exported metrics, gathered from Alertmanager PipelineBuilder + numNotifications *prometheus.Desc + numFailedNotifications *prometheus.Desc + notificationLatencySeconds *prometheus.Desc + + // exported metrics, gathered from Alertmanager nflog + nflogGCDuration *prometheus.Desc + nflogSnapshotDuration *prometheus.Desc + nflogSnapshotSize *prometheus.Desc + nflogQueriesTotal *prometheus.Desc + nflogQueryErrorsTotal *prometheus.Desc + nflogQueryDuration *prometheus.Desc + nflogPropagatedMessagesTotal *prometheus.Desc + + // exported metrics, gathered from Alertmanager Marker + markerAlerts *prometheus.Desc + + // exported metrics, gathered from Alertmanager Silences + silencesGCDuration *prometheus.Desc + silencesSnapshotDuration *prometheus.Desc + silencesSnapshotSize 
*prometheus.Desc + silencesQueriesTotal *prometheus.Desc + silencesQueryErrorsTotal *prometheus.Desc + silencesQueryDuration *prometheus.Desc + silences *prometheus.Desc + silencesPropagatedMessagesTotal *prometheus.Desc +} + +func newAlertmanagerMetrics() *alertmanagerMetrics { + return &alertmanagerMetrics{ + regs: map[string]*prometheus.Registry{}, + alertsReceived: prometheus.NewDesc( + "cortex_alertmanager_alerts_received_total", + "The total number of received alerts.", + []string{"user", "version", "status"}, nil), + alertsInvalid: prometheus.NewDesc( + "cortex_alertmanager_alerts_invalid_total", + "The total number of received alerts that were invalid.", + []string{"user", "version"}, nil), + numNotifications: prometheus.NewDesc( + "cortex_alertmanager_notifications_total", + "The total number of attempted notifications.", + []string{"user", "integration"}, nil), + numFailedNotifications: prometheus.NewDesc( + "cortex_alertmanager_notifications_failed_total", + "The total number of failed notifications.", + []string{"user", "integration"}, nil), + notificationLatencySeconds: prometheus.NewDesc( + "cortex_alertmanager_notification_latency_seconds", + "The latency of notifications in seconds.", + []string{"integration"}, nil), + nflogGCDuration: prometheus.NewDesc( + "cortex_alertmanager_nflog_gc_duration_seconds", + "Duration of the last notification log garbage collection cycle.", + nil, nil), + nflogSnapshotDuration: prometheus.NewDesc( + "cortex_alertmanager_nflog_snapshot_duration_seconds", + "Duration of the last notification log snapshot.", + nil, nil), + nflogSnapshotSize: prometheus.NewDesc( + "cortex_alertmanager_nflog_snapshot_size_bytes", + "Size of the last notification log snapshot in bytes.", + nil, nil), + nflogQueriesTotal: prometheus.NewDesc( + "cortex_alertmanager_nflog_queries_total", + "Number of notification log queries were received.", + nil, nil), + nflogQueryErrorsTotal: prometheus.NewDesc( + 
"cortex_alertmanager_nflog_query_errors_total", + "Number notification log received queries that failed.", + nil, nil), + nflogQueryDuration: prometheus.NewDesc( + "cortex_alertmanager_nflog_query_duration_seconds", + "Duration of notification log query evaluation.", + nil, nil), + nflogPropagatedMessagesTotal: prometheus.NewDesc( + "cortex_alertmanager_nflog_gossip_messages_propagated_total", + "Number of received gossip messages that have been further gossiped.", + nil, nil), + markerAlerts: prometheus.NewDesc( + "cortex_alertmanager_alerts", + "How many alerts by state.", + []string{"user", "state"}, nil), + silencesGCDuration: prometheus.NewDesc( + "cortex_alertmanager_silences_gc_duration_seconds", + "Duration of the last silence garbage collection cycle.", + nil, nil), + silencesSnapshotDuration: prometheus.NewDesc( + "cortex_alertmanager_silences_snapshot_duration_seconds", + "Duration of the last silence snapshot.", + nil, nil), + silencesSnapshotSize: prometheus.NewDesc( + "cortex_alertmanager_silences_snapshot_size_bytes", + "Size of the last silence snapshot in bytes.", + nil, nil), + silencesQueriesTotal: prometheus.NewDesc( + "cortex_alertmanager_silences_queries_total", + "How many silence queries were received.", + nil, nil), + silencesQueryErrorsTotal: prometheus.NewDesc( + "cortex_alertmanager_silences_query_errors_total", + "How many silence received queries did not succeed.", + nil, nil), + silencesQueryDuration: prometheus.NewDesc( + "cortex_alertmanager_silences_query_duration_seconds", + "Duration of silence query evaluation.", + nil, nil), + silencesPropagatedMessagesTotal: prometheus.NewDesc( + "cortex_alertmanager_silences_gossip_messages_propagated_total", + "Number of received gossip messages that have been further gossiped.", + nil, nil), + silences: prometheus.NewDesc( + "cortex_alertmanager_silences", + "How many silences by state.", + []string{"user", "state"}, nil), + } +} + +func (m *alertmanagerMetrics) addUserRegistry(user string, 
reg *prometheus.Registry) { + m.regsMu.Lock() + m.regs[user] = reg + m.regsMu.Unlock() +} + +func (m *alertmanagerMetrics) registries() map[string]*prometheus.Registry { + regs := map[string]*prometheus.Registry{} + + m.regsMu.Lock() + defer m.regsMu.Unlock() + for uid, r := range m.regs { + regs[uid] = r + } + + return regs +} + +func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { + out <- m.alertsReceived + out <- m.alertsInvalid + out <- m.numNotifications + out <- m.numFailedNotifications + out <- m.notificationLatencySeconds + out <- m.nflogGCDuration + out <- m.nflogSnapshotDuration + out <- m.nflogSnapshotSize + out <- m.nflogQueriesTotal + out <- m.nflogQueryErrorsTotal + out <- m.nflogQueryDuration + out <- m.nflogPropagatedMessagesTotal + out <- m.markerAlerts + out <- m.silencesGCDuration + out <- m.silencesSnapshotDuration + out <- m.silencesSnapshotSize + out <- m.silencesQueriesTotal + out <- m.silencesQueryErrorsTotal + out <- m.silencesQueryDuration + out <- m.silences + out <- m.silencesPropagatedMessagesTotal +} + +func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { + data := util.BuildMetricFamiliesPerUserFromUserRegistries(m.registries()) + + data.SendSumOfCountersPerUserWithLabels(out, m.alertsReceived, "alertmanager_alerts_received_total", "version", "status") + data.SendSumOfCountersPerUserWithLabels(out, m.alertsInvalid, "alertmanager_alerts_invalid_total", "version") + + data.SendSumOfCountersPerUserWithLabels(out, m.numNotifications, "alertmanager_notifications_total", "integration") + data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration") + data.SendSumOfCountersWithLabels(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds", "integration") + + data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state") + + data.SendSumOfSummaries(out, m.nflogGCDuration, 
"alertmanager_nflog_gc_duration_seconds") + data.SendSumOfSummaries(out, m.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds") + data.SendSumOfGauges(out, m.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes") + data.SendSumOfCounters(out, m.nflogQueriesTotal, "alertmanager_nflog_queries_total") + data.SendSumOfCounters(out, m.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total") + data.SendSumOfHistograms(out, m.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds") + data.SendSumOfCounters(out, m.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total") + + data.SendSumOfSummaries(out, m.silencesGCDuration, "alertmanager_silences_gc_duration_seconds") + data.SendSumOfSummaries(out, m.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds") + data.SendSumOfGauges(out, m.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes") + data.SendSumOfCounters(out, m.silencesQueriesTotal, "alertmanager_silences_queries_total") + data.SendSumOfCounters(out, m.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total") + data.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds") + data.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total") + data.SendSumOfGaugesPerUserWithLabels(out, m.silences, "alertmanager_silences", "state") +} diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go new file mode 100644 index 00000000000..21d2a5b5c07 --- /dev/null +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -0,0 +1,484 @@ +package alertmanager + +import ( + "bytes" + "testing" + + "github.com/prometheus/alertmanager/types" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/require" +) + +func TestAlertmanagerMetricsStore(t 
*testing.T) { + mainReg := prometheus.NewPedanticRegistry() + + alertmanagerMetrics := newAlertmanagerMetrics() + mainReg.MustRegister(alertmanagerMetrics) + alertmanagerMetrics.addUserRegistry("user1", populateAlertmanager(1)) + alertmanagerMetrics.addUserRegistry("user2", populateAlertmanager(10)) + alertmanagerMetrics.addUserRegistry("user3", populateAlertmanager(100)) + + //noinspection ALL + err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(` + # HELP cortex_alertmanager_alerts How many alerts by state. + # TYPE cortex_alertmanager_alerts gauge + cortex_alertmanager_alerts{state="active",user="user1"} 1 + cortex_alertmanager_alerts{state="active",user="user2"} 10 + cortex_alertmanager_alerts{state="active",user="user3"} 100 + cortex_alertmanager_alerts{state="suppressed",user="user1"} 2 + cortex_alertmanager_alerts{state="suppressed",user="user2"} 20 + cortex_alertmanager_alerts{state="suppressed",user="user3"} 200 + # HELP cortex_alertmanager_alerts_invalid_total The total number of received alerts that were invalid. + # TYPE cortex_alertmanager_alerts_invalid_total counter + cortex_alertmanager_alerts_invalid_total{user="user1",version="v1"} 1 + cortex_alertmanager_alerts_invalid_total{user="user1",version="v2"} 1 + cortex_alertmanager_alerts_invalid_total{user="user2",version="v1"} 10 + cortex_alertmanager_alerts_invalid_total{user="user2",version="v2"} 10 + cortex_alertmanager_alerts_invalid_total{user="user3",version="v1"} 100 + cortex_alertmanager_alerts_invalid_total{user="user3",version="v2"} 100 + # HELP cortex_alertmanager_alerts_received_total The total number of received alerts. 
+ # TYPE cortex_alertmanager_alerts_received_total counter + cortex_alertmanager_alerts_received_total{status="firing",user="user1",version="v1"} 2 + cortex_alertmanager_alerts_received_total{status="firing",user="user1",version="v2"} 2 + cortex_alertmanager_alerts_received_total{status="firing",user="user2",version="v1"} 20 + cortex_alertmanager_alerts_received_total{status="firing",user="user2",version="v2"} 20 + cortex_alertmanager_alerts_received_total{status="firing",user="user3",version="v1"} 200 + cortex_alertmanager_alerts_received_total{status="firing",user="user3",version="v2"} 200 + cortex_alertmanager_alerts_received_total{status="resolved",user="user1",version="v1"} 3 + cortex_alertmanager_alerts_received_total{status="resolved",user="user1",version="v2"} 3 + cortex_alertmanager_alerts_received_total{status="resolved",user="user2",version="v1"} 30 + cortex_alertmanager_alerts_received_total{status="resolved",user="user2",version="v2"} 30 + cortex_alertmanager_alerts_received_total{status="resolved",user="user3",version="v1"} 300 + cortex_alertmanager_alerts_received_total{status="resolved",user="user3",version="v2"} 300 + # HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle. + # TYPE cortex_alertmanager_nflog_gc_duration_seconds summary + cortex_alertmanager_nflog_gc_duration_seconds_sum 111 + cortex_alertmanager_nflog_gc_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_gossip_messages_propagated_total Number of received gossip messages that have been further gossiped. + # TYPE cortex_alertmanager_nflog_gossip_messages_propagated_total counter + cortex_alertmanager_nflog_gossip_messages_propagated_total 111 + # HELP cortex_alertmanager_nflog_queries_total Number of notification log queries were received. 
+ # TYPE cortex_alertmanager_nflog_queries_total counter + cortex_alertmanager_nflog_queries_total 111 + # HELP cortex_alertmanager_nflog_query_duration_seconds Duration of notification log query evaluation. + # TYPE cortex_alertmanager_nflog_query_duration_seconds histogram + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.005"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.01"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.025"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.05"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.1"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.25"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.5"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="1"} 1 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="2.5"} 1 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="5"} 1 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="10"} 2 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="+Inf"} 3 + cortex_alertmanager_nflog_query_duration_seconds_sum 111 + cortex_alertmanager_nflog_query_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_query_errors_total Number notification log received queries that failed. + # TYPE cortex_alertmanager_nflog_query_errors_total counter + cortex_alertmanager_nflog_query_errors_total 111 + # HELP cortex_alertmanager_nflog_snapshot_duration_seconds Duration of the last notification log snapshot. + # TYPE cortex_alertmanager_nflog_snapshot_duration_seconds summary + cortex_alertmanager_nflog_snapshot_duration_seconds_sum 111 + cortex_alertmanager_nflog_snapshot_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_snapshot_size_bytes Size of the last notification log snapshot in bytes. 
+ # TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge + cortex_alertmanager_nflog_snapshot_size_bytes 111 + # HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds. + # TYPE cortex_alertmanager_notification_latency_seconds counter + cortex_alertmanager_notification_latency_seconds{integration="email"} 0 + cortex_alertmanager_notification_latency_seconds{integration="hipchat"} 0 + cortex_alertmanager_notification_latency_seconds{integration="opsgenie"} 0 + cortex_alertmanager_notification_latency_seconds{integration="pagerduty"} 0 + cortex_alertmanager_notification_latency_seconds{integration="pushover"} 0 + cortex_alertmanager_notification_latency_seconds{integration="slack"} 0 + cortex_alertmanager_notification_latency_seconds{integration="victorops"} 0 + cortex_alertmanager_notification_latency_seconds{integration="webhook"} 0 + cortex_alertmanager_notification_latency_seconds{integration="wechat"} 0 + # HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications. 
+ # TYPE cortex_alertmanager_notifications_failed_total counter + cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0 + cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0 + cortex_alertmanager_notifications_failed_total{integration="hipchat",user="user1"} 0 + cortex_alertmanager_notifications_failed_total{integration="hipchat",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="hipchat",user="user3"} 0 + cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 0 + cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 0 + cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 0 + cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user3"} 0 + cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 0 + cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="pushover",user="user3"} 0 + cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 0 + cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="slack",user="user3"} 0 + cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 0 + cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="victorops",user="user3"} 0 + cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 0 + 
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="webhook",user="user3"} 0 + cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 0 + cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="wechat",user="user3"} 0 + # HELP cortex_alertmanager_notifications_total The total number of attempted notifications. + # TYPE cortex_alertmanager_notifications_total counter + cortex_alertmanager_notifications_total{integration="email",user="user1"} 0 + cortex_alertmanager_notifications_total{integration="email",user="user2"} 0 + cortex_alertmanager_notifications_total{integration="email",user="user3"} 0 + cortex_alertmanager_notifications_total{integration="hipchat",user="user1"} 0 + cortex_alertmanager_notifications_total{integration="hipchat",user="user2"} 0 + cortex_alertmanager_notifications_total{integration="hipchat",user="user3"} 0 + cortex_alertmanager_notifications_total{integration="opsgenie",user="user1"} 0 + cortex_alertmanager_notifications_total{integration="opsgenie",user="user2"} 0 + cortex_alertmanager_notifications_total{integration="opsgenie",user="user3"} 0 + cortex_alertmanager_notifications_total{integration="pagerduty",user="user1"} 0 + cortex_alertmanager_notifications_total{integration="pagerduty",user="user2"} 0 + cortex_alertmanager_notifications_total{integration="pagerduty",user="user3"} 0 + cortex_alertmanager_notifications_total{integration="pushover",user="user1"} 0 + cortex_alertmanager_notifications_total{integration="pushover",user="user2"} 0 + cortex_alertmanager_notifications_total{integration="pushover",user="user3"} 0 + cortex_alertmanager_notifications_total{integration="slack",user="user1"} 0 + cortex_alertmanager_notifications_total{integration="slack",user="user2"} 0 + 
cortex_alertmanager_notifications_total{integration="slack",user="user3"} 0 + cortex_alertmanager_notifications_total{integration="victorops",user="user1"} 0 + cortex_alertmanager_notifications_total{integration="victorops",user="user2"} 0 + cortex_alertmanager_notifications_total{integration="victorops",user="user3"} 0 + cortex_alertmanager_notifications_total{integration="webhook",user="user1"} 0 + cortex_alertmanager_notifications_total{integration="webhook",user="user2"} 0 + cortex_alertmanager_notifications_total{integration="webhook",user="user3"} 0 + cortex_alertmanager_notifications_total{integration="wechat",user="user1"} 0 + cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 0 + cortex_alertmanager_notifications_total{integration="wechat",user="user3"} 0 + # HELP cortex_alertmanager_silences How many silences by state. + # TYPE cortex_alertmanager_silences gauge + cortex_alertmanager_silences{state="active",user="user1"} 1 + cortex_alertmanager_silences{state="active",user="user2"} 10 + cortex_alertmanager_silences{state="active",user="user3"} 100 + cortex_alertmanager_silences{state="expired",user="user1"} 2 + cortex_alertmanager_silences{state="expired",user="user2"} 20 + cortex_alertmanager_silences{state="expired",user="user3"} 200 + cortex_alertmanager_silences{state="pending",user="user1"} 3 + cortex_alertmanager_silences{state="pending",user="user2"} 30 + cortex_alertmanager_silences{state="pending",user="user3"} 300 + # HELP cortex_alertmanager_silences_gc_duration_seconds Duration of the last silence garbage collection cycle. + # TYPE cortex_alertmanager_silences_gc_duration_seconds summary + cortex_alertmanager_silences_gc_duration_seconds_sum 111 + cortex_alertmanager_silences_gc_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_gossip_messages_propagated_total Number of received gossip messages that have been further gossiped. 
+ # TYPE cortex_alertmanager_silences_gossip_messages_propagated_total counter + cortex_alertmanager_silences_gossip_messages_propagated_total 111 + # HELP cortex_alertmanager_silences_queries_total How many silence queries were received. + # TYPE cortex_alertmanager_silences_queries_total counter + cortex_alertmanager_silences_queries_total 111 + # HELP cortex_alertmanager_silences_query_duration_seconds Duration of silence query evaluation. + # TYPE cortex_alertmanager_silences_query_duration_seconds histogram + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.005"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.01"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.025"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.05"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.1"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.25"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.5"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="1"} 1 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="2.5"} 1 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="5"} 1 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="10"} 2 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="+Inf"} 3 + cortex_alertmanager_silences_query_duration_seconds_sum 111 + cortex_alertmanager_silences_query_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_query_errors_total How many silence received queries did not succeed. + # TYPE cortex_alertmanager_silences_query_errors_total counter + cortex_alertmanager_silences_query_errors_total 111 + # HELP cortex_alertmanager_silences_snapshot_duration_seconds Duration of the last silence snapshot. 
+ # TYPE cortex_alertmanager_silences_snapshot_duration_seconds summary + cortex_alertmanager_silences_snapshot_duration_seconds_sum 111 + cortex_alertmanager_silences_snapshot_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes. + # TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge + cortex_alertmanager_silences_snapshot_size_bytes 111 + +`)) + require.NoError(t, err) +} + +func populateAlertmanager(base float64) *prometheus.Registry { + reg := prometheus.NewRegistry() + s := newSilenceMetrics(reg) + s.gcDuration.Observe(base) + s.snapshotDuration.Observe(base) + s.snapshotSize.Add(base) + s.queriesTotal.Add(base) + s.queryErrorsTotal.Add(base) + s.queryDuration.Observe(base) + s.propagatedMessagesTotal.Add(base) + s.silencesActive.Set(base) + s.silencesExpired.Set(base * 2) + s.silencesPending.Set(base * 3) + + n := newNflogMetrics(reg) + n.gcDuration.Observe(base) + n.snapshotDuration.Observe(base) + n.snapshotSize.Add(base) + n.queriesTotal.Add(base) + n.queryErrorsTotal.Add(base) + n.queryDuration.Observe(base) + n.propagatedMessagesTotal.Add(base) + + _ = newNotifyMetrics(reg) + + m := newMarkerMetrics(reg) + m.alerts.WithLabelValues(string(types.AlertStateActive)).Add(base) + m.alerts.WithLabelValues(string(types.AlertStateSuppressed)).Add(base * 2) + + v1APIMetrics := newAPIMetrics("v1", reg) + v1APIMetrics.firing.Add(base * 2) + v1APIMetrics.invalid.Add(base) + v1APIMetrics.resolved.Add(base * 3) + + v2APIMetrics := newAPIMetrics("v2", reg) + v2APIMetrics.firing.Add(base * 2) + v2APIMetrics.invalid.Add(base) + v2APIMetrics.resolved.Add(base * 3) + + return reg +} + +const ( + cacheTypePostings string = "Postings" + cacheTypeSeries string = "Series" +) + +// Copied from github.com/alertmanager/nflog/nflog.go +type nflogMetrics struct { + gcDuration prometheus.Summary + snapshotDuration prometheus.Summary + snapshotSize prometheus.Gauge + queriesTotal prometheus.Counter + 
queryErrorsTotal prometheus.Counter + queryDuration prometheus.Histogram + propagatedMessagesTotal prometheus.Counter +} + +func newNflogMetrics(r prometheus.Registerer) *nflogMetrics { + m := &nflogMetrics{} + + m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_nflog_gc_duration_seconds", + Help: "Duration of the last notification log garbage collection cycle.", + Objectives: map[float64]float64{}, + }) + m.snapshotDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_nflog_snapshot_duration_seconds", + Help: "Duration of the last notification log snapshot.", + Objectives: map[float64]float64{}, + }) + m.snapshotSize = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_nflog_snapshot_size_bytes", + Help: "Size of the last notification log snapshot in bytes.", + }) + m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_nflog_queries_total", + Help: "Number of notification log queries were received.", + }) + m.queryErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_nflog_query_errors_total", + Help: "Number notification log received queries that failed.", + }) + m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "alertmanager_nflog_query_duration_seconds", + Help: "Duration of notification log query evaluation.", + }) + m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_nflog_gossip_messages_propagated_total", + Help: "Number of received gossip messages that have been further gossiped.", + }) + + if r != nil { + r.MustRegister( + m.gcDuration, + m.snapshotDuration, + m.snapshotSize, + m.queriesTotal, + m.queryErrorsTotal, + m.queryDuration, + m.propagatedMessagesTotal, + ) + } + return m +} + +// Copied from github.com/alertmanager/silence/silence.go +type silenceMetrics struct { + gcDuration prometheus.Summary + snapshotDuration prometheus.Summary + snapshotSize 
prometheus.Gauge + queriesTotal prometheus.Counter + queryErrorsTotal prometheus.Counter + queryDuration prometheus.Histogram + silencesActive prometheus.Gauge + silencesPending prometheus.Gauge + silencesExpired prometheus.Gauge + propagatedMessagesTotal prometheus.Counter +} + +func newSilenceMetrics(r prometheus.Registerer) *silenceMetrics { + m := &silenceMetrics{} + + m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_silences_gc_duration_seconds", + Help: "Duration of the last silence garbage collection cycle.", + Objectives: map[float64]float64{}, + }) + m.snapshotDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_silences_snapshot_duration_seconds", + Help: "Duration of the last silence snapshot.", + Objectives: map[float64]float64{}, + }) + m.snapshotSize = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences_snapshot_size_bytes", + Help: "Size of the last silence snapshot in bytes.", + }) + m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_queries_total", + Help: "How many silence queries were received.", + }) + m.queryErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_query_errors_total", + Help: "How many silence received queries did not succeed.", + }) + m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "alertmanager_silences_query_duration_seconds", + Help: "Duration of silence query evaluation.", + }) + m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_gossip_messages_propagated_total", + Help: "Number of received gossip messages that have been further gossiped.", + }) + m.silencesActive = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences", + Help: "How many silences by state.", + ConstLabels: prometheus.Labels{"state": string(types.SilenceStateActive)}, + }) + m.silencesPending = 
prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences", + Help: "How many silences by state.", + ConstLabels: prometheus.Labels{"state": string(types.SilenceStatePending)}, + }) + m.silencesExpired = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences", + Help: "How many silences by state.", + ConstLabels: prometheus.Labels{"state": string(types.SilenceStateExpired)}, + }) + + if r != nil { + r.MustRegister( + m.gcDuration, + m.snapshotDuration, + m.snapshotSize, + m.queriesTotal, + m.queryErrorsTotal, + m.queryDuration, + m.silencesActive, + m.silencesPending, + m.silencesExpired, + m.propagatedMessagesTotal, + ) + } + return m +} + +// Copied from github.com/alertmanager/notify/notify.go +type notifyMetrics struct { + numNotifications *prometheus.CounterVec + numFailedNotifications *prometheus.CounterVec + notificationLatencySeconds *prometheus.HistogramVec +} + +func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics { + m := ¬ifyMetrics{ + numNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "alertmanager", + Name: "notifications_total", + Help: "The total number of attempted notifications.", + }, []string{"integration"}), + numFailedNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "alertmanager", + Name: "notifications_failed_total", + Help: "The total number of failed notifications.", + }, []string{"integration"}), + notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "alertmanager", + Name: "notification_latency_seconds", + Help: "The latency of notifications in seconds.", + Buckets: []float64{1, 5, 10, 15, 20}, + }, []string{"integration"}), + } + for _, integration := range []string{ + "email", + "hipchat", + "pagerduty", + "wechat", + "pushover", + "slack", + "opsgenie", + "webhook", + "victorops", + } { + m.numNotifications.WithLabelValues(integration) + m.numFailedNotifications.WithLabelValues(integration) + 
m.notificationLatencySeconds.WithLabelValues(integration) + } + r.MustRegister(m.numNotifications, m.numFailedNotifications, m.notificationLatencySeconds) + return m +} + +type markerMetrics struct { + alerts *prometheus.GaugeVec +} + +func newMarkerMetrics(r prometheus.Registerer) *markerMetrics { + m := &markerMetrics{ + alerts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "alertmanager_alerts", + Help: "How many alerts by state.", + }, []string{"state"}), + } + + r.MustRegister(m.alerts) + return m +} + +// Copied from github.com/alertmanager/api/metrics/metrics.go +type apiMetrics struct { + firing prometheus.Counter + resolved prometheus.Counter + invalid prometheus.Counter +} + +func newAPIMetrics(version string, r prometheus.Registerer) *apiMetrics { + numReceivedAlerts := prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "alertmanager_alerts_received_total", + Help: "The total number of received alerts.", + ConstLabels: prometheus.Labels{"version": version}, + }, []string{"status"}) + numInvalidAlerts := prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_alerts_invalid_total", + Help: "The total number of received alerts that were invalid.", + ConstLabels: prometheus.Labels{"version": version}, + }) + if r != nil { + r.MustRegister(numReceivedAlerts, numInvalidAlerts) + } + return &apiMetrics{ + firing: numReceivedAlerts.WithLabelValues("firing"), + resolved: numReceivedAlerts.WithLabelValues("resolved"), + invalid: numInvalidAlerts, + } +} diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 69df130f2e9..dc22ea3bcfa 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -140,6 +140,8 @@ type MultitenantAlertmanager struct { alertmanagersMtx sync.Mutex alertmanagers map[string]*Alertmanager + metrics *alertmanagerMetrics + latestConfig configs.ID latestMutex sync.RWMutex @@ -150,7 +152,7 @@ type MultitenantAlertmanager struct { } // NewMultitenantAlertmanager 
creates a new MultitenantAlertmanager. -func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, cfgCfg configs_client.Config) (*MultitenantAlertmanager, error) { +func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, cfgCfg configs_client.Config, registerer prometheus.Registerer) (*MultitenantAlertmanager, error) { err := os.MkdirAll(cfg.DataDir, 0777) if err != nil { return nil, fmt.Errorf("unable to create Alertmanager data directory %q: %s", cfg.DataDir, err) @@ -177,7 +179,7 @@ func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, cfgCfg confi if cfg.ClusterBindAddr != "" { peer, err = cluster.Create( log.With(util.Logger, "component", "cluster"), - prometheus.DefaultRegisterer, + registerer, cfg.ClusterBindAddr, cfg.ClusterAdvertiseAddr, cfg.Peers, @@ -204,10 +206,16 @@ func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, cfgCfg confi fallbackConfig: string(fallbackConfig), cfgs: map[string]configs.Config{}, alertmanagers: map[string]*Alertmanager{}, + metrics: newAlertmanagerMetrics(), peer: peer, stop: make(chan struct{}), done: make(chan struct{}), } + + if registerer != nil { + registerer.MustRegister(am.metrics) + } + return am, nil } @@ -431,6 +439,8 @@ func (am *MultitenantAlertmanager) deleteUser(userID string) { } func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amconfig.Config) (*Alertmanager, error) { + reg := prometheus.NewRegistry() + am.metrics.addUserRegistry(userID, reg) newAM, err := New(&Config{ UserID: userID, DataDir: am.cfg.DataDir, @@ -439,7 +449,7 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amco PeerTimeout: am.cfg.PeerTimeout, Retention: am.cfg.Retention, ExternalURL: am.cfg.ExternalURL.URL, - }) + }, reg) if err != nil { return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err) } diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 2ef73b6e021..06fd1f77fa1 100644 --- 
a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -478,7 +478,7 @@ func (t *Cortex) stopConfigs() error { } func (t *Cortex) initAlertmanager(cfg *Config) (err error) { - t.alertmanager, err = alertmanager.NewMultitenantAlertmanager(&cfg.Alertmanager, cfg.ConfigStore) + t.alertmanager, err = alertmanager.NewMultitenantAlertmanager(&cfg.Alertmanager, cfg.ConfigStore, prometheus.DefaultRegisterer) if err != nil { return err } diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index 234b154ebd6..7ffa6e6c9a7 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -150,6 +150,21 @@ func (d MetricFamiliesPerUser) SendSumOfCountersPerUser(out chan<- prometheus.Me } } +// SendSumOfCountersPerUserWithLabels provides metrics with the provided label names on a per-user basis. This function assumes that `user` is the +// first label on the provided metric Desc +func (d MetricFamiliesPerUser) SendSumOfCountersPerUserWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, metric string, labelNames ...string) { + for user, userMetrics := range d { + metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[metric], labelNames) + for _, mlv := range metricsPerLabelValue { + for _, m := range mlv.metrics { + val := counterValue(m) + labels := append([]string{user}, mlv.labelValues...) + out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, val, labels...) + } + } + } +} + func (d MetricFamiliesPerUser) SendSumOfGauges(out chan<- prometheus.Metric, desc *prometheus.Desc, gauge string) { result := float64(0) for _, userMetrics := range d { @@ -162,6 +177,21 @@ func (d MetricFamiliesPerUser) SendSumOfGaugesWithLabels(out chan<- prometheus.M d.sumOfSingleValuesWithLabels(gauge, gaugeValue, labelNames).WriteToMetricChannel(out, desc, prometheus.GaugeValue) } +// SendSumOfGaugesPerUserWithLabels provides metrics with the provided label names on a per-user basis. 
This function assumes that `user` is the +// first label on the provided metric Desc +func (d MetricFamiliesPerUser) SendSumOfGaugesPerUserWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, metric string, labelNames ...string) { + for user, userMetrics := range d { + metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[metric], labelNames) + for _, mlv := range metricsPerLabelValue { + for _, m := range mlv.metrics { + val := gaugeValue(m) + labels := append([]string{user}, mlv.labelValues...) + out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, val, labels...) + } + } + } +} + func (d MetricFamiliesPerUser) sumOfSingleValuesWithLabels(metric string, fn func(*dto.Metric) float64, labelNames []string) singleValueWithLabelsMap { result := singleValueWithLabelsMap{} for _, userMetrics := range d { From 3c539f4655c79678e920c53c4f16956151206b17 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Tue, 11 Feb 2020 23:53:07 -0500 Subject: [PATCH 2/6] update changelog Signed-off-by: Jacob Lisi --- CHANGELOG.md | 1 + pkg/alertmanager/alertmanager.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 15bbe1e849e..43f71be28e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ * `--experimental.distributor.user-subring-size` * [FEATURE] Added flag `-experimental.ruler.enable-api` to enable the ruler api which implements the Prometheus API `/api/v1/rules` and `/api/v1/alerts` endpoints under the configured `-http.prefix`. #1999 * [FEATURE] Added sharding support to compactor when using the experimental TSDB blocks storage. #2113 +* [ENHANCEMENT] Alertmanager: Expose Per-tenant alertmanager metrics #2116 * [ENHANCEMENT] Experimental TSDB: Export TSDB Syncer metrics from Compactor component, they are prefixed with `cortex_compactor_`. 
#2023 * [ENHANCEMENT] Experimental TSDB: Added dedicated flag `-experimental.tsdb.bucket-store.tenant-sync-concurrency` to configure the maximum number of concurrent tenants for which blocks are synched. #2026 * [ENHANCEMENT] Experimental TSDB: Expose metrics for objstore operations (prefixed with `cortex__thanos_objstore_`, component being one of `ingester`, `querier` and `compactor`). #2027 diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index e1ae4830bb5..ad88790b3b3 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -107,7 +107,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { return nil, fmt.Errorf("failed to create notification log: %v", err) } if cfg.Peer != nil { - c := cfg.Peer.AddState("nfl:"+cfg.UserID, am.nflog, am.registry) // TODO add gossip metrics + c := cfg.Peer.AddState("nfl:"+cfg.UserID, am.nflog, am.registry) am.nflog.SetBroadcast(c.Broadcast) } From 004565d51daaaca0d368f5875ce9d4cc2a912df7 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Wed, 12 Feb 2020 11:16:18 -0500 Subject: [PATCH 3/6] fix linting errors Signed-off-by: Jacob Lisi --- pkg/alertmanager/alertmanager_metrics_test.go | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go index 21d2a5b5c07..4942fad816b 100644 --- a/pkg/alertmanager/alertmanager_metrics_test.go +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -29,6 +29,7 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_alerts{state="suppressed",user="user1"} 2 cortex_alertmanager_alerts{state="suppressed",user="user2"} 20 cortex_alertmanager_alerts{state="suppressed",user="user3"} 200 + # HELP cortex_alertmanager_alerts_invalid_total The total number of received alerts that were invalid. 
# TYPE cortex_alertmanager_alerts_invalid_total counter cortex_alertmanager_alerts_invalid_total{user="user1",version="v1"} 1 @@ -37,6 +38,7 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_alerts_invalid_total{user="user2",version="v2"} 10 cortex_alertmanager_alerts_invalid_total{user="user3",version="v1"} 100 cortex_alertmanager_alerts_invalid_total{user="user3",version="v2"} 100 + # HELP cortex_alertmanager_alerts_received_total The total number of received alerts. # TYPE cortex_alertmanager_alerts_received_total counter cortex_alertmanager_alerts_received_total{status="firing",user="user1",version="v1"} 2 @@ -51,16 +53,20 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_alerts_received_total{status="resolved",user="user2",version="v2"} 30 cortex_alertmanager_alerts_received_total{status="resolved",user="user3",version="v1"} 300 cortex_alertmanager_alerts_received_total{status="resolved",user="user3",version="v2"} 300 + # HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle. # TYPE cortex_alertmanager_nflog_gc_duration_seconds summary cortex_alertmanager_nflog_gc_duration_seconds_sum 111 cortex_alertmanager_nflog_gc_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_gossip_messages_propagated_total Number of received gossip messages that have been further gossiped. # TYPE cortex_alertmanager_nflog_gossip_messages_propagated_total counter cortex_alertmanager_nflog_gossip_messages_propagated_total 111 + # HELP cortex_alertmanager_nflog_queries_total Number of notification log queries were received. # TYPE cortex_alertmanager_nflog_queries_total counter cortex_alertmanager_nflog_queries_total 111 + # HELP cortex_alertmanager_nflog_query_duration_seconds Duration of notification log query evaluation. 
# TYPE cortex_alertmanager_nflog_query_duration_seconds histogram cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.005"} 0 @@ -77,16 +83,20 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_nflog_query_duration_seconds_bucket{le="+Inf"} 3 cortex_alertmanager_nflog_query_duration_seconds_sum 111 cortex_alertmanager_nflog_query_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_query_errors_total Number notification log received queries that failed. # TYPE cortex_alertmanager_nflog_query_errors_total counter cortex_alertmanager_nflog_query_errors_total 111 + # HELP cortex_alertmanager_nflog_snapshot_duration_seconds Duration of the last notification log snapshot. # TYPE cortex_alertmanager_nflog_snapshot_duration_seconds summary cortex_alertmanager_nflog_snapshot_duration_seconds_sum 111 cortex_alertmanager_nflog_snapshot_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_snapshot_size_bytes Size of the last notification log snapshot in bytes. # TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge cortex_alertmanager_nflog_snapshot_size_bytes 111 + # HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds. # TYPE cortex_alertmanager_notification_latency_seconds counter cortex_alertmanager_notification_latency_seconds{integration="email"} 0 @@ -98,6 +108,7 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_notification_latency_seconds{integration="victorops"} 0 cortex_alertmanager_notification_latency_seconds{integration="webhook"} 0 cortex_alertmanager_notification_latency_seconds{integration="wechat"} 0 + # HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications. 
# TYPE cortex_alertmanager_notifications_failed_total counter cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0 @@ -127,6 +138,7 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 0 cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 0 cortex_alertmanager_notifications_failed_total{integration="wechat",user="user3"} 0 + # HELP cortex_alertmanager_notifications_total The total number of attempted notifications. # TYPE cortex_alertmanager_notifications_total counter cortex_alertmanager_notifications_total{integration="email",user="user1"} 0 @@ -156,6 +168,7 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_notifications_total{integration="wechat",user="user1"} 0 cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 0 cortex_alertmanager_notifications_total{integration="wechat",user="user3"} 0 + # HELP cortex_alertmanager_silences How many silences by state. # TYPE cortex_alertmanager_silences gauge cortex_alertmanager_silences{state="active",user="user1"} 1 @@ -167,16 +180,20 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_silences{state="pending",user="user1"} 3 cortex_alertmanager_silences{state="pending",user="user2"} 30 cortex_alertmanager_silences{state="pending",user="user3"} 300 + # HELP cortex_alertmanager_silences_gc_duration_seconds Duration of the last silence garbage collection cycle. # TYPE cortex_alertmanager_silences_gc_duration_seconds summary cortex_alertmanager_silences_gc_duration_seconds_sum 111 cortex_alertmanager_silences_gc_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_gossip_messages_propagated_total Number of received gossip messages that have been further gossiped. 
# TYPE cortex_alertmanager_silences_gossip_messages_propagated_total counter cortex_alertmanager_silences_gossip_messages_propagated_total 111 + # HELP cortex_alertmanager_silences_queries_total How many silence queries were received. # TYPE cortex_alertmanager_silences_queries_total counter cortex_alertmanager_silences_queries_total 111 + # HELP cortex_alertmanager_silences_query_duration_seconds Duration of silence query evaluation. # TYPE cortex_alertmanager_silences_query_duration_seconds histogram cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.005"} 0 @@ -193,13 +210,16 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_silences_query_duration_seconds_bucket{le="+Inf"} 3 cortex_alertmanager_silences_query_duration_seconds_sum 111 cortex_alertmanager_silences_query_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_query_errors_total How many silence received queries did not succeed. # TYPE cortex_alertmanager_silences_query_errors_total counter cortex_alertmanager_silences_query_errors_total 111 + # HELP cortex_alertmanager_silences_snapshot_duration_seconds Duration of the last silence snapshot. # TYPE cortex_alertmanager_silences_snapshot_duration_seconds summary cortex_alertmanager_silences_snapshot_duration_seconds_sum 111 cortex_alertmanager_silences_snapshot_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes. 
# TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge cortex_alertmanager_silences_snapshot_size_bytes 111 @@ -250,11 +270,6 @@ func populateAlertmanager(base float64) *prometheus.Registry { return reg } -const ( - cacheTypePostings string = "Postings" - cacheTypeSeries string = "Series" -) - // Copied from github.com/alertmanager/nflog/nflog.go type nflogMetrics struct { gcDuration prometheus.Summary From 1bd3599053ac50c53e16ea5c114b0e80d6cb478e Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Wed, 12 Feb 2020 15:14:13 -0500 Subject: [PATCH 4/6] update SendSumOfGaugesPerUserWithLabels and add unit test Signed-off-by: Jacob Lisi --- pkg/util/metrics_helper.go | 22 ++----- pkg/util/metrics_helper_test.go | 108 ++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 18 deletions(-) diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index 7ffa6e6c9a7..7b2cf4a2fbd 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -150,21 +150,6 @@ func (d MetricFamiliesPerUser) SendSumOfCountersPerUser(out chan<- prometheus.Me } } -// SendSumOfCountersPerUserWithLabels provides metrics with the provided label names on a per-user basis. This function assumes that `user` is the -// first label on the provided metric Desc -func (d MetricFamiliesPerUser) SendSumOfCountersPerUserWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, metric string, labelNames ...string) { - for user, userMetrics := range d { - metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[metric], labelNames) - for _, mlv := range metricsPerLabelValue { - for _, m := range mlv.metrics { - val := counterValue(m) - labels := append([]string{user}, mlv.labelValues...) - out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, val, labels...) 
- } - } - } -} - func (d MetricFamiliesPerUser) SendSumOfGauges(out chan<- prometheus.Metric, desc *prometheus.Desc, gauge string) { result := float64(0) for _, userMetrics := range d { @@ -183,11 +168,12 @@ func (d MetricFamiliesPerUser) SendSumOfGaugesPerUserWithLabels(out chan<- prome for user, userMetrics := range d { metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[metric], labelNames) for _, mlv := range metricsPerLabelValue { + var val float64 + labels := append([]string{user}, mlv.labelValues...) for _, m := range mlv.metrics { - val := gaugeValue(m) - labels := append([]string{user}, mlv.labelValues...) - out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, val, labels...) + val += gaugeValue(m) } + out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, val, labels...) } } } diff --git a/pkg/util/metrics_helper_test.go b/pkg/util/metrics_helper_test.go index 2b4b3f6ab81..530f5edf324 100644 --- a/pkg/util/metrics_helper_test.go +++ b/pkg/util/metrics_helper_test.go @@ -4,6 +4,7 @@ import ( "testing" "github.com/gogo/protobuf/proto" + "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/require" ) @@ -81,3 +82,110 @@ func makeLabels(namesAndValues ...string) []*dto.LabelPair { return out } + +// TestSendSumOfGaugesPerUserWithLabels tests to ensure multiple metrics for the same user with a matching label are +// summed correctly +func TestSendSumOfGaugesPerUserWithLabels(t *testing.T) { + metricName := "test_metric" + metricType := dto.MetricType_GAUGE + labelOneName := "label_one" + labelTwoName := "label_two" + labelValueOne := "a" + labelValueTwo := "b" + ts := int64(1000) + val := float64(100) + + desc := prometheus.NewDesc( + metricName, + "", + []string{"user", labelOneName}, nil) + + baseFamily := MetricFamilyMap{ + "test_metric": &dto.MetricFamily{ + Name: &metricName, + Type: &metricType, + Metric: []*dto.Metric{ + &dto.Metric{ + Label: 
[]*dto.LabelPair{ + &dto.LabelPair{ + Name: &labelOneName, + Value: &labelValueOne, + }, + &dto.LabelPair{ + Name: &labelTwoName, + Value: &labelValueOne, + }, + }, + Gauge: &dto.Gauge{ + Value: &val, + }, + TimestampMs: &ts, + }, + &dto.Metric{ + Label: []*dto.LabelPair{ + &dto.LabelPair{ + Name: &labelOneName, + Value: &labelValueOne, + }, + &dto.LabelPair{ + Name: &labelTwoName, + Value: &labelValueTwo, + }, + }, + Gauge: &dto.Gauge{ + Value: &val, + }, + TimestampMs: &ts, + }, + }, + }, + } + + m := MetricFamiliesPerUser{ + "user1": baseFamily, + "user2": baseFamily, + } + out := make(chan prometheus.Metric, 10) + + go func() { + m.SendSumOfGaugesPerUserWithLabels(out, desc, metricName, labelOneName) + close(out) + }() + + userOneMet := <-out + userOneBuf := &dto.Metric{} + err := userOneMet.Write(userOneBuf) + require.NoError(t, err) + userOneSummedVal := userOneBuf.GetGauge().GetValue() + require.Equal(t, val*2, userOneSummedVal) + for _, l := range userOneBuf.GetLabel() { + switch l.GetName() { + case "user": + require.Equal(t, "user1", l.GetValue()) + case labelOneName: + require.Equal(t, labelValueOne, l.GetValue()) + default: + require.FailNow(t, "unexpected label="+l.GetName()) + } + } + + userTwoMet := <-out + userTwoBuf := &dto.Metric{} + err = userTwoMet.Write(userTwoBuf) + require.NoError(t, err) + userTwoSummedVal := userTwoBuf.GetGauge().GetValue() + require.Equal(t, val*2, userTwoSummedVal) + for _, l := range userTwoBuf.GetLabel() { + switch l.GetName() { + case "user": + require.Equal(t, "user2", l.GetValue()) + case labelOneName: + require.Equal(t, labelValueOne, l.GetValue()) + default: + require.FailNow(t, "unexpected label="+l.GetName()) + } + } + + _, closed := <-out + require.False(t, closed) +} From 8291d66cbf3336422324128c56d9baa193c5fd56 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Wed, 12 Feb 2020 15:23:06 -0500 Subject: [PATCH 5/6] refactor per PR comments Signed-off-by: Jacob Lisi --- pkg/alertmanager/alertmanager_metrics.go | 21 
++- pkg/alertmanager/alertmanager_metrics_test.go | 150 +++++------------- 2 files changed, 46 insertions(+), 125 deletions(-) diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go index 3c079649ce7..98a61bf82c0 100644 --- a/pkg/alertmanager/alertmanager_metrics.go +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -53,23 +53,23 @@ func newAlertmanagerMetrics() *alertmanagerMetrics { alertsReceived: prometheus.NewDesc( "cortex_alertmanager_alerts_received_total", "The total number of received alerts.", - []string{"user", "version", "status"}, nil), + nil, nil), alertsInvalid: prometheus.NewDesc( "cortex_alertmanager_alerts_invalid_total", "The total number of received alerts that were invalid.", - []string{"user", "version"}, nil), + nil, nil), numNotifications: prometheus.NewDesc( "cortex_alertmanager_notifications_total", "The total number of attempted notifications.", - []string{"user", "integration"}, nil), + []string{"user"}, nil), numFailedNotifications: prometheus.NewDesc( "cortex_alertmanager_notifications_failed_total", "The total number of failed notifications.", - []string{"user", "integration"}, nil), + []string{"user"}, nil), notificationLatencySeconds: prometheus.NewDesc( "cortex_alertmanager_notification_latency_seconds", "The latency of notifications in seconds.", - []string{"integration"}, nil), + nil, nil), nflogGCDuration: prometheus.NewDesc( "cortex_alertmanager_nflog_gc_duration_seconds", "Duration of the last notification log garbage collection cycle.", @@ -182,13 +182,12 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data := util.BuildMetricFamiliesPerUserFromUserRegistries(m.registries()) - data.SendSumOfCountersPerUserWithLabels(out, m.alertsReceived, "alertmanager_alerts_received_total", "version", "status") - data.SendSumOfCountersPerUserWithLabels(out, m.alertsInvalid, 
"alertmanager_alerts_invalid_total", "version") - - data.SendSumOfCountersPerUserWithLabels(out, m.numNotifications, "alertmanager_notifications_total", "integration") - data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration") - data.SendSumOfCountersWithLabels(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds", "integration") + data.SendSumOfCounters(out, m.alertsReceived, "alertmanager_alerts_received_total") + data.SendSumOfCounters(out, m.alertsInvalid, "alertmanager_alerts_invalid_total") + data.SendSumOfCountersPerUser(out, m.numNotifications, "alertmanager_notifications_total") + data.SendSumOfCountersPerUser(out, m.numFailedNotifications, "alertmanager_notifications_failed_total") + data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds") data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state") data.SendSumOfSummaries(out, m.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds") diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go index 4942fad816b..bd893c646ba 100644 --- a/pkg/alertmanager/alertmanager_metrics_test.go +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -10,6 +10,18 @@ import ( "github.com/stretchr/testify/require" ) +var integrations = []string{ + "email", + "hipchat", + "pagerduty", + "wechat", + "pushover", + "slack", + "opsgenie", + "webhook", + "victorops", +} + func TestAlertmanagerMetricsStore(t *testing.T) { mainReg := prometheus.NewPedanticRegistry() @@ -29,44 +41,22 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_alerts{state="suppressed",user="user1"} 2 cortex_alertmanager_alerts{state="suppressed",user="user2"} 20 cortex_alertmanager_alerts{state="suppressed",user="user3"} 200 - # HELP cortex_alertmanager_alerts_invalid_total The total number of received alerts that 
were invalid. # TYPE cortex_alertmanager_alerts_invalid_total counter - cortex_alertmanager_alerts_invalid_total{user="user1",version="v1"} 1 - cortex_alertmanager_alerts_invalid_total{user="user1",version="v2"} 1 - cortex_alertmanager_alerts_invalid_total{user="user2",version="v1"} 10 - cortex_alertmanager_alerts_invalid_total{user="user2",version="v2"} 10 - cortex_alertmanager_alerts_invalid_total{user="user3",version="v1"} 100 - cortex_alertmanager_alerts_invalid_total{user="user3",version="v2"} 100 - + cortex_alertmanager_alerts_invalid_total 222 # HELP cortex_alertmanager_alerts_received_total The total number of received alerts. # TYPE cortex_alertmanager_alerts_received_total counter - cortex_alertmanager_alerts_received_total{status="firing",user="user1",version="v1"} 2 - cortex_alertmanager_alerts_received_total{status="firing",user="user1",version="v2"} 2 - cortex_alertmanager_alerts_received_total{status="firing",user="user2",version="v1"} 20 - cortex_alertmanager_alerts_received_total{status="firing",user="user2",version="v2"} 20 - cortex_alertmanager_alerts_received_total{status="firing",user="user3",version="v1"} 200 - cortex_alertmanager_alerts_received_total{status="firing",user="user3",version="v2"} 200 - cortex_alertmanager_alerts_received_total{status="resolved",user="user1",version="v1"} 3 - cortex_alertmanager_alerts_received_total{status="resolved",user="user1",version="v2"} 3 - cortex_alertmanager_alerts_received_total{status="resolved",user="user2",version="v1"} 30 - cortex_alertmanager_alerts_received_total{status="resolved",user="user2",version="v2"} 30 - cortex_alertmanager_alerts_received_total{status="resolved",user="user3",version="v1"} 300 - cortex_alertmanager_alerts_received_total{status="resolved",user="user3",version="v2"} 300 - + cortex_alertmanager_alerts_received_total 1110 # HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle. 
# TYPE cortex_alertmanager_nflog_gc_duration_seconds summary cortex_alertmanager_nflog_gc_duration_seconds_sum 111 cortex_alertmanager_nflog_gc_duration_seconds_count 3 - # HELP cortex_alertmanager_nflog_gossip_messages_propagated_total Number of received gossip messages that have been further gossiped. # TYPE cortex_alertmanager_nflog_gossip_messages_propagated_total counter cortex_alertmanager_nflog_gossip_messages_propagated_total 111 - # HELP cortex_alertmanager_nflog_queries_total Number of notification log queries were received. # TYPE cortex_alertmanager_nflog_queries_total counter cortex_alertmanager_nflog_queries_total 111 - # HELP cortex_alertmanager_nflog_query_duration_seconds Duration of notification log query evaluation. # TYPE cortex_alertmanager_nflog_query_duration_seconds histogram cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.005"} 0 @@ -83,92 +73,36 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_nflog_query_duration_seconds_bucket{le="+Inf"} 3 cortex_alertmanager_nflog_query_duration_seconds_sum 111 cortex_alertmanager_nflog_query_duration_seconds_count 3 - # HELP cortex_alertmanager_nflog_query_errors_total Number notification log received queries that failed. # TYPE cortex_alertmanager_nflog_query_errors_total counter cortex_alertmanager_nflog_query_errors_total 111 - # HELP cortex_alertmanager_nflog_snapshot_duration_seconds Duration of the last notification log snapshot. # TYPE cortex_alertmanager_nflog_snapshot_duration_seconds summary cortex_alertmanager_nflog_snapshot_duration_seconds_sum 111 cortex_alertmanager_nflog_snapshot_duration_seconds_count 3 - # HELP cortex_alertmanager_nflog_snapshot_size_bytes Size of the last notification log snapshot in bytes. # TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge cortex_alertmanager_nflog_snapshot_size_bytes 111 - # HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds. 
- # TYPE cortex_alertmanager_notification_latency_seconds counter - cortex_alertmanager_notification_latency_seconds{integration="email"} 0 - cortex_alertmanager_notification_latency_seconds{integration="hipchat"} 0 - cortex_alertmanager_notification_latency_seconds{integration="opsgenie"} 0 - cortex_alertmanager_notification_latency_seconds{integration="pagerduty"} 0 - cortex_alertmanager_notification_latency_seconds{integration="pushover"} 0 - cortex_alertmanager_notification_latency_seconds{integration="slack"} 0 - cortex_alertmanager_notification_latency_seconds{integration="victorops"} 0 - cortex_alertmanager_notification_latency_seconds{integration="webhook"} 0 - cortex_alertmanager_notification_latency_seconds{integration="wechat"} 0 - + # TYPE cortex_alertmanager_notification_latency_seconds histogram + cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 15 + cortex_alertmanager_notification_latency_seconds_bucket{le="5"} 21 + cortex_alertmanager_notification_latency_seconds_bucket{le="10"} 23 + cortex_alertmanager_notification_latency_seconds_bucket{le="15"} 25 + cortex_alertmanager_notification_latency_seconds_bucket{le="20"} 27 + cortex_alertmanager_notification_latency_seconds_bucket{le="+Inf"} 27 + cortex_alertmanager_notification_latency_seconds_sum 99.9 + cortex_alertmanager_notification_latency_seconds_count 27 # HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications. 
# TYPE cortex_alertmanager_notifications_failed_total counter - cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0 - cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0 - cortex_alertmanager_notifications_failed_total{integration="hipchat",user="user1"} 0 - cortex_alertmanager_notifications_failed_total{integration="hipchat",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="hipchat",user="user3"} 0 - cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 0 - cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 0 - cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 0 - cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user3"} 0 - cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 0 - cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="pushover",user="user3"} 0 - cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 0 - cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="slack",user="user3"} 0 - cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 0 - cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="victorops",user="user3"} 0 - cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 0 - 
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="webhook",user="user3"} 0 - cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 0 - cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="wechat",user="user3"} 0 - + cortex_alertmanager_notifications_failed_total{user="user1"} 36 + cortex_alertmanager_notifications_failed_total{user="user2"} 360 + cortex_alertmanager_notifications_failed_total{user="user3"} 3600 # HELP cortex_alertmanager_notifications_total The total number of attempted notifications. # TYPE cortex_alertmanager_notifications_total counter - cortex_alertmanager_notifications_total{integration="email",user="user1"} 0 - cortex_alertmanager_notifications_total{integration="email",user="user2"} 0 - cortex_alertmanager_notifications_total{integration="email",user="user3"} 0 - cortex_alertmanager_notifications_total{integration="hipchat",user="user1"} 0 - cortex_alertmanager_notifications_total{integration="hipchat",user="user2"} 0 - cortex_alertmanager_notifications_total{integration="hipchat",user="user3"} 0 - cortex_alertmanager_notifications_total{integration="opsgenie",user="user1"} 0 - cortex_alertmanager_notifications_total{integration="opsgenie",user="user2"} 0 - cortex_alertmanager_notifications_total{integration="opsgenie",user="user3"} 0 - cortex_alertmanager_notifications_total{integration="pagerduty",user="user1"} 0 - cortex_alertmanager_notifications_total{integration="pagerduty",user="user2"} 0 - cortex_alertmanager_notifications_total{integration="pagerduty",user="user3"} 0 - cortex_alertmanager_notifications_total{integration="pushover",user="user1"} 0 - cortex_alertmanager_notifications_total{integration="pushover",user="user2"} 0 - cortex_alertmanager_notifications_total{integration="pushover",user="user3"} 0 - 
cortex_alertmanager_notifications_total{integration="slack",user="user1"} 0 - cortex_alertmanager_notifications_total{integration="slack",user="user2"} 0 - cortex_alertmanager_notifications_total{integration="slack",user="user3"} 0 - cortex_alertmanager_notifications_total{integration="victorops",user="user1"} 0 - cortex_alertmanager_notifications_total{integration="victorops",user="user2"} 0 - cortex_alertmanager_notifications_total{integration="victorops",user="user3"} 0 - cortex_alertmanager_notifications_total{integration="webhook",user="user1"} 0 - cortex_alertmanager_notifications_total{integration="webhook",user="user2"} 0 - cortex_alertmanager_notifications_total{integration="webhook",user="user3"} 0 - cortex_alertmanager_notifications_total{integration="wechat",user="user1"} 0 - cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 0 - cortex_alertmanager_notifications_total{integration="wechat",user="user3"} 0 - + cortex_alertmanager_notifications_total{user="user1"} 36 + cortex_alertmanager_notifications_total{user="user2"} 360 + cortex_alertmanager_notifications_total{user="user3"} 3600 # HELP cortex_alertmanager_silences How many silences by state. # TYPE cortex_alertmanager_silences gauge cortex_alertmanager_silences{state="active",user="user1"} 1 @@ -180,20 +114,16 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_silences{state="pending",user="user1"} 3 cortex_alertmanager_silences{state="pending",user="user2"} 30 cortex_alertmanager_silences{state="pending",user="user3"} 300 - # HELP cortex_alertmanager_silences_gc_duration_seconds Duration of the last silence garbage collection cycle. # TYPE cortex_alertmanager_silences_gc_duration_seconds summary cortex_alertmanager_silences_gc_duration_seconds_sum 111 cortex_alertmanager_silences_gc_duration_seconds_count 3 - # HELP cortex_alertmanager_silences_gossip_messages_propagated_total Number of received gossip messages that have been further gossiped. 
# TYPE cortex_alertmanager_silences_gossip_messages_propagated_total counter cortex_alertmanager_silences_gossip_messages_propagated_total 111 - # HELP cortex_alertmanager_silences_queries_total How many silence queries were received. # TYPE cortex_alertmanager_silences_queries_total counter cortex_alertmanager_silences_queries_total 111 - # HELP cortex_alertmanager_silences_query_duration_seconds Duration of silence query evaluation. # TYPE cortex_alertmanager_silences_query_duration_seconds histogram cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.005"} 0 @@ -210,16 +140,13 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_silences_query_duration_seconds_bucket{le="+Inf"} 3 cortex_alertmanager_silences_query_duration_seconds_sum 111 cortex_alertmanager_silences_query_duration_seconds_count 3 - # HELP cortex_alertmanager_silences_query_errors_total How many silence received queries did not succeed. # TYPE cortex_alertmanager_silences_query_errors_total counter cortex_alertmanager_silences_query_errors_total 111 - # HELP cortex_alertmanager_silences_snapshot_duration_seconds Duration of the last silence snapshot. # TYPE cortex_alertmanager_silences_snapshot_duration_seconds summary cortex_alertmanager_silences_snapshot_duration_seconds_sum 111 cortex_alertmanager_silences_snapshot_duration_seconds_count 3 - # HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes. 
# TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge cortex_alertmanager_silences_snapshot_size_bytes 111 @@ -251,7 +178,12 @@ func populateAlertmanager(base float64) *prometheus.Registry { n.queryDuration.Observe(base) n.propagatedMessagesTotal.Add(base) - _ = newNotifyMetrics(reg) + nm := newNotifyMetrics(reg) + for i, integration := range integrations { + nm.numNotifications.WithLabelValues(integration).Add(base * float64(i)) + nm.numFailedNotifications.WithLabelValues(integration).Add(base * float64(i)) + nm.notificationLatencySeconds.WithLabelValues(integration).Observe(base * float64(i) * 0.025) + } m := newMarkerMetrics(reg) m.alerts.WithLabelValues(string(types.AlertStateActive)).Add(base) @@ -435,17 +367,7 @@ func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics { Buckets: []float64{1, 5, 10, 15, 20}, }, []string{"integration"}), } - for _, integration := range []string{ - "email", - "hipchat", - "pagerduty", - "wechat", - "pushover", - "slack", - "opsgenie", - "webhook", - "victorops", - } { + for _, integration := range integrations { m.numNotifications.WithLabelValues(integration) m.numFailedNotifications.WithLabelValues(integration) m.notificationLatencySeconds.WithLabelValues(integration) From d2580b573ba9a8c8004481c5c39d306c0d207e47 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Wed, 12 Feb 2020 15:36:04 -0500 Subject: [PATCH 6/6] goformat Signed-off-by: Jacob Lisi --- pkg/util/metrics_helper_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/util/metrics_helper_test.go b/pkg/util/metrics_helper_test.go index 530f5edf324..70af8af496b 100644 --- a/pkg/util/metrics_helper_test.go +++ b/pkg/util/metrics_helper_test.go @@ -105,13 +105,13 @@ func TestSendSumOfGaugesPerUserWithLabels(t *testing.T) { Name: &metricName, Type: &metricType, Metric: []*dto.Metric{ - &dto.Metric{ + { Label: []*dto.LabelPair{ - &dto.LabelPair{ + { Name: &labelOneName, Value: &labelValueOne, }, - &dto.LabelPair{ + { Name: 
&labelTwoName, Value: &labelValueOne, }, @@ -121,13 +121,13 @@ func TestSendSumOfGaugesPerUserWithLabels(t *testing.T) { }, TimestampMs: &ts, }, - &dto.Metric{ + { Label: []*dto.LabelPair{ - &dto.LabelPair{ + { Name: &labelOneName, Value: &labelValueOne, }, - &dto.LabelPair{ + { Name: &labelTwoName, Value: &labelValueTwo, },