diff --git a/CHANGELOG.md b/CHANGELOG.md index 15bbe1e849e..43f71be28e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ * `--experimental.distributor.user-subring-size` * [FEATURE] Added flag `-experimental.ruler.enable-api` to enable the ruler api which implements the Prometheus API `/api/v1/rules` and `/api/v1/alerts` endpoints under the configured `-http.prefix`. #1999 * [FEATURE] Added sharding support to compactor when using the experimental TSDB blocks storage. #2113 +* [ENHANCEMENT] Alertmanager: Expose per-tenant alertmanager metrics. #2116 * [ENHANCEMENT] Experimental TSDB: Export TSDB Syncer metrics from Compactor component, they are prefixed with `cortex_compactor_`. #2023 * [ENHANCEMENT] Experimental TSDB: Added dedicated flag `-experimental.tsdb.bucket-store.tenant-sync-concurrency` to configure the maximum number of concurrent tenants for which blocks are synched. #2026 * [ENHANCEMENT] Experimental TSDB: Expose metrics for objstore operations (prefixed with `cortex_<component>_thanos_objstore_`, component being one of `ingester`, `querier` and `compactor`). #2027 diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 6371de0b3bb..ad88790b3b3 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -84,17 +84,14 @@ func init() { } // New creates a new Alertmanager. -func New(cfg *Config) (*Alertmanager, error) { +func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am := &Alertmanager{ cfg: cfg, logger: log.With(cfg.Logger, "user", cfg.UserID), stop: make(chan struct{}), } - // TODO(cortex): Build a registry that can merge metrics from multiple users. - // For now, these metrics are ignored, as we can't register the same - // metric twice with a single registry. 
- am.registry = prometheus.NewRegistry() + am.registry = reg am.wg.Add(1) nflogID := fmt.Sprintf("nflog:%s", cfg.UserID) diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go new file mode 100644 index 00000000000..98a61bf82c0 --- /dev/null +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -0,0 +1,209 @@ +package alertmanager + +import ( + "sync" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/cortexproject/cortex/pkg/util" +) + +// alertmanagerMetrics aggregates metrics exported by Alertmanager +// and re-exports those aggregates as Cortex metrics. +type alertmanagerMetrics struct { + // regs maps userID -> registry; access is guarded by regsMu. + regsMu sync.Mutex + regs map[string]*prometheus.Registry + + // exported metrics, gathered from Alertmanager API + alertsReceived *prometheus.Desc + alertsInvalid *prometheus.Desc + + // exported metrics, gathered from Alertmanager PipelineBuilder + numNotifications *prometheus.Desc + numFailedNotifications *prometheus.Desc + notificationLatencySeconds *prometheus.Desc + + // exported metrics, gathered from Alertmanager nflog + nflogGCDuration *prometheus.Desc + nflogSnapshotDuration *prometheus.Desc + nflogSnapshotSize *prometheus.Desc + nflogQueriesTotal *prometheus.Desc + nflogQueryErrorsTotal *prometheus.Desc + nflogQueryDuration *prometheus.Desc + nflogPropagatedMessagesTotal *prometheus.Desc + + // exported metrics, gathered from Alertmanager Marker + markerAlerts *prometheus.Desc + + // exported metrics, gathered from Alertmanager Silences + silencesGCDuration *prometheus.Desc + silencesSnapshotDuration *prometheus.Desc + silencesSnapshotSize *prometheus.Desc + silencesQueriesTotal *prometheus.Desc + silencesQueryErrorsTotal *prometheus.Desc + silencesQueryDuration *prometheus.Desc + silences *prometheus.Desc + silencesPropagatedMessagesTotal *prometheus.Desc +} + +func newAlertmanagerMetrics() *alertmanagerMetrics { + return &alertmanagerMetrics{ + regs: map[string]*prometheus.Registry{}, + 
alertsReceived: prometheus.NewDesc( + "cortex_alertmanager_alerts_received_total", + "The total number of received alerts.", + nil, nil), + alertsInvalid: prometheus.NewDesc( + "cortex_alertmanager_alerts_invalid_total", + "The total number of received alerts that were invalid.", + nil, nil), + numNotifications: prometheus.NewDesc( + "cortex_alertmanager_notifications_total", + "The total number of attempted notifications.", + []string{"user"}, nil), + numFailedNotifications: prometheus.NewDesc( + "cortex_alertmanager_notifications_failed_total", + "The total number of failed notifications.", + []string{"user"}, nil), + notificationLatencySeconds: prometheus.NewDesc( + "cortex_alertmanager_notification_latency_seconds", + "The latency of notifications in seconds.", + nil, nil), + nflogGCDuration: prometheus.NewDesc( + "cortex_alertmanager_nflog_gc_duration_seconds", + "Duration of the last notification log garbage collection cycle.", + nil, nil), + nflogSnapshotDuration: prometheus.NewDesc( + "cortex_alertmanager_nflog_snapshot_duration_seconds", + "Duration of the last notification log snapshot.", + nil, nil), + nflogSnapshotSize: prometheus.NewDesc( + "cortex_alertmanager_nflog_snapshot_size_bytes", + "Size of the last notification log snapshot in bytes.", + nil, nil), + nflogQueriesTotal: prometheus.NewDesc( + "cortex_alertmanager_nflog_queries_total", + "Number of notification log queries were received.", + nil, nil), + nflogQueryErrorsTotal: prometheus.NewDesc( + "cortex_alertmanager_nflog_query_errors_total", + "Number notification log received queries that failed.", + nil, nil), + nflogQueryDuration: prometheus.NewDesc( + "cortex_alertmanager_nflog_query_duration_seconds", + "Duration of notification log query evaluation.", + nil, nil), + nflogPropagatedMessagesTotal: prometheus.NewDesc( + "cortex_alertmanager_nflog_gossip_messages_propagated_total", + "Number of received gossip messages that have been further gossiped.", + nil, nil), + markerAlerts: 
prometheus.NewDesc( + "cortex_alertmanager_alerts", + "How many alerts by state.", + []string{"user", "state"}, nil), + silencesGCDuration: prometheus.NewDesc( + "cortex_alertmanager_silences_gc_duration_seconds", + "Duration of the last silence garbage collection cycle.", + nil, nil), + silencesSnapshotDuration: prometheus.NewDesc( + "cortex_alertmanager_silences_snapshot_duration_seconds", + "Duration of the last silence snapshot.", + nil, nil), + silencesSnapshotSize: prometheus.NewDesc( + "cortex_alertmanager_silences_snapshot_size_bytes", + "Size of the last silence snapshot in bytes.", + nil, nil), + silencesQueriesTotal: prometheus.NewDesc( + "cortex_alertmanager_silences_queries_total", + "How many silence queries were received.", + nil, nil), + silencesQueryErrorsTotal: prometheus.NewDesc( + "cortex_alertmanager_silences_query_errors_total", + "How many silence received queries did not succeed.", + nil, nil), + silencesQueryDuration: prometheus.NewDesc( + "cortex_alertmanager_silences_query_duration_seconds", + "Duration of silence query evaluation.", + nil, nil), + silencesPropagatedMessagesTotal: prometheus.NewDesc( + "cortex_alertmanager_silences_gossip_messages_propagated_total", + "Number of received gossip messages that have been further gossiped.", + nil, nil), + silences: prometheus.NewDesc( + "cortex_alertmanager_silences", + "How many silences by state.", + []string{"user", "state"}, nil), + } +} + +func (m *alertmanagerMetrics) addUserRegistry(user string, reg *prometheus.Registry) { + m.regsMu.Lock() + m.regs[user] = reg + m.regsMu.Unlock() +} + +func (m *alertmanagerMetrics) registries() map[string]*prometheus.Registry { + regs := map[string]*prometheus.Registry{} + + m.regsMu.Lock() + defer m.regsMu.Unlock() + for uid, r := range m.regs { + regs[uid] = r + } + + return regs +} + +func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { + out <- m.alertsReceived + out <- m.alertsInvalid + out <- m.numNotifications + out <- 
m.numFailedNotifications + out <- m.notificationLatencySeconds + out <- m.nflogGCDuration + out <- m.nflogSnapshotDuration + out <- m.nflogSnapshotSize + out <- m.nflogQueriesTotal + out <- m.nflogQueryErrorsTotal + out <- m.nflogQueryDuration + out <- m.nflogPropagatedMessagesTotal + out <- m.markerAlerts + out <- m.silencesGCDuration + out <- m.silencesSnapshotDuration + out <- m.silencesSnapshotSize + out <- m.silencesQueriesTotal + out <- m.silencesQueryErrorsTotal + out <- m.silencesQueryDuration + out <- m.silences + out <- m.silencesPropagatedMessagesTotal +} + +func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { + data := util.BuildMetricFamiliesPerUserFromUserRegistries(m.registries()) + + data.SendSumOfCounters(out, m.alertsReceived, "alertmanager_alerts_received_total") + data.SendSumOfCounters(out, m.alertsInvalid, "alertmanager_alerts_invalid_total") + + data.SendSumOfCountersPerUser(out, m.numNotifications, "alertmanager_notifications_total") + data.SendSumOfCountersPerUser(out, m.numFailedNotifications, "alertmanager_notifications_failed_total") + data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds") + data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state") + + data.SendSumOfSummaries(out, m.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds") + data.SendSumOfSummaries(out, m.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds") + data.SendSumOfGauges(out, m.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes") + data.SendSumOfCounters(out, m.nflogQueriesTotal, "alertmanager_nflog_queries_total") + data.SendSumOfCounters(out, m.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total") + data.SendSumOfHistograms(out, m.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds") + data.SendSumOfCounters(out, m.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total") + + 
data.SendSumOfSummaries(out, m.silencesGCDuration, "alertmanager_silences_gc_duration_seconds") + data.SendSumOfSummaries(out, m.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds") + data.SendSumOfGauges(out, m.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes") + data.SendSumOfCounters(out, m.silencesQueriesTotal, "alertmanager_silences_queries_total") + data.SendSumOfCounters(out, m.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total") + data.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds") + data.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total") + data.SendSumOfGaugesPerUserWithLabels(out, m.silences, "alertmanager_silences", "state") +} diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go new file mode 100644 index 00000000000..bd893c646ba --- /dev/null +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -0,0 +1,421 @@ +package alertmanager + +import ( + "bytes" + "testing" + + "github.com/prometheus/alertmanager/types" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/require" +) + +var integrations = []string{ + "email", + "hipchat", + "pagerduty", + "wechat", + "pushover", + "slack", + "opsgenie", + "webhook", + "victorops", +} + +func TestAlertmanagerMetricsStore(t *testing.T) { + mainReg := prometheus.NewPedanticRegistry() + + alertmanangerMetrics := newAlertmanagerMetrics() + mainReg.MustRegister(alertmanangerMetrics) + alertmanangerMetrics.addUserRegistry("user1", populateAlertmanager(1)) + alertmanangerMetrics.addUserRegistry("user2", populateAlertmanager(10)) + alertmanangerMetrics.addUserRegistry("user3", populateAlertmanager(100)) + + //noinspection ALL + err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(` + # HELP 
cortex_alertmanager_alerts How many alerts by state. + # TYPE cortex_alertmanager_alerts gauge + cortex_alertmanager_alerts{state="active",user="user1"} 1 + cortex_alertmanager_alerts{state="active",user="user2"} 10 + cortex_alertmanager_alerts{state="active",user="user3"} 100 + cortex_alertmanager_alerts{state="suppressed",user="user1"} 2 + cortex_alertmanager_alerts{state="suppressed",user="user2"} 20 + cortex_alertmanager_alerts{state="suppressed",user="user3"} 200 + # HELP cortex_alertmanager_alerts_invalid_total The total number of received alerts that were invalid. + # TYPE cortex_alertmanager_alerts_invalid_total counter + cortex_alertmanager_alerts_invalid_total 222 + # HELP cortex_alertmanager_alerts_received_total The total number of received alerts. + # TYPE cortex_alertmanager_alerts_received_total counter + cortex_alertmanager_alerts_received_total 1110 + # HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle. + # TYPE cortex_alertmanager_nflog_gc_duration_seconds summary + cortex_alertmanager_nflog_gc_duration_seconds_sum 111 + cortex_alertmanager_nflog_gc_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_gossip_messages_propagated_total Number of received gossip messages that have been further gossiped. + # TYPE cortex_alertmanager_nflog_gossip_messages_propagated_total counter + cortex_alertmanager_nflog_gossip_messages_propagated_total 111 + # HELP cortex_alertmanager_nflog_queries_total Number of notification log queries were received. + # TYPE cortex_alertmanager_nflog_queries_total counter + cortex_alertmanager_nflog_queries_total 111 + # HELP cortex_alertmanager_nflog_query_duration_seconds Duration of notification log query evaluation. 
+ # TYPE cortex_alertmanager_nflog_query_duration_seconds histogram + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.005"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.01"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.025"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.05"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.1"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.25"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.5"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="1"} 1 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="2.5"} 1 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="5"} 1 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="10"} 2 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="+Inf"} 3 + cortex_alertmanager_nflog_query_duration_seconds_sum 111 + cortex_alertmanager_nflog_query_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_query_errors_total Number notification log received queries that failed. + # TYPE cortex_alertmanager_nflog_query_errors_total counter + cortex_alertmanager_nflog_query_errors_total 111 + # HELP cortex_alertmanager_nflog_snapshot_duration_seconds Duration of the last notification log snapshot. + # TYPE cortex_alertmanager_nflog_snapshot_duration_seconds summary + cortex_alertmanager_nflog_snapshot_duration_seconds_sum 111 + cortex_alertmanager_nflog_snapshot_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_snapshot_size_bytes Size of the last notification log snapshot in bytes. + # TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge + cortex_alertmanager_nflog_snapshot_size_bytes 111 + # HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds. 
+ # TYPE cortex_alertmanager_notification_latency_seconds histogram + cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 15 + cortex_alertmanager_notification_latency_seconds_bucket{le="5"} 21 + cortex_alertmanager_notification_latency_seconds_bucket{le="10"} 23 + cortex_alertmanager_notification_latency_seconds_bucket{le="15"} 25 + cortex_alertmanager_notification_latency_seconds_bucket{le="20"} 27 + cortex_alertmanager_notification_latency_seconds_bucket{le="+Inf"} 27 + cortex_alertmanager_notification_latency_seconds_sum 99.9 + cortex_alertmanager_notification_latency_seconds_count 27 + # HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications. + # TYPE cortex_alertmanager_notifications_failed_total counter + cortex_alertmanager_notifications_failed_total{user="user1"} 36 + cortex_alertmanager_notifications_failed_total{user="user2"} 360 + cortex_alertmanager_notifications_failed_total{user="user3"} 3600 + # HELP cortex_alertmanager_notifications_total The total number of attempted notifications. + # TYPE cortex_alertmanager_notifications_total counter + cortex_alertmanager_notifications_total{user="user1"} 36 + cortex_alertmanager_notifications_total{user="user2"} 360 + cortex_alertmanager_notifications_total{user="user3"} 3600 + # HELP cortex_alertmanager_silences How many silences by state. 
+ # TYPE cortex_alertmanager_silences gauge + cortex_alertmanager_silences{state="active",user="user1"} 1 + cortex_alertmanager_silences{state="active",user="user2"} 10 + cortex_alertmanager_silences{state="active",user="user3"} 100 + cortex_alertmanager_silences{state="expired",user="user1"} 2 + cortex_alertmanager_silences{state="expired",user="user2"} 20 + cortex_alertmanager_silences{state="expired",user="user3"} 200 + cortex_alertmanager_silences{state="pending",user="user1"} 3 + cortex_alertmanager_silences{state="pending",user="user2"} 30 + cortex_alertmanager_silences{state="pending",user="user3"} 300 + # HELP cortex_alertmanager_silences_gc_duration_seconds Duration of the last silence garbage collection cycle. + # TYPE cortex_alertmanager_silences_gc_duration_seconds summary + cortex_alertmanager_silences_gc_duration_seconds_sum 111 + cortex_alertmanager_silences_gc_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_gossip_messages_propagated_total Number of received gossip messages that have been further gossiped. + # TYPE cortex_alertmanager_silences_gossip_messages_propagated_total counter + cortex_alertmanager_silences_gossip_messages_propagated_total 111 + # HELP cortex_alertmanager_silences_queries_total How many silence queries were received. + # TYPE cortex_alertmanager_silences_queries_total counter + cortex_alertmanager_silences_queries_total 111 + # HELP cortex_alertmanager_silences_query_duration_seconds Duration of silence query evaluation. 
+ # TYPE cortex_alertmanager_silences_query_duration_seconds histogram + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.005"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.01"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.025"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.05"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.1"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.25"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.5"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="1"} 1 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="2.5"} 1 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="5"} 1 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="10"} 2 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="+Inf"} 3 + cortex_alertmanager_silences_query_duration_seconds_sum 111 + cortex_alertmanager_silences_query_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_query_errors_total How many silence received queries did not succeed. + # TYPE cortex_alertmanager_silences_query_errors_total counter + cortex_alertmanager_silences_query_errors_total 111 + # HELP cortex_alertmanager_silences_snapshot_duration_seconds Duration of the last silence snapshot. + # TYPE cortex_alertmanager_silences_snapshot_duration_seconds summary + cortex_alertmanager_silences_snapshot_duration_seconds_sum 111 + cortex_alertmanager_silences_snapshot_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes. 
+ # TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge + cortex_alertmanager_silences_snapshot_size_bytes 111 + +`)) + require.NoError(t, err) +} + +func populateAlertmanager(base float64) *prometheus.Registry { + reg := prometheus.NewRegistry() + s := newSilenceMetrics(reg) + s.gcDuration.Observe(base) + s.snapshotDuration.Observe(base) + s.snapshotSize.Add(base) + s.queriesTotal.Add(base) + s.queryErrorsTotal.Add(base) + s.queryDuration.Observe(base) + s.propagatedMessagesTotal.Add(base) + s.silencesActive.Set(base) + s.silencesExpired.Set(base * 2) + s.silencesPending.Set(base * 3) + + n := newNflogMetrics(reg) + n.gcDuration.Observe(base) + n.snapshotDuration.Observe(base) + n.snapshotSize.Add(base) + n.queriesTotal.Add(base) + n.queryErrorsTotal.Add(base) + n.queryDuration.Observe(base) + n.propagatedMessagesTotal.Add(base) + + nm := newNotifyMetrics(reg) + for i, integration := range integrations { + nm.numNotifications.WithLabelValues(integration).Add(base * float64(i)) + nm.numFailedNotifications.WithLabelValues(integration).Add(base * float64(i)) + nm.notificationLatencySeconds.WithLabelValues(integration).Observe(base * float64(i) * 0.025) + } + + m := newMarkerMetrics(reg) + m.alerts.WithLabelValues(string(types.AlertStateActive)).Add(base) + m.alerts.WithLabelValues(string(types.AlertStateSuppressed)).Add(base * 2) + + v1APIMetrics := newAPIMetrics("v1", reg) + v1APIMetrics.firing.Add(base * 2) + v1APIMetrics.invalid.Add(base) + v1APIMetrics.resolved.Add(base * 3) + + v2APIMetrics := newAPIMetrics("v2", reg) + v2APIMetrics.firing.Add(base * 2) + v2APIMetrics.invalid.Add(base) + v2APIMetrics.resolved.Add(base * 3) + + return reg +} + +// Copied from github.com/prometheus/alertmanager/nflog/nflog.go +type nflogMetrics struct { + gcDuration prometheus.Summary + snapshotDuration prometheus.Summary + snapshotSize prometheus.Gauge + queriesTotal prometheus.Counter + queryErrorsTotal prometheus.Counter + queryDuration prometheus.Histogram + 
+ propagatedMessagesTotal prometheus.Counter +} + +func newNflogMetrics(r prometheus.Registerer) *nflogMetrics { + m := &nflogMetrics{} + + m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_nflog_gc_duration_seconds", + Help: "Duration of the last notification log garbage collection cycle.", + Objectives: map[float64]float64{}, + }) + m.snapshotDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_nflog_snapshot_duration_seconds", + Help: "Duration of the last notification log snapshot.", + Objectives: map[float64]float64{}, + }) + m.snapshotSize = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_nflog_snapshot_size_bytes", + Help: "Size of the last notification log snapshot in bytes.", + }) + m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_nflog_queries_total", + Help: "Number of notification log queries were received.", + }) + m.queryErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_nflog_query_errors_total", + Help: "Number notification log received queries that failed.", + }) + m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "alertmanager_nflog_query_duration_seconds", + Help: "Duration of notification log query evaluation.", + }) + m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_nflog_gossip_messages_propagated_total", + Help: "Number of received gossip messages that have been further gossiped.", + }) + + if r != nil { + r.MustRegister( + m.gcDuration, + m.snapshotDuration, + m.snapshotSize, + m.queriesTotal, + m.queryErrorsTotal, + m.queryDuration, + m.propagatedMessagesTotal, + ) + } + return m +} + +// Copied from github.com/prometheus/alertmanager/silence/silence.go +type silenceMetrics struct { + gcDuration prometheus.Summary + snapshotDuration prometheus.Summary + snapshotSize prometheus.Gauge + queriesTotal prometheus.Counter + queryErrorsTotal 
prometheus.Counter + queryDuration prometheus.Histogram + silencesActive prometheus.Gauge + silencesPending prometheus.Gauge + silencesExpired prometheus.Gauge + propagatedMessagesTotal prometheus.Counter +} + +func newSilenceMetrics(r prometheus.Registerer) *silenceMetrics { + m := &silenceMetrics{} + + m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_silences_gc_duration_seconds", + Help: "Duration of the last silence garbage collection cycle.", + Objectives: map[float64]float64{}, + }) + m.snapshotDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_silences_snapshot_duration_seconds", + Help: "Duration of the last silence snapshot.", + Objectives: map[float64]float64{}, + }) + m.snapshotSize = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences_snapshot_size_bytes", + Help: "Size of the last silence snapshot in bytes.", + }) + m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_queries_total", + Help: "How many silence queries were received.", + }) + m.queryErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_query_errors_total", + Help: "How many silence received queries did not succeed.", + }) + m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "alertmanager_silences_query_duration_seconds", + Help: "Duration of silence query evaluation.", + }) + m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_gossip_messages_propagated_total", + Help: "Number of received gossip messages that have been further gossiped.", + }) + m.silencesActive = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences", + Help: "How many silences by state.", + ConstLabels: prometheus.Labels{"state": string(types.SilenceStateActive)}, + }) + m.silencesPending = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences", + 
+ Help: "How many silences by state.", + ConstLabels: prometheus.Labels{"state": string(types.SilenceStatePending)}, + }) + m.silencesExpired = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences", + Help: "How many silences by state.", + ConstLabels: prometheus.Labels{"state": string(types.SilenceStateExpired)}, + }) + + if r != nil { + r.MustRegister( + m.gcDuration, + m.snapshotDuration, + m.snapshotSize, + m.queriesTotal, + m.queryErrorsTotal, + m.queryDuration, + m.silencesActive, + m.silencesPending, + m.silencesExpired, + m.propagatedMessagesTotal, + ) + } + return m +} + +// Copied from github.com/prometheus/alertmanager/notify/notify.go +type notifyMetrics struct { + numNotifications *prometheus.CounterVec + numFailedNotifications *prometheus.CounterVec + notificationLatencySeconds *prometheus.HistogramVec +} + +func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics { + m := &notifyMetrics{ + numNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "alertmanager", + Name: "notifications_total", + Help: "The total number of attempted notifications.", + }, []string{"integration"}), + numFailedNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "alertmanager", + Name: "notifications_failed_total", + Help: "The total number of failed notifications.", + }, []string{"integration"}), + notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "alertmanager", + Name: "notification_latency_seconds", + Help: "The latency of notifications in seconds.", + Buckets: []float64{1, 5, 10, 15, 20}, + }, []string{"integration"}), + } + for _, integration := range integrations { + m.numNotifications.WithLabelValues(integration) + m.numFailedNotifications.WithLabelValues(integration) + m.notificationLatencySeconds.WithLabelValues(integration) + } + r.MustRegister(m.numNotifications, m.numFailedNotifications, m.notificationLatencySeconds) + return m +} + +type markerMetrics struct { 
+ alerts *prometheus.GaugeVec +} + +func newMarkerMetrics(r prometheus.Registerer) *markerMetrics { + m := &markerMetrics{ + alerts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "alertmanager_alerts", + Help: "How many alerts by state.", + }, []string{"state"}), + } + + r.MustRegister(m.alerts) + return m +} + +// Copied from github.com/prometheus/alertmanager/api/metrics/metrics.go +type apiMetrics struct { + firing prometheus.Counter + resolved prometheus.Counter + invalid prometheus.Counter +} + +func newAPIMetrics(version string, r prometheus.Registerer) *apiMetrics { + numReceivedAlerts := prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "alertmanager_alerts_received_total", + Help: "The total number of received alerts.", + ConstLabels: prometheus.Labels{"version": version}, + }, []string{"status"}) + numInvalidAlerts := prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_alerts_invalid_total", + Help: "The total number of received alerts that were invalid.", + ConstLabels: prometheus.Labels{"version": version}, + }) + if r != nil { + r.MustRegister(numReceivedAlerts, numInvalidAlerts) + } + return &apiMetrics{ + firing: numReceivedAlerts.WithLabelValues("firing"), + resolved: numReceivedAlerts.WithLabelValues("resolved"), + invalid: numInvalidAlerts, + } +} diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 69df130f2e9..dc22ea3bcfa 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -140,6 +140,8 @@ type MultitenantAlertmanager struct { alertmanagersMtx sync.Mutex alertmanagers map[string]*Alertmanager + metrics *alertmanagerMetrics + latestConfig configs.ID latestMutex sync.RWMutex @@ -150,7 +152,7 @@ type MultitenantAlertmanager struct { } // NewMultitenantAlertmanager creates a new MultitenantAlertmanager. 
-func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, cfgCfg configs_client.Config) (*MultitenantAlertmanager, error) { +func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, cfgCfg configs_client.Config, registerer prometheus.Registerer) (*MultitenantAlertmanager, error) { err := os.MkdirAll(cfg.DataDir, 0777) if err != nil { return nil, fmt.Errorf("unable to create Alertmanager data directory %q: %s", cfg.DataDir, err) @@ -177,7 +179,7 @@ func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, cfgCfg confi if cfg.ClusterBindAddr != "" { peer, err = cluster.Create( log.With(util.Logger, "component", "cluster"), - prometheus.DefaultRegisterer, + registerer, cfg.ClusterBindAddr, cfg.ClusterAdvertiseAddr, cfg.Peers, @@ -204,10 +206,16 @@ func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, cfgCfg confi fallbackConfig: string(fallbackConfig), cfgs: map[string]configs.Config{}, alertmanagers: map[string]*Alertmanager{}, + metrics: newAlertmanagerMetrics(), peer: peer, stop: make(chan struct{}), done: make(chan struct{}), } + + if registerer != nil { + registerer.MustRegister(am.metrics) + } + return am, nil } @@ -431,6 +439,8 @@ func (am *MultitenantAlertmanager) deleteUser(userID string) { } func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amconfig.Config) (*Alertmanager, error) { + reg := prometheus.NewRegistry() + am.metrics.addUserRegistry(userID, reg) newAM, err := New(&Config{ UserID: userID, DataDir: am.cfg.DataDir, @@ -439,7 +449,7 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amco PeerTimeout: am.cfg.PeerTimeout, Retention: am.cfg.Retention, ExternalURL: am.cfg.ExternalURL.URL, - }) + }, reg) if err != nil { return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err) } diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 2ef73b6e021..06fd1f77fa1 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ 
-478,7 +478,7 @@ func (t *Cortex) stopConfigs() error { } func (t *Cortex) initAlertmanager(cfg *Config) (err error) { - t.alertmanager, err = alertmanager.NewMultitenantAlertmanager(&cfg.Alertmanager, cfg.ConfigStore) + t.alertmanager, err = alertmanager.NewMultitenantAlertmanager(&cfg.Alertmanager, cfg.ConfigStore, prometheus.DefaultRegisterer) if err != nil { return err } diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index 234b154ebd6..7b2cf4a2fbd 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -162,6 +162,22 @@ func (d MetricFamiliesPerUser) SendSumOfGaugesWithLabels(out chan<- prometheus.M d.sumOfSingleValuesWithLabels(gauge, gaugeValue, labelNames).WriteToMetricChannel(out, desc, prometheus.GaugeValue) } +// SendSumOfGaugesPerUserWithLabels sends one gauge per user and per combination of values of the provided label names, summing the matching gauges of that user. +// This function assumes that `user` is the first label on the provided metric Desc, followed by the provided label names. +func (d MetricFamiliesPerUser) SendSumOfGaugesPerUserWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, metric string, labelNames ...string) { + for user, userMetrics := range d { + metricsPerLabelValue := getMetricsWithLabelNames(userMetrics[metric], labelNames) + for _, mlv := range metricsPerLabelValue { + var val float64 + labels := append([]string{user}, mlv.labelValues...) + for _, m := range mlv.metrics { + val += gaugeValue(m) + } + out <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, val, labels...) 
+ } + } +} + func (d MetricFamiliesPerUser) sumOfSingleValuesWithLabels(metric string, fn func(*dto.Metric) float64, labelNames []string) singleValueWithLabelsMap { result := singleValueWithLabelsMap{} for _, userMetrics := range d { diff --git a/pkg/util/metrics_helper_test.go b/pkg/util/metrics_helper_test.go index 2b4b3f6ab81..70af8af496b 100644 --- a/pkg/util/metrics_helper_test.go +++ b/pkg/util/metrics_helper_test.go @@ -4,6 +4,7 @@ import ( "testing" "github.com/gogo/protobuf/proto" + "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/require" ) @@ -81,3 +82,110 @@ func makeLabels(namesAndValues ...string) []*dto.LabelPair { return out } + +// TestSendSumOfGaugesPerUserWithLabels tests to ensure multiple metrics for the same user with a matching label are +// summed correctly +func TestSendSumOfGaugesPerUserWithLabels(t *testing.T) { + metricName := "test_metric" + metricType := dto.MetricType_GAUGE + labelOneName := "label_one" + labelTwoName := "label_two" + labelValueOne := "a" + labelValueTwo := "b" + ts := int64(1000) + val := float64(100) + + desc := prometheus.NewDesc( + metricName, + "", + []string{"user", labelOneName}, nil) + + baseFamily := MetricFamilyMap{ + "test_metric": &dto.MetricFamily{ + Name: &metricName, + Type: &metricType, + Metric: []*dto.Metric{ + { + Label: []*dto.LabelPair{ + { + Name: &labelOneName, + Value: &labelValueOne, + }, + { + Name: &labelTwoName, + Value: &labelValueOne, + }, + }, + Gauge: &dto.Gauge{ + Value: &val, + }, + TimestampMs: &ts, + }, + { + Label: []*dto.LabelPair{ + { + Name: &labelOneName, + Value: &labelValueOne, + }, + { + Name: &labelTwoName, + Value: &labelValueTwo, + }, + }, + Gauge: &dto.Gauge{ + Value: &val, + }, + TimestampMs: &ts, + }, + }, + }, + } + + m := MetricFamiliesPerUser{ + "user1": baseFamily, + "user2": baseFamily, + } + out := make(chan prometheus.Metric, 10) + + go func() { + m.SendSumOfGaugesPerUserWithLabels(out, 
desc, metricName, labelOneName) + close(out) + }() + + userOneMet := <-out + userOneBuf := &dto.Metric{} + err := userOneMet.Write(userOneBuf) + require.NoError(t, err) + userOneSummedVal := userOneBuf.GetGauge().GetValue() + require.Equal(t, val*2, userOneSummedVal) + for _, l := range userOneBuf.GetLabel() { + switch l.GetName() { + case "user": + require.Equal(t, "user1", l.GetValue()) + case labelOneName: + require.Equal(t, labelValueOne, l.GetValue()) + default: + require.FailNow(t, "unexpected label="+l.GetName()) + } + } + + userTwoMet := <-out + userTwoBuf := &dto.Metric{} + err = userTwoMet.Write(userTwoBuf) + require.NoError(t, err) + userTwoSummedVal := userTwoBuf.GetGauge().GetValue() + require.Equal(t, val*2, userTwoSummedVal) + for _, l := range userTwoBuf.GetLabel() { + switch l.GetName() { + case "user": + require.Equal(t, "user2", l.GetValue()) + case labelOneName: + require.Equal(t, labelValueOne, l.GetValue()) + default: + require.FailNow(t, "unexpected label="+l.GetName()) + } + } + + _, closed := <-out + require.False(t, closed) +}