From 380f407e8bcbfe2c213dd2fa7838fefc0af760bd Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Tue, 11 Feb 2020 23:49:09 -0500 Subject: [PATCH 1/7] add per-tenant alertmanager metrics Signed-off-by: Jacob Lisi --- CHANGELOG.md | 1 + go.sum | 3 + pkg/alertmanager/alertmanager.go | 66 ++- pkg/alertmanager/alertmanager_metrics.go | 210 +++++++++ pkg/alertmanager/alertmanager_metrics_test.go | 421 ++++++++++++++++++ pkg/alertmanager/multitenant.go | 32 +- pkg/alertmanager/multitenant_test.go | 31 +- pkg/cortex/modules.go | 2 +- pkg/util/metrics_helper.go | 18 + pkg/util/metrics_helper_test.go | 81 ++++ 10 files changed, 842 insertions(+), 23 deletions(-) create mode 100644 pkg/alertmanager/alertmanager_metrics.go create mode 100644 pkg/alertmanager/alertmanager_metrics_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 1103805f88e..278ae55f652 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ * [FEATURE] Add /config HTTP endpoint which exposes the current Cortex configuration as YAML. #2165 * [FEATURE] Allow Prometheus remote write directly to ingesters. #1491 * [FEATURE] Add flag `-experimental.tsdb.stripe-size` to expose TSDB stripe size option. #2185 +* [ENHANCEMENT] Alertmanager: Expose Per-tenant alertmanager metrics #2124 * [ENHANCEMENT] Add `status` label to `cortex_alertmanager_configs` metric to gauge the number of valid and invalid configs. #2125 * [ENHANCEMENT] Cassandra Authentication: added the `custom_authenticators` config option that allows users to authenticate with cassandra clusters using password authenticators that are not approved by default in [gocql](https://github.com/gocql/gocql/blob/81b8263d9fe526782a588ef94d3fa5c6148e5d67/conn.go#L27) #2093 * [ENHANCEMENT] Experimental TSDB: Export TSDB Syncer metrics from Compactor component, they are prefixed with `cortex_compactor_`. 
#2023 diff --git a/go.sum b/go.sum index 87e824fcd96..cbc4a88eb4b 100644 --- a/go.sum +++ b/go.sum @@ -225,6 +225,7 @@ github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMo github.com/fsouza/fake-gcs-server v1.7.0 h1:Un0BXUXrRWYSmYyC1Rqm2e2WJfTPyDy/HGMz31emTi8= github.com/fsouza/fake-gcs-server v1.7.0/go.mod h1:5XIRs4YvwNbNoz+1JF8j6KLAyDh7RHGAyAK3EP2EsNk= github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= +github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/globalsign/mgo v0.0.0-20180905125535-1ca0a4f7cbcb/go.mod h1:xkRDCp4j0OGD1HRkm4kmhM+pmpv3AKq5SU7GMg4oO/Q= github.com/globalsign/mgo v0.0.0-20181015135952-eeefdecb41b8/go.mod h1:xkRDCp4j0OGD1HRkm4kmhM+pmpv3AKq5SU7GMg4oO/Q= @@ -361,6 +362,7 @@ github.com/google/martian v2.1.0+incompatible h1:/CP5g8u/VJHijgedC/Legn3BAbAaWPg github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20190723021845-34ac40c74b70 h1:XTnP8fJpa4Kvpw2qARB4KS9izqxPS0Sd92cDlY3uk+w= github.com/google/pprof v0.0.0-20190723021845-34ac40c74b70/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -461,6 +463,7 @@ github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= +github.com/influxdata/influxdb v1.7.7 h1:UvNzAPfBrKMENVbQ4mr4ccA9sW+W1Ihl0Yh1s0BiVAg= github.com/influxdata/influxdb v1.7.7/go.mod h1:qZna6X/4elxqT3yI9iZYdZrWWdeFOOprn86kgg4+IzY= github.com/jackc/fake v0.0.0-20150926172116-812a484cc733/go.mod h1:WrMFNQdiFJ80sQsxDoMokWK1W5TQtxBFNpzWTD84ibQ= github.com/jackc/pgx v3.2.0+incompatible/go.mod h1:0ZGrqGqkRlliWnWB4zKnWtjbSWbGkVEFm4TeybAXq+I= diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 04f534c1388..cafb95ea38a 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -10,6 +10,7 @@ import ( "time" "github.com/go-kit/kit/log" + "github.com/go-kit/kit/log/level" "github.com/prometheus/alertmanager/api" "github.com/prometheus/alertmanager/cluster" "github.com/prometheus/alertmanager/config" @@ -66,6 +67,9 @@ type Alertmanager struct { wg sync.WaitGroup mux *http.ServeMux registry *prometheus.Registry + + activeMtx sync.Mutex + active bool } var webReload = make(chan chan error) @@ -81,17 +85,16 @@ func init() { } // New creates a new Alertmanager. 
-func New(cfg *Config) (*Alertmanager, error) { +func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am := &Alertmanager{ - cfg: cfg, - logger: log.With(cfg.Logger, "user", cfg.UserID), - stop: make(chan struct{}), + cfg: cfg, + logger: log.With(cfg.Logger, "user", cfg.UserID), + stop: make(chan struct{}), + active: false, + activeMtx: sync.Mutex{}, } - // TODO(cortex): Build a registry that can merge metrics from multiple users. - // For now, these metrics are ignored, as we can't register the same - // metric twice with a single registry. - am.registry = prometheus.NewRegistry() + am.registry = reg am.wg.Add(1) nflogID := fmt.Sprintf("nflog:%s", cfg.UserID) @@ -173,7 +176,12 @@ func clusterWait(p *cluster.Peer, timeout time.Duration) func() time.Duration { // ApplyConfig applies a new configuration to an Alertmanager. func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config) error { - templateFiles := make([]string, len(conf.Templates)) + // Ensure the alertmanager is set to active + am.activeMtx.Lock() + am.active = true + am.activeMtx.Unlock() + + templateFiles := make([]string, len(conf.Templates), len(conf.Templates)) if len(conf.Templates) > 0 { for i, t := range conf.Templates { templateFiles[i] = filepath.Join(am.cfg.DataDir, "templates", userID, t) @@ -236,9 +244,47 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config) error { return nil } +// Pause running jobs in the alertmanager that are able to be restarted and sets +// to inactive +func (am *Alertmanager) Pause() { + // Set to inactive + am.activeMtx.Lock() + am.active = false + am.activeMtx.Unlock() + + // Ensure inhibitor is set before being called + if am.inhibitor != nil { + am.inhibitor.Stop() + } + + // Ensure dispatcher is set before being called + if am.dispatcher != nil { + am.dispatcher.Stop() + } + + // Remove all of the active silences from the alertmanager + silences, _, err := am.silences.Query() + if err != nil { + level.Warn(am.logger).Log("msg", "unable to retrieve silences for removal", "err", err) + } + for _, si := range silences { + err = am.silences.Expire(si.Id) + if err != nil { + level.Warn(am.logger).Log("msg", "unable to remove silence", "err", err, "silence", si.Id) + } + } +} + // Stop stops the Alertmanager. func (am *Alertmanager) Stop() { - am.dispatcher.Stop() + if am.inhibitor != nil { + am.inhibitor.Stop() + } + + if am.dispatcher != nil { + am.dispatcher.Stop() + } + am.alerts.Close() close(am.stop) am.wg.Wait() diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go new file mode 100644 index 00000000000..723f1355d74 --- /dev/null +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -0,0 +1,210 @@ +package alertmanager + +import ( + "sync" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/cortexproject/cortex/pkg/util" +) + +// This struct aggregates metrics exported by Alertmanager +// and re-exports those aggregates as Cortex metrics. 
+type alertmanagerMetrics struct { + // Maps userID -> registry + regsMu sync.Mutex + regs map[string]*prometheus.Registry + + // exported metrics, gathered from Alertmanager API + alertsReceived *prometheus.Desc + alertsInvalid *prometheus.Desc + + // exported metrics, gathered from Alertmanager PipelineBuilder + numNotifications *prometheus.Desc + numFailedNotifications *prometheus.Desc + notificationLatencySeconds *prometheus.Desc + + // exported metrics, gathered from Alertmanager nflog + nflogGCDuration *prometheus.Desc + nflogSnapshotDuration *prometheus.Desc + nflogSnapshotSize *prometheus.Desc + nflogQueriesTotal *prometheus.Desc + nflogQueryErrorsTotal *prometheus.Desc + nflogQueryDuration *prometheus.Desc + nflogPropagatedMessagesTotal *prometheus.Desc + + // exported metrics, gathered from Alertmanager Marker + markerAlerts *prometheus.Desc + + // exported metrics, gathered from Alertmanager Silences + silencesGCDuration *prometheus.Desc + silencesSnapshotDuration *prometheus.Desc + silencesSnapshotSize *prometheus.Desc + silencesQueriesTotal *prometheus.Desc + silencesQueryErrorsTotal *prometheus.Desc + silencesQueryDuration *prometheus.Desc + silences *prometheus.Desc + silencesPropagatedMessagesTotal *prometheus.Desc +} + +func newAlertmanagerMetrics() *alertmanagerMetrics { + return &alertmanagerMetrics{ + regs: map[string]*prometheus.Registry{}, + regsMu: sync.Mutex{}, + alertsReceived: prometheus.NewDesc( + "cortex_alertmanager_alerts_received_total", + "The total number of received alerts.", + nil, nil), + alertsInvalid: prometheus.NewDesc( + "cortex_alertmanager_alerts_invalid_total", + "The total number of received alerts that were invalid.", + nil, nil), + numNotifications: prometheus.NewDesc( + "cortex_alertmanager_notifications_total", + "The total number of attempted notifications.", + []string{"user"}, nil), + numFailedNotifications: prometheus.NewDesc( + "cortex_alertmanager_notifications_failed_total", + "The total number of failed notifications.", + []string{"user"}, nil), + notificationLatencySeconds: prometheus.NewDesc( + "cortex_alertmanager_notification_latency_seconds", + "The latency of notifications in seconds.", + nil, nil), + nflogGCDuration: prometheus.NewDesc( + "cortex_alertmanager_nflog_gc_duration_seconds", + "Duration of the last notification log garbage collection cycle.", + nil, nil), + nflogSnapshotDuration: prometheus.NewDesc( + "cortex_alertmanager_nflog_snapshot_duration_seconds", + "Duration of the last notification log snapshot.", + nil, nil), + nflogSnapshotSize: prometheus.NewDesc( + "cortex_alertmanager_nflog_snapshot_size_bytes", + "Size of the last notification log snapshot in bytes.", + nil, nil), + nflogQueriesTotal: prometheus.NewDesc( + "cortex_alertmanager_nflog_queries_total", + "Number of notification log queries were received.", + nil, nil), + nflogQueryErrorsTotal: prometheus.NewDesc( + "cortex_alertmanager_nflog_query_errors_total", + "Number notification log received queries that failed.", + nil, nil), + nflogQueryDuration: prometheus.NewDesc( + "cortex_alertmanager_nflog_query_duration_seconds", + "Duration of notification log query evaluation.", + nil, nil), + nflogPropagatedMessagesTotal: prometheus.NewDesc( + "cortex_alertmanager_nflog_gossip_messages_propagated_total", + "Number of received gossip messages that have been further gossiped.", + nil, nil), + markerAlerts: prometheus.NewDesc( + "cortex_alertmanager_alerts", + "How many alerts by state.", + []string{"user", "state"}, nil), + silencesGCDuration: 
prometheus.NewDesc( + "cortex_alertmanager_silences_gc_duration_seconds", + "Duration of the last silence garbage collection cycle.", + nil, nil), + silencesSnapshotDuration: prometheus.NewDesc( + "cortex_alertmanager_silences_snapshot_duration_seconds", + "Duration of the last silence snapshot.", + nil, nil), + silencesSnapshotSize: prometheus.NewDesc( + "cortex_alertmanager_silences_snapshot_size_bytes", + "Size of the last silence snapshot in bytes.", + nil, nil), + silencesQueriesTotal: prometheus.NewDesc( + "cortex_alertmanager_silences_queries_total", + "How many silence queries were received.", + nil, nil), + silencesQueryErrorsTotal: prometheus.NewDesc( + "cortex_alertmanager_silences_query_errors_total", + "How many silence received queries did not succeed.", + nil, nil), + silencesQueryDuration: prometheus.NewDesc( + "cortex_alertmanager_silences_query_duration_seconds", + "Duration of silence query evaluation.", + nil, nil), + silencesPropagatedMessagesTotal: prometheus.NewDesc( + "cortex_alertmanager_silences_gossip_messages_propagated_total", + "Number of received gossip messages that have been further gossiped.", + nil, nil), + silences: prometheus.NewDesc( + "cortex_alertmanager_silences", + "How many silences by state.", + []string{"user", "state"}, nil), + } +} + +func (m *alertmanagerMetrics) addUserRegistry(user string, reg *prometheus.Registry) { + m.regsMu.Lock() + m.regs[user] = reg + m.regsMu.Unlock() +} + +func (m *alertmanagerMetrics) registries() map[string]*prometheus.Registry { + regs := map[string]*prometheus.Registry{} + + m.regsMu.Lock() + defer m.regsMu.Unlock() + for uid, r := range m.regs { + regs[uid] = r + } + + return regs +} + +func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { + out <- m.alertsReceived + out <- m.alertsInvalid + out <- m.numNotifications + out <- m.numFailedNotifications + out <- m.notificationLatencySeconds + out <- m.nflogGCDuration + out <- m.nflogSnapshotDuration + out <- m.nflogSnapshotSize + out <- m.nflogQueriesTotal + out <- m.nflogQueryErrorsTotal + out <- m.nflogQueryDuration + out <- m.nflogPropagatedMessagesTotal + out <- m.markerAlerts + out <- m.silencesGCDuration + out <- m.silencesSnapshotDuration + out <- m.silencesSnapshotSize + out <- m.silencesQueriesTotal + out <- m.silencesQueryErrorsTotal + out <- m.silencesQueryDuration + out <- m.silences + out <- m.silencesPropagatedMessagesTotal +} + +func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { + data := util.BuildMetricFamiliesPerUserFromUserRegistries(m.registries()) + + data.SendSumOfCounters(out, m.alertsReceived, "alertmanager_alerts_received_total") + data.SendSumOfCounters(out, m.alertsInvalid, "alertmanager_alerts_invalid_total") + + data.SendSumOfCountersPerUser(out, m.numNotifications, "alertmanager_notifications_total") + data.SendSumOfCountersPerUser(out, m.numFailedNotifications, "alertmanager_notifications_failed_total") + data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds") + data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state") + + data.SendSumOfSummaries(out, m.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds") + data.SendSumOfSummaries(out, m.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds") + data.SendSumOfGauges(out, m.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes") + data.SendSumOfCounters(out, m.nflogQueriesTotal, "alertmanager_nflog_queries_total") + data.SendSumOfCounters(out, 
m.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total") + data.SendSumOfHistograms(out, m.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds") + data.SendSumOfCounters(out, m.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total") + + data.SendSumOfSummaries(out, m.silencesGCDuration, "alertmanager_silences_gc_duration_seconds") + data.SendSumOfSummaries(out, m.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds") + data.SendSumOfGauges(out, m.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes") + data.SendSumOfCounters(out, m.silencesQueriesTotal, "alertmanager_silences_queries_total") + data.SendSumOfCounters(out, m.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total") + data.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds") + data.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total") + data.SendSumOfGaugesPerUserWithLabels(out, m.silences, "alertmanager_silences", "state") +} diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go new file mode 100644 index 00000000000..bd893c646ba --- /dev/null +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -0,0 +1,421 @@ +package alertmanager + +import ( + "bytes" + "testing" + + "github.com/prometheus/alertmanager/types" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/require" +) + +var integrations = []string{ + "email", + "hipchat", + "pagerduty", + "wechat", + "pushover", + "slack", + "opsgenie", + "webhook", + "victorops", +} + +func TestAlertmanagerMetricsStore(t *testing.T) { + mainReg := prometheus.NewPedanticRegistry() + + alertmanangerMetrics := newAlertmanagerMetrics() + mainReg.MustRegister(alertmanangerMetrics) + alertmanangerMetrics.addUserRegistry("user1", populateAlertmanager(1)) + alertmanangerMetrics.addUserRegistry("user2", populateAlertmanager(10)) + alertmanangerMetrics.addUserRegistry("user3", populateAlertmanager(100)) + + //noinspection ALL + err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(` + # HELP cortex_alertmanager_alerts How many alerts by state. + # TYPE cortex_alertmanager_alerts gauge + cortex_alertmanager_alerts{state="active",user="user1"} 1 + cortex_alertmanager_alerts{state="active",user="user2"} 10 + cortex_alertmanager_alerts{state="active",user="user3"} 100 + cortex_alertmanager_alerts{state="suppressed",user="user1"} 2 + cortex_alertmanager_alerts{state="suppressed",user="user2"} 20 + cortex_alertmanager_alerts{state="suppressed",user="user3"} 200 + # HELP cortex_alertmanager_alerts_invalid_total The total number of received alerts that were invalid. + # TYPE cortex_alertmanager_alerts_invalid_total counter + cortex_alertmanager_alerts_invalid_total 222 + # HELP cortex_alertmanager_alerts_received_total The total number of received alerts. + # TYPE cortex_alertmanager_alerts_received_total counter + cortex_alertmanager_alerts_received_total 1110 + # HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle. 
+ # TYPE cortex_alertmanager_nflog_gc_duration_seconds summary + cortex_alertmanager_nflog_gc_duration_seconds_sum 111 + cortex_alertmanager_nflog_gc_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_gossip_messages_propagated_total Number of received gossip messages that have been further gossiped. + # TYPE cortex_alertmanager_nflog_gossip_messages_propagated_total counter + cortex_alertmanager_nflog_gossip_messages_propagated_total 111 + # HELP cortex_alertmanager_nflog_queries_total Number of notification log queries were received. + # TYPE cortex_alertmanager_nflog_queries_total counter + cortex_alertmanager_nflog_queries_total 111 + # HELP cortex_alertmanager_nflog_query_duration_seconds Duration of notification log query evaluation. + # TYPE cortex_alertmanager_nflog_query_duration_seconds histogram + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.005"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.01"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.025"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.05"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.1"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.25"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="0.5"} 0 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="1"} 1 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="2.5"} 1 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="5"} 1 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="10"} 2 + cortex_alertmanager_nflog_query_duration_seconds_bucket{le="+Inf"} 3 + cortex_alertmanager_nflog_query_duration_seconds_sum 111 + cortex_alertmanager_nflog_query_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_query_errors_total Number notification log received queries that failed. + # TYPE cortex_alertmanager_nflog_query_errors_total counter + cortex_alertmanager_nflog_query_errors_total 111 + # HELP cortex_alertmanager_nflog_snapshot_duration_seconds Duration of the last notification log snapshot. + # TYPE cortex_alertmanager_nflog_snapshot_duration_seconds summary + cortex_alertmanager_nflog_snapshot_duration_seconds_sum 111 + cortex_alertmanager_nflog_snapshot_duration_seconds_count 3 + # HELP cortex_alertmanager_nflog_snapshot_size_bytes Size of the last notification log snapshot in bytes. + # TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge + cortex_alertmanager_nflog_snapshot_size_bytes 111 + # HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds. + # TYPE cortex_alertmanager_notification_latency_seconds histogram + cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 15 + cortex_alertmanager_notification_latency_seconds_bucket{le="5"} 21 + cortex_alertmanager_notification_latency_seconds_bucket{le="10"} 23 + cortex_alertmanager_notification_latency_seconds_bucket{le="15"} 25 + cortex_alertmanager_notification_latency_seconds_bucket{le="20"} 27 + cortex_alertmanager_notification_latency_seconds_bucket{le="+Inf"} 27 + cortex_alertmanager_notification_latency_seconds_sum 99.9 + cortex_alertmanager_notification_latency_seconds_count 27 + # HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications. 
+ # TYPE cortex_alertmanager_notifications_failed_total counter + cortex_alertmanager_notifications_failed_total{user="user1"} 36 + cortex_alertmanager_notifications_failed_total{user="user2"} 360 + cortex_alertmanager_notifications_failed_total{user="user3"} 3600 + # HELP cortex_alertmanager_notifications_total The total number of attempted notifications. + # TYPE cortex_alertmanager_notifications_total counter + cortex_alertmanager_notifications_total{user="user1"} 36 + cortex_alertmanager_notifications_total{user="user2"} 360 + cortex_alertmanager_notifications_total{user="user3"} 3600 + # HELP cortex_alertmanager_silences How many silences by state. + # TYPE cortex_alertmanager_silences gauge + cortex_alertmanager_silences{state="active",user="user1"} 1 + cortex_alertmanager_silences{state="active",user="user2"} 10 + cortex_alertmanager_silences{state="active",user="user3"} 100 + cortex_alertmanager_silences{state="expired",user="user1"} 2 + cortex_alertmanager_silences{state="expired",user="user2"} 20 + cortex_alertmanager_silences{state="expired",user="user3"} 200 + cortex_alertmanager_silences{state="pending",user="user1"} 3 + cortex_alertmanager_silences{state="pending",user="user2"} 30 + cortex_alertmanager_silences{state="pending",user="user3"} 300 + # HELP cortex_alertmanager_silences_gc_duration_seconds Duration of the last silence garbage collection cycle. + # TYPE cortex_alertmanager_silences_gc_duration_seconds summary + cortex_alertmanager_silences_gc_duration_seconds_sum 111 + cortex_alertmanager_silences_gc_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_gossip_messages_propagated_total Number of received gossip messages that have been further gossiped. + # TYPE cortex_alertmanager_silences_gossip_messages_propagated_total counter + cortex_alertmanager_silences_gossip_messages_propagated_total 111 + # HELP cortex_alertmanager_silences_queries_total How many silence queries were received. + # TYPE cortex_alertmanager_silences_queries_total counter + cortex_alertmanager_silences_queries_total 111 + # HELP cortex_alertmanager_silences_query_duration_seconds Duration of silence query evaluation. + # TYPE cortex_alertmanager_silences_query_duration_seconds histogram + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.005"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.01"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.025"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.05"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.1"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.25"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="0.5"} 0 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="1"} 1 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="2.5"} 1 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="5"} 1 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="10"} 2 + cortex_alertmanager_silences_query_duration_seconds_bucket{le="+Inf"} 3 + cortex_alertmanager_silences_query_duration_seconds_sum 111 + cortex_alertmanager_silences_query_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_query_errors_total How many silence received queries did not succeed. 
+ # TYPE cortex_alertmanager_silences_query_errors_total counter + cortex_alertmanager_silences_query_errors_total 111 + # HELP cortex_alertmanager_silences_snapshot_duration_seconds Duration of the last silence snapshot. + # TYPE cortex_alertmanager_silences_snapshot_duration_seconds summary + cortex_alertmanager_silences_snapshot_duration_seconds_sum 111 + cortex_alertmanager_silences_snapshot_duration_seconds_count 3 + # HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes. + # TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge + cortex_alertmanager_silences_snapshot_size_bytes 111 + +`)) + require.NoError(t, err) +} + +func populateAlertmanager(base float64) *prometheus.Registry { + reg := prometheus.NewRegistry() + s := newSilenceMetrics(reg) + s.gcDuration.Observe(base) + s.snapshotDuration.Observe(base) + s.snapshotSize.Add(base) + s.queriesTotal.Add(base) + s.queryErrorsTotal.Add(base) + s.queryDuration.Observe(base) + s.propagatedMessagesTotal.Add(base) + s.silencesActive.Set(base) + s.silencesExpired.Set(base * 2) + s.silencesPending.Set(base * 3) + + n := newNflogMetrics(reg) + n.gcDuration.Observe(base) + n.snapshotDuration.Observe(base) + n.snapshotSize.Add(base) + n.queriesTotal.Add(base) + n.queryErrorsTotal.Add(base) + n.queryDuration.Observe(base) + n.propagatedMessagesTotal.Add(base) + + nm := newNotifyMetrics(reg) + for i, integration := range integrations { + nm.numNotifications.WithLabelValues(integration).Add(base * float64(i)) + nm.numFailedNotifications.WithLabelValues(integration).Add(base * float64(i)) + nm.notificationLatencySeconds.WithLabelValues(integration).Observe(base * float64(i) * 0.025) + } + + m := newMarkerMetrics(reg) + m.alerts.WithLabelValues(string(types.AlertStateActive)).Add(base) + m.alerts.WithLabelValues(string(types.AlertStateSuppressed)).Add(base * 2) + + v1APIMetrics := newAPIMetrics("v1", reg) + v1APIMetrics.firing.Add(base * 2) + v1APIMetrics.invalid.Add(base) + v1APIMetrics.resolved.Add(base * 3) + + v2APIMetrics := newAPIMetrics("v2", reg) + v2APIMetrics.firing.Add(base * 2) + v2APIMetrics.invalid.Add(base) + v2APIMetrics.resolved.Add(base * 3) + + return reg +} + +// Copied from github.com/alertmanager/nflog/nflog.go +type nflogMetrics struct { + gcDuration prometheus.Summary + snapshotDuration prometheus.Summary + snapshotSize prometheus.Gauge + queriesTotal prometheus.Counter + queryErrorsTotal prometheus.Counter + queryDuration prometheus.Histogram + propagatedMessagesTotal prometheus.Counter +} + +func newNflogMetrics(r prometheus.Registerer) *nflogMetrics { + m := &nflogMetrics{} + + m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_nflog_gc_duration_seconds", + Help: "Duration of the last notification log garbage collection cycle.", + Objectives: map[float64]float64{}, + }) + m.snapshotDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_nflog_snapshot_duration_seconds", + Help: "Duration of the last notification log snapshot.", + Objectives: map[float64]float64{}, + }) + m.snapshotSize = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_nflog_snapshot_size_bytes", + Help: "Size of the last notification log snapshot in bytes.", + }) + m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_nflog_queries_total", + Help: "Number of notification log queries were received.", + }) + m.queryErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: 
"alertmanager_nflog_query_errors_total", + Help: "Number notification log received queries that failed.", + }) + m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "alertmanager_nflog_query_duration_seconds", + Help: "Duration of notification log query evaluation.", + }) + m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_nflog_gossip_messages_propagated_total", + Help: "Number of received gossip messages that have been further gossiped.", + }) + + if r != nil { + r.MustRegister( + m.gcDuration, + m.snapshotDuration, + m.snapshotSize, + m.queriesTotal, + m.queryErrorsTotal, + m.queryDuration, + m.propagatedMessagesTotal, + ) + } + return m +} + +// Copied from github.com/alertmanager/silence/silence.go +type silenceMetrics struct { + gcDuration prometheus.Summary + snapshotDuration prometheus.Summary + snapshotSize prometheus.Gauge + queriesTotal prometheus.Counter + queryErrorsTotal prometheus.Counter + queryDuration prometheus.Histogram + silencesActive prometheus.Gauge + silencesPending prometheus.Gauge + silencesExpired prometheus.Gauge + propagatedMessagesTotal prometheus.Counter +} + +func newSilenceMetrics(r prometheus.Registerer) *silenceMetrics { + m := &silenceMetrics{} + + m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_silences_gc_duration_seconds", + Help: "Duration of the last silence garbage collection cycle.", + Objectives: map[float64]float64{}, + }) + m.snapshotDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "alertmanager_silences_snapshot_duration_seconds", + Help: "Duration of the last silence snapshot.", + Objectives: map[float64]float64{}, + }) + m.snapshotSize = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences_snapshot_size_bytes", + Help: "Size of the last silence snapshot in bytes.", + }) + m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_queries_total", + Help: "How many silence queries were received.", + }) + m.queryErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_query_errors_total", + Help: "How many silence received queries did not succeed.", + }) + m.queryDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "alertmanager_silences_query_duration_seconds", + Help: "Duration of silence query evaluation.", + }) + m.propagatedMessagesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_gossip_messages_propagated_total", + Help: "Number of received gossip messages that have been further gossiped.", + }) + m.silencesActive = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences", + Help: "How many silences by state.", + ConstLabels: prometheus.Labels{"state": string(types.SilenceStateActive)}, + }) + m.silencesPending = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences", + Help: "How many silences by state.", + ConstLabels: prometheus.Labels{"state": string(types.SilenceStatePending)}, + }) + m.silencesExpired = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_silences", + Help: "How many silences by state.", + ConstLabels: prometheus.Labels{"state": string(types.SilenceStateExpired)}, + }) + + if r != nil { + r.MustRegister( + m.gcDuration, + m.snapshotDuration, + m.snapshotSize, + m.queriesTotal, + m.queryErrorsTotal, + m.queryDuration, + m.silencesActive, + m.silencesPending, + m.silencesExpired, + m.propagatedMessagesTotal, + ) + 
} + return m +} + +// Copied from github.com/alertmanager/notify/notify.go +type notifyMetrics struct { + numNotifications *prometheus.CounterVec + numFailedNotifications *prometheus.CounterVec + notificationLatencySeconds *prometheus.HistogramVec +} + +func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics { + m := ¬ifyMetrics{ + numNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "alertmanager", + Name: "notifications_total", + Help: "The total number of attempted notifications.", + }, []string{"integration"}), + numFailedNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "alertmanager", + Name: "notifications_failed_total", + Help: "The total number of failed notifications.", + }, []string{"integration"}), + notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "alertmanager", + Name: "notification_latency_seconds", + Help: "The latency of notifications in seconds.", + Buckets: []float64{1, 5, 10, 15, 20}, + }, []string{"integration"}), + } + for _, integration := range integrations { + m.numNotifications.WithLabelValues(integration) + m.numFailedNotifications.WithLabelValues(integration) + m.notificationLatencySeconds.WithLabelValues(integration) + } + r.MustRegister(m.numNotifications, m.numFailedNotifications, m.notificationLatencySeconds) + return m +} + +type markerMetrics struct { + alerts *prometheus.GaugeVec +} + +func newMarkerMetrics(r prometheus.Registerer) *markerMetrics { + m := &markerMetrics{ + alerts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "alertmanager_alerts", + Help: "How many alerts by state.", + }, []string{"state"}), + } + + r.MustRegister(m.alerts) + return m +} + +// Copied from github.com/alertmanager/api/metrics/metrics.go +type apiMetrics struct { + firing prometheus.Counter + resolved prometheus.Counter + invalid prometheus.Counter +} + +func newAPIMetrics(version string, r prometheus.Registerer) *apiMetrics { + numReceivedAlerts := prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "alertmanager_alerts_received_total", + Help: "The total number of received alerts.", + ConstLabels: prometheus.Labels{"version": version}, + }, []string{"status"}) + numInvalidAlerts := prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_alerts_invalid_total", + Help: "The total number of received alerts that were invalid.", + ConstLabels: prometheus.Labels{"version": version}, + }) + if r != nil { + r.MustRegister(numReceivedAlerts, numInvalidAlerts) + } + return &apiMetrics{ + firing: numReceivedAlerts.WithLabelValues("firing"), + resolved: numReceivedAlerts.WithLabelValues("resolved"), + invalid: numInvalidAlerts, + } +} diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 60b8271a46e..58a573adab3 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -143,7 +143,8 @@ type MultitenantAlertmanager struct { alertmanagersMtx sync.Mutex alertmanagers map[string]*Alertmanager - logger log.Logger + logger log.Logger + metrics *alertmanagerMetrics peer *cluster.Peer @@ -152,7 +153,7 @@ type MultitenantAlertmanager struct { } // NewMultitenantAlertmanager creates a new MultitenantAlertmanager. 
-func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, logger log.Logger) (*MultitenantAlertmanager, error) { +func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, logger log.Logger, registerer prometheus.Registerer) (*MultitenantAlertmanager, error) { err := os.MkdirAll(cfg.DataDir, 0777) if err != nil { return nil, fmt.Errorf("unable to create Alertmanager data directory %q: %s", cfg.DataDir, err) @@ -174,7 +175,7 @@ func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, logger log.L if cfg.ClusterBindAddr != "" { peer, err = cluster.Create( log.With(logger, "component", "cluster"), - prometheus.DefaultRegisterer, + registerer, cfg.ClusterBindAddr, cfg.ClusterAdvertiseAddr, cfg.Peers, @@ -205,12 +206,18 @@ func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, logger log.L fallbackConfig: string(fallbackConfig), cfgs: map[string]alerts.AlertConfigDesc{}, alertmanagers: map[string]*Alertmanager{}, + metrics: newAlertmanagerMetrics(), peer: peer, store: store, logger: log.With(logger, "component", "MultiTenantAlertmanager"), stop: make(chan struct{}), done: make(chan struct{}), } + + if registerer != nil { + registerer.MustRegister(am.metrics) + } + return am, nil } @@ -300,15 +307,14 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi defer am.alertmanagersMtx.Unlock() for user, userAM := range am.alertmanagers { if _, exists := cfgs[user]; !exists { - level.Info(am.logger).Log("msg", "deleting per-tenant alertmanager", "user", user) - userAM.Stop() - delete(am.alertmanagers, user) + level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", user) + userAM.Pause() delete(am.cfgs, user) - level.Info(am.logger).Log("msg", "deleted per-tenant alertmanager", "user", user) + level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", user) } } totalConfigs.WithLabelValues("invalid").Set(float64(invalid)) - totalConfigs.WithLabelValues("valid").Set(float64(len(am.alertmanagers) - invalid)) + totalConfigs.WithLabelValues("valid").Set(float64(len(am.cfgs) - invalid)) } func (am *MultitenantAlertmanager) transformConfig(userID string, amConfig *amconfig.Config) (*amconfig.Config, error) { @@ -423,6 +429,8 @@ func (am *MultitenantAlertmanager) setConfig(cfg alerts.AlertConfigDesc) error { } func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amconfig.Config) (*Alertmanager, error) { + reg := prometheus.NewRegistry() + am.metrics.addUserRegistry(userID, reg) newAM, err := New(&Config{ UserID: userID, DataDir: am.cfg.DataDir, @@ -431,7 +439,7 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amco PeerTimeout: am.cfg.PeerTimeout, Retention: am.cfg.Retention, ExternalURL: am.cfg.ExternalURL.URL, - }) + }, reg) if err != nil { return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err) } @@ -452,7 +460,11 @@ func (am *MultitenantAlertmanager) ServeHTTP(w http.ResponseWriter, req *http.Re am.alertmanagersMtx.Lock() userAM, ok := am.alertmanagers[userID] am.alertmanagersMtx.Unlock() - if !ok { + + userAM.activeMtx.Lock() + defer userAM.activeMtx.Unlock() + + if !ok || !userAM.active { http.Error(w, fmt.Sprintf("no Alertmanager for this user ID"), http.StatusNotFound) return } diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 708004fd6e6..d8cb8207ccd 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -77,6 +77,7 @@ 
func TestLoadAllConfigs(t *testing.T) { logger: log.NewNopLogger(), stop: make(chan struct{}), done: make(chan struct{}), + metrics: newAlertmanagerMetrics(), } // Ensure the configs are synced correctly @@ -112,10 +113,36 @@ func TestLoadAllConfigs(t *testing.T) { require.True(t, exists) require.Equal(t, simpleConfigTwo, currentConfig.RawConfig) - // Test Delete User + // Test Delete User, ensure the config is removed but the alertmanager + // exists and is set to inactive delete(mockStore.configs, "user3") require.NoError(t, am.updateConfigs()) currentConfig, exists = am.cfgs["user3"] require.False(t, exists) - require.Equal(t, alerts.AlertConfigDesc{}, currentConfig) + require.Equal(t, "", currentConfig.RawConfig) + + userAM, exists := am.alertmanagers["user3"] + require.True(t, exists) + userAM.activeMtx.Lock() + require.False(t, userAM.active) + userAM.activeMtx.Unlock() + + // Ensure when a 3rd config is re-added, it is synced correctly + mockStore.configs["user3"] = alerts.AlertConfigDesc{ + User: "user3", + RawConfig: simpleConfigOne, + Templates: []*alerts.TemplateDesc{}, + } + + require.NoError(t, am.updateConfigs()) + + currentConfig, exists = am.cfgs["user3"] + require.True(t, exists) + require.Equal(t, simpleConfigOne, currentConfig.RawConfig) + + userAM, exists = am.alertmanagers["user3"] + require.True(t, exists) + userAM.activeMtx.Lock() + require.True(t, userAM.active) + userAM.activeMtx.Unlock() } diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 5a6d71c3dc6..ff7d4d65ec3 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -533,7 +533,7 @@ func (t *Cortex) stopConfigs() error { } func (t *Cortex) initAlertmanager(cfg *Config) (err error) { - t.alertmanager, err = alertmanager.NewMultitenantAlertmanager(&cfg.Alertmanager, util.Logger) + t.alertmanager, err = alertmanager.NewMultitenantAlertmanager(&cfg.Alertmanager, util.Logger, prometheus.DefaultRegisterer) if err != nil { return err } diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index 234b154ebd6..e934ab9c665 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -31,6 +31,13 @@ func (m singleValueWithLabelsMap) aggregateFn(labelsKey string, labelValues []st m[labelsKey] = r } +func (m singleValueWithLabelsMap) appendUserLabelValue(user string) { + for key, mlv := range m { + mlv.LabelValues = append([]string{user}, mlv.LabelValues...) + m[key] = mlv + } +} + func (m singleValueWithLabelsMap) WriteToMetricChannel(out chan<- prometheus.Metric, desc *prometheus.Desc, valueType prometheus.ValueType) { for _, cr := range m { out <- prometheus.MustNewConstMetric(desc, valueType, cr.Value, cr.LabelValues...) @@ -162,6 +169,17 @@ func (d MetricFamiliesPerUser) SendSumOfGaugesWithLabels(out chan<- prometheus.M d.sumOfSingleValuesWithLabels(gauge, gaugeValue, labelNames).WriteToMetricChannel(out, desc, prometheus.GaugeValue) } +// SendSumOfGaugesPerUserWithLabels provides metrics with the provided label names on a per-user basis.
This function assumes that `user` is the +// first label on the provided metric Desc +func (d MetricFamiliesPerUser) SendSumOfGaugesPerUserWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, metric string, labelNames ...string) { + for user, userMetrics := range d { + result := singleValueWithLabelsMap{} + userMetrics.sumOfSingleValuesWithLabels(metric, labelNames, gaugeValue, result.aggregateFn) + result.appendUserLabelValue(user) + result.WriteToMetricChannel(out, desc, prometheus.GaugeValue) + } +} + func (d MetricFamiliesPerUser) sumOfSingleValuesWithLabels(metric string, fn func(*dto.Metric) float64, labelNames []string) singleValueWithLabelsMap { result := singleValueWithLabelsMap{} for _, userMetrics := range d { diff --git a/pkg/util/metrics_helper_test.go b/pkg/util/metrics_helper_test.go index 2b4b3f6ab81..aec2a348bbe 100644 --- a/pkg/util/metrics_helper_test.go +++ b/pkg/util/metrics_helper_test.go @@ -4,6 +4,7 @@ import ( "testing" "github.com/gogo/protobuf/proto" + "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/require" ) @@ -81,3 +82,83 @@ func makeLabels(namesAndValues ...string) []*dto.LabelPair { return out } + +// TestSendSumOfGaugesPerUserWithLabels tests to ensure multiple metrics for the same user with a matching label are +// summed correctly +func TestSendSumOfGaugesPerUserWithLabels(t *testing.T) { + user1Metric := prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "test_metric"}, []string{"label_one", "label_two"}) + user2Metric := prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "test_metric"}, []string{"label_one", "label_two"}) + user1Metric.WithLabelValues("a", "b").Set(100) + user1Metric.WithLabelValues("a", "c").Set(80) + user2Metric.WithLabelValues("a", "b").Set(60) + user2Metric.WithLabelValues("a", "c").Set(40) + + user1Reg := prometheus.NewRegistry() + user2Reg := prometheus.NewRegistry() + user1Reg.MustRegister(user1Metric) + user2Reg.MustRegister(user2Metric) + + mf := BuildMetricFamiliesPerUserFromUserRegistries(map[string]*prometheus.Registry{ + "user-1": user1Reg, + "user-2": user2Reg, + }) + + desc := prometheus.NewDesc("test_metric", "", []string{"user", "label_one"}, nil) + actual, err := collectMetrics(func(out chan prometheus.Metric) { + mf.SendSumOfGaugesPerUserWithLabels(out, desc, "test_metric", "label_one") + }) + require.NoError(t, err) + expected := []*dto.Metric{ + {Label: makeLabels("label_one", "a", "user", "user-1"), Gauge: &dto.Gauge{Value: proto.Float64(180)}}, + {Label: makeLabels("label_one", "a", "user", "user-2"), Gauge: &dto.Gauge{Value: proto.Float64(100)}}, + } + require.ElementsMatch(t, expected, actual) + + desc = prometheus.NewDesc("test_metric", "", []string{"user", "label_two"}, nil) + actual, err = collectMetrics(func(out chan prometheus.Metric) { + mf.SendSumOfGaugesPerUserWithLabels(out, desc, "test_metric", "label_two") + }) + require.NoError(t, err) + expected = []*dto.Metric{ + {Label: makeLabels("label_two", "b", "user", "user-1"), Gauge: &dto.Gauge{Value: proto.Float64(100)}}, + {Label: makeLabels("label_two", "c", "user", "user-1"), Gauge: &dto.Gauge{Value: proto.Float64(80)}}, + {Label: makeLabels("label_two", "b", "user", "user-2"), Gauge: &dto.Gauge{Value: proto.Float64(60)}}, + {Label: makeLabels("label_two", "c", "user", "user-2"), Gauge: &dto.Gauge{Value: proto.Float64(40)}}, + } + require.ElementsMatch(t, expected, actual) + + desc = prometheus.NewDesc("test_metric", "", []string{"user", "label_one", "label_two"}, nil) + 
actual, err = collectMetrics(func(out chan prometheus.Metric) { + mf.SendSumOfGaugesPerUserWithLabels(out, desc, "test_metric", "label_one", "label_two") + }) + require.NoError(t, err) + expected = []*dto.Metric{ + {Label: makeLabels("label_one", "a", "label_two", "b", "user", "user-1"), Gauge: &dto.Gauge{Value: proto.Float64(100)}}, + {Label: makeLabels("label_one", "a", "label_two", "c", "user", "user-1"), Gauge: &dto.Gauge{Value: proto.Float64(80)}}, + {Label: makeLabels("label_one", "a", "label_two", "b", "user", "user-2"), Gauge: &dto.Gauge{Value: proto.Float64(60)}}, + {Label: makeLabels("label_one", "a", "label_two", "c", "user", "user-2"), Gauge: &dto.Gauge{Value: proto.Float64(40)}}, + } + require.ElementsMatch(t, expected, actual) +} + +func collectMetrics(send func(out chan prometheus.Metric)) ([]*dto.Metric, error) { + out := make(chan prometheus.Metric) + + go func() { + send(out) + close(out) + }() + + metrics := []*dto.Metric{} + for m := range out { + collected := &dto.Metric{} + err := m.Write(collected) + if err != nil { + return nil, err + } + + metrics = append(metrics, collected) + } + + return metrics, nil +} From 3bece6ca6bba5994099fcf07ab89192b8ef78aa8 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Thu, 27 Feb 2020 10:48:05 -0500 Subject: [PATCH 2/7] fix linting error Signed-off-by: Jacob Lisi --- pkg/alertmanager/alertmanager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index cafb95ea38a..95ce35787f5 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -181,7 +181,7 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config) error { am.active = true am.activeMtx.Unlock() - templateFiles := make([]string, len(conf.Templates), len(conf.Templates)) + templateFiles := make([]string, len(conf.Templates)) if len(conf.Templates) > 0 { for i, t := range conf.Templates { templateFiles[i] = filepath.Join(am.cfg.DataDir, "templates", userID, t) From 20cfea6ac6d92fb54b5ebc169f8c60bca97b5c57 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Fri, 28 Feb 2020 12:01:19 -0500 Subject: [PATCH 3/7] update in regards to PR comments Signed-off-by: Jacob Lisi --- pkg/alertmanager/alertmanager.go | 25 ++++++++++++++++--------- pkg/alertmanager/multitenant.go | 7 ++++++- pkg/alertmanager/multitenant_test.go | 8 ++------ pkg/util/metrics_helper.go | 4 ++-- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 95ce35787f5..3851a3afcb6 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -176,11 +176,6 @@ func clusterWait(p *cluster.Peer, timeout time.Duration) func() time.Duration { // ApplyConfig applies a new configuration to an Alertmanager. 
func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config) error { - // Ensure the alertmanager is set to active - am.activeMtx.Lock() - am.active = true - am.activeMtx.Unlock() - templateFiles := make([]string, len(conf.Templates)) if len(conf.Templates) > 0 { for i, t := range conf.Templates { @@ -241,25 +236,37 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config) error { go am.dispatcher.Run() go am.inhibitor.Run() + // Ensure the alertmanager is set to active + am.activeMtx.Lock() + am.active = true + am.activeMtx.Unlock() + return nil } +func (am *Alertmanager) isActive() bool { + am.activeMtx.Lock() + defer am.activeMtx.Unlock() + return am.active +} + // Pause running jobs in the alertmanager that are able to be restarted and sets -// to inactive +// the state to inactive func (am *Alertmanager) Pause() { // Set to inactive am.activeMtx.Lock() am.active = false am.activeMtx.Unlock() - // Ensure inhibitor is set before being called + // Stop the inhibitor and dispatcher which will be recreated when + // a new config is applied if am.inhibitor != nil { am.inhibitor.Stop() + am.inhibitor = nil } - - // Ensure dispatcher is set before being called if am.dispatcher != nil { am.dispatcher.Stop() + am.dispatcher = nil } // Remove all of the active silences from the alertmanager diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 58a573adab3..93c4089db54 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -307,6 +307,9 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi defer am.alertmanagersMtx.Unlock() for user, userAM := range am.alertmanagers { if _, exists := cfgs[user]; !exists { + // The user alertmanager is only paused in order to retain the prometheus metrics + // it has reported to it's registry. If a new config for this user appears, this structure + // will be reused.
level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", user) userAM.Pause() delete(am.cfgs, user) @@ -447,6 +450,8 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amco if err := newAM.ApplyConfig(userID, amConfig); err != nil { return nil, fmt.Errorf("unable to apply initial config for user %v: %v", userID, err) } + + am.metrics.addUserRegistry(userID, reg) return newAM, nil } @@ -464,7 +469,7 @@ func (am *MultitenantAlertmanager) ServeHTTP(w http.ResponseWriter, req *http.Re userAM.activeMtx.Lock() defer userAM.activeMtx.Unlock() - if !ok || !userAM.active { + if !ok || !userAM.isActive() { http.Error(w, fmt.Sprintf("no Alertmanager for this user ID"), http.StatusNotFound) return } diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index d8cb8207ccd..f38c35beaa5 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -123,9 +123,7 @@ func TestLoadAllConfigs(t *testing.T) { userAM, exists := am.alertmanagers["user3"] require.True(t, exists) - userAM.activeMtx.Lock() - require.False(t, userAM.active) - userAM.activeMtx.Unlock() + require.False(t, userAM.isActive()) // Ensure when a 3rd config is re-added, it is synced correctly mockStore.configs["user3"] = alerts.AlertConfigDesc{ @@ -142,7 +140,5 @@ func TestLoadAllConfigs(t *testing.T) { userAM, exists = am.alertmanagers["user3"] require.True(t, exists) - userAM.activeMtx.Lock() - require.True(t, userAM.active) - userAM.activeMtx.Unlock() + require.True(t, userAM.isActive()) } diff --git a/pkg/util/metrics_helper.go b/pkg/util/metrics_helper.go index e934ab9c665..47cc0d30f2d 100644 --- a/pkg/util/metrics_helper.go +++ b/pkg/util/metrics_helper.go @@ -31,7 +31,7 @@ func (m singleValueWithLabelsMap) aggregateFn(labelsKey string, labelValues []st m[labelsKey] = r } -func (m singleValueWithLabelsMap) appendUserLabelValue(user string) { +func (m singleValueWithLabelsMap) prependUserLabelValue(user string) { for key, mlv := range m { mlv.LabelValues = append([]string{user}, mlv.LabelValues...) 
m[key] = mlv @@ -175,7 +175,7 @@ func (d MetricFamiliesPerUser) SendSumOfGaugesPerUserWithLabels(out chan<- prome for user, userMetrics := range d { result := singleValueWithLabelsMap{} userMetrics.sumOfSingleValuesWithLabels(metric, labelNames, gaugeValue, result.aggregateFn) - result.appendUserLabelValue(user) + result.prependUserLabelValue(user) result.WriteToMetricChannel(out, desc, prometheus.GaugeValue) } } From ef6b4ff718104adb0293dbe0e218a0538f1a55aa Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Sun, 1 Mar 2020 14:02:46 -0500 Subject: [PATCH 4/7] fix double lock Signed-off-by: Jacob Lisi --- pkg/alertmanager/multitenant.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 93c4089db54..e26073541a4 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -466,9 +466,6 @@ func (am *MultitenantAlertmanager) ServeHTTP(w http.ResponseWriter, req *http.Re userAM, ok := am.alertmanagers[userID] am.alertmanagersMtx.Unlock() - userAM.activeMtx.Lock() - defer userAM.activeMtx.Unlock() - if !ok || !userAM.isActive() { http.Error(w, fmt.Sprintf("no Alertmanager for this user ID"), http.StatusNotFound) return From 87bf3d41a4b2d66f04b49e44fc37809a5c520ebf Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Sun, 1 Mar 2020 14:23:20 -0500 Subject: [PATCH 5/7] adjust cortex_alertmanager_alerts_received_total to be per user Signed-off-by: Jacob Lisi --- pkg/alertmanager/alertmanager_metrics.go | 8 ++++---- pkg/alertmanager/alertmanager_metrics_test.go | 8 ++++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go index 723f1355d74..721053dcbb4 100644 --- a/pkg/alertmanager/alertmanager_metrics.go +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -54,11 +54,11 @@ func newAlertmanagerMetrics() *alertmanagerMetrics { alertsReceived: prometheus.NewDesc( "cortex_alertmanager_alerts_received_total", "The total number of received alerts.", - nil, nil), + []string{"user"}, nil), alertsInvalid: prometheus.NewDesc( "cortex_alertmanager_alerts_invalid_total", "The total number of received alerts that were invalid.", - nil, nil), + []string{"user"}, nil), numNotifications: prometheus.NewDesc( "cortex_alertmanager_notifications_total", "The total number of attempted notifications.", @@ -183,8 +183,8 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data := util.BuildMetricFamiliesPerUserFromUserRegistries(m.registries()) - data.SendSumOfCounters(out, m.alertsReceived, "alertmanager_alerts_received_total") - data.SendSumOfCounters(out, m.alertsInvalid, "alertmanager_alerts_invalid_total") + data.SendSumOfCountersPerUser(out, m.alertsReceived, "alertmanager_alerts_received_total") + data.SendSumOfCountersPerUser(out, m.alertsInvalid, "alertmanager_alerts_invalid_total") data.SendSumOfCountersPerUser(out, m.numNotifications, "alertmanager_notifications_total") data.SendSumOfCountersPerUser(out, m.numFailedNotifications, "alertmanager_notifications_failed_total") diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go index bd893c646ba..6a8c2b6ba0c 100644 --- a/pkg/alertmanager/alertmanager_metrics_test.go +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -43,10 +43,14 @@ func TestAlertmanagerMetricsStore(t *testing.T) { 
cortex_alertmanager_alerts{state="suppressed",user="user3"} 200 # HELP cortex_alertmanager_alerts_invalid_total The total number of received alerts that were invalid. # TYPE cortex_alertmanager_alerts_invalid_total counter - cortex_alertmanager_alerts_invalid_total 222 + cortex_alertmanager_alerts_invalid_total{user="user1"} 2 + cortex_alertmanager_alerts_invalid_total{user="user2"} 20 + cortex_alertmanager_alerts_invalid_total{user="user3"} 200 # HELP cortex_alertmanager_alerts_received_total The total number of received alerts. # TYPE cortex_alertmanager_alerts_received_total counter - cortex_alertmanager_alerts_received_total 1110 + cortex_alertmanager_alerts_received_total{user="user1"} 10 + cortex_alertmanager_alerts_received_total{user="user2"} 100 + cortex_alertmanager_alerts_received_total{user="user3"} 1000 # HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle. # TYPE cortex_alertmanager_nflog_gc_duration_seconds summary cortex_alertmanager_nflog_gc_duration_seconds_sum 111 From f50885f47700030471d19d23d19c9fe86177dc6f Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Mon, 2 Mar 2020 09:56:01 -0500 Subject: [PATCH 6/7] make isActive public function and fix typo Signed-off-by: Jacob Lisi --- pkg/alertmanager/alertmanager.go | 2 +- pkg/alertmanager/multitenant.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 3851a3afcb6..89e845ea130 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -244,7 +244,7 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config) error { return nil } -func (am *Alertmanager) isActive() bool { +func (am *Alertmanager) IsActive() bool { am.activeMtx.Lock() defer am.activeMtx.Unlock() return am.active diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index e26073541a4..3e2d3aecd9c 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -308,7 +308,7 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi for user, userAM := range am.alertmanagers { if _, exists := cfgs[user]; !exists { // The user alertmanager is only paused in order to retain the prometheus metrics - // it has reported to it's registry. If a new config for this user appears, this structure + // it has reported to its registry. If a new config for this user appears, this structure // will be reused. 
level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", user) userAM.Pause() @@ -466,7 +466,7 @@ func (am *MultitenantAlertmanager) ServeHTTP(w http.ResponseWriter, req *http.Re userAM, ok := am.alertmanagers[userID] am.alertmanagersMtx.Unlock() - if !ok || !userAM.isActive() { + if !ok || !userAM.IsActive() { http.Error(w, fmt.Sprintf("no Alertmanager for this user ID"), http.StatusNotFound) return } From cabeb79e29ab90e3e933ea704381181c3767a0e2 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Mon, 2 Mar 2020 09:56:54 -0500 Subject: [PATCH 7/7] comment IsActive Signed-off-by: Jacob Lisi --- pkg/alertmanager/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 89e845ea130..0d7cce998e4 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -244,6 +244,8 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config) error { return nil } +// IsActive returns true if the alertmanager is currently running, +// or false if it has been paused func (am *Alertmanager) IsActive() bool { am.activeMtx.Lock() defer am.activeMtx.Unlock()
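Note on the aggregation pattern used by this series: alertmanagerMetrics.Collect relies on Cortex's util.BuildMetricFamiliesPerUserFromUserRegistries and the SendSumOf* helpers to turn each tenant's private registry into cortex_alertmanager_* series. The sketch below shows the same idea in miniature using only the public prometheus client_golang API; the userCollector type and the hard-coded metric name are illustrative assumptions, not the actual Cortex implementation.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// userCollector is a hypothetical, simplified stand-in for alertmanagerMetrics:
// it keeps one registry per tenant and re-exports a single counter with a
// "user" label by gathering each tenant registry on every scrape.
type userCollector struct {
	regs map[string]*prometheus.Registry
	desc *prometheus.Desc
}

func (c *userCollector) Describe(out chan<- *prometheus.Desc) {
	out <- c.desc
}

func (c *userCollector) Collect(out chan<- prometheus.Metric) {
	for user, reg := range c.regs {
		mfs, err := reg.Gather()
		if err != nil {
			continue // the real code aggregates via util.BuildMetricFamiliesPerUserFromUserRegistries
		}
		for _, mf := range mfs {
			if mf.GetName() != "alertmanager_notifications_total" {
				continue
			}
			// Sum every series of the tenant's counter (e.g. one per integration)
			// before re-exporting it as a single per-user value.
			sum := 0.0
			for _, m := range mf.GetMetric() {
				sum += m.GetCounter().GetValue()
			}
			out <- prometheus.MustNewConstMetric(c.desc, prometheus.CounterValue, sum, user)
		}
	}
}

func main() {
	// Per-tenant registry, analogous to the one newAlertmanager() hands to New().
	userReg := prometheus.NewRegistry()
	notifications := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "alertmanager_notifications_total",
		Help: "The total number of attempted notifications.",
	}, []string{"integration"})
	userReg.MustRegister(notifications)
	notifications.WithLabelValues("slack").Add(3)
	notifications.WithLabelValues("email").Add(2)

	// Main registry, analogous to the registerer passed to NewMultitenantAlertmanager().
	agg := &userCollector{
		regs: map[string]*prometheus.Registry{"user1": userReg},
		desc: prometheus.NewDesc(
			"cortex_alertmanager_notifications_total",
			"The total number of attempted notifications.",
			[]string{"user"}, nil),
	}
	mainReg := prometheus.NewRegistry()
	mainReg.MustRegister(agg)

	mfs, err := mainReg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		for _, m := range mf.GetMetric() {
			// Prints one cortex_alertmanager_notifications_total series with user="user1" and value 5.
			fmt.Printf("%s user=%s value=%v\n", mf.GetName(), m.GetLabel()[0].GetValue(), m.GetCounter().GetValue())
		}
	}
}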