Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* `--experimental.distributor.user-subring-size`
* [FEATURE] Added flag `-experimental.ruler.enable-api` to enable the ruler api which implements the Prometheus API `/api/v1/rules` and `/api/v1/alerts` endpoints under the configured `-http.prefix`. #1999
* [FEATURE] Added sharding support to compactor when using the experimental TSDB blocks storage. #2113
* [ENHANCEMENT] Alertmanager: Expose per-tenant alertmanager metrics. #2116
* [ENHANCEMENT] Experimental TSDB: Export TSDB Syncer metrics from Compactor component, they are prefixed with `cortex_compactor_`. #2023
* [ENHANCEMENT] Experimental TSDB: Added dedicated flag `-experimental.tsdb.bucket-store.tenant-sync-concurrency` to configure the maximum number of concurrent tenants for which blocks are synched. #2026
* [ENHANCEMENT] Experimental TSDB: Expose metrics for objstore operations (prefixed with `cortex_<component>_thanos_objstore_`, component being one of `ingester`, `querier` and `compactor`). #2027
Expand Down
7 changes: 2 additions & 5 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,17 +84,14 @@ func init() {
}

// New creates a new Alertmanager.
func New(cfg *Config) (*Alertmanager, error) {
func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
am := &Alertmanager{
cfg: cfg,
logger: log.With(cfg.Logger, "user", cfg.UserID),
stop: make(chan struct{}),
}

// TODO(cortex): Build a registry that can merge metrics from multiple users.
// For now, these metrics are ignored, as we can't register the same
// metric twice with a single registry.
am.registry = prometheus.NewRegistry()
am.registry = reg

am.wg.Add(1)
nflogID := fmt.Sprintf("nflog:%s", cfg.UserID)
Expand Down
209 changes: 209 additions & 0 deletions pkg/alertmanager/alertmanager_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
package alertmanager

import (
"sync"

"github.com/prometheus/client_golang/prometheus"

"github.com/cortexproject/cortex/pkg/util"
)

// This struct aggregates metrics exported by Alertmanager
// and re-exports those aggregates as Cortex metrics.
type alertmanagerMetrics struct {
// Maps userID -> registry
regsMu sync.Mutex
regs map[string]*prometheus.Registry

// exported metrics, gathered from Alertmanager API
alertsReceived *prometheus.Desc
alertsInvalid *prometheus.Desc

// exported metrics, gathered from Alertmanager PipelineBuilder
numNotifications *prometheus.Desc
numFailedNotifications *prometheus.Desc
notificationLatencySeconds *prometheus.Desc

// exported metrics, gathered from Alertmanager nflog
nflogGCDuration *prometheus.Desc
nflogSnapshotDuration *prometheus.Desc
nflogSnapshotSize *prometheus.Desc
nflogQueriesTotal *prometheus.Desc
nflogQueryErrorsTotal *prometheus.Desc
nflogQueryDuration *prometheus.Desc
nflogPropagatedMessagesTotal *prometheus.Desc

// exported metrics, gathered from Alertmanager Marker
markerAlerts *prometheus.Desc

// exported metrics, gathered from Alertmanager Silences
silencesGCDuration *prometheus.Desc
silencesSnapshotDuration *prometheus.Desc
silencesSnapshotSize *prometheus.Desc
silencesQueriesTotal *prometheus.Desc
silencesQueryErrorsTotal *prometheus.Desc
silencesQueryDuration *prometheus.Desc
silences *prometheus.Desc
silencesPropagatedMessagesTotal *prometheus.Desc
}

// newAlertmanagerMetrics builds the descriptor set for the aggregated
// cortex_alertmanager_* metrics and an empty per-tenant registry map.
//
// Only a few descriptors carry a "user" label (notifications, alerts and
// silences by state); the rest are summed across all tenants to keep
// series cardinality low. NOTE(review): help strings intentionally mirror
// the upstream Alertmanager metric help texts — do not reword them here.
func newAlertmanagerMetrics() *alertmanagerMetrics {
	return &alertmanagerMetrics{
		regs: map[string]*prometheus.Registry{},
		alertsReceived: prometheus.NewDesc(
			"cortex_alertmanager_alerts_received_total",
			"The total number of received alerts.",
			nil, nil),
		alertsInvalid: prometheus.NewDesc(
			"cortex_alertmanager_alerts_invalid_total",
			"The total number of received alerts that were invalid.",
			nil, nil),
		numNotifications: prometheus.NewDesc(
			"cortex_alertmanager_notifications_total",
			"The total number of attempted notifications.",
			[]string{"user"}, nil),
		numFailedNotifications: prometheus.NewDesc(
			"cortex_alertmanager_notifications_failed_total",
			"The total number of failed notifications.",
			[]string{"user"}, nil),
		notificationLatencySeconds: prometheus.NewDesc(
			"cortex_alertmanager_notification_latency_seconds",
			"The latency of notifications in seconds.",
			nil, nil),
		nflogGCDuration: prometheus.NewDesc(
			"cortex_alertmanager_nflog_gc_duration_seconds",
			"Duration of the last notification log garbage collection cycle.",
			nil, nil),
		nflogSnapshotDuration: prometheus.NewDesc(
			"cortex_alertmanager_nflog_snapshot_duration_seconds",
			"Duration of the last notification log snapshot.",
			nil, nil),
		nflogSnapshotSize: prometheus.NewDesc(
			"cortex_alertmanager_nflog_snapshot_size_bytes",
			"Size of the last notification log snapshot in bytes.",
			nil, nil),
		nflogQueriesTotal: prometheus.NewDesc(
			"cortex_alertmanager_nflog_queries_total",
			"Number of notification log queries were received.",
			nil, nil),
		nflogQueryErrorsTotal: prometheus.NewDesc(
			"cortex_alertmanager_nflog_query_errors_total",
			"Number notification log received queries that failed.",
			nil, nil),
		nflogQueryDuration: prometheus.NewDesc(
			"cortex_alertmanager_nflog_query_duration_seconds",
			"Duration of notification log query evaluation.",
			nil, nil),
		nflogPropagatedMessagesTotal: prometheus.NewDesc(
			"cortex_alertmanager_nflog_gossip_messages_propagated_total",
			"Number of received gossip messages that have been further gossiped.",
			nil, nil),
		markerAlerts: prometheus.NewDesc(
			"cortex_alertmanager_alerts",
			"How many alerts by state.",
			[]string{"user", "state"}, nil),
		silencesGCDuration: prometheus.NewDesc(
			"cortex_alertmanager_silences_gc_duration_seconds",
			"Duration of the last silence garbage collection cycle.",
			nil, nil),
		silencesSnapshotDuration: prometheus.NewDesc(
			"cortex_alertmanager_silences_snapshot_duration_seconds",
			"Duration of the last silence snapshot.",
			nil, nil),
		silencesSnapshotSize: prometheus.NewDesc(
			"cortex_alertmanager_silences_snapshot_size_bytes",
			"Size of the last silence snapshot in bytes.",
			nil, nil),
		silencesQueriesTotal: prometheus.NewDesc(
			"cortex_alertmanager_silences_queries_total",
			"How many silence queries were received.",
			nil, nil),
		silencesQueryErrorsTotal: prometheus.NewDesc(
			"cortex_alertmanager_silences_query_errors_total",
			"How many silence received queries did not succeed.",
			nil, nil),
		silencesQueryDuration: prometheus.NewDesc(
			"cortex_alertmanager_silences_query_duration_seconds",
			"Duration of silence query evaluation.",
			nil, nil),
		silencesPropagatedMessagesTotal: prometheus.NewDesc(
			"cortex_alertmanager_silences_gossip_messages_propagated_total",
			"Number of received gossip messages that have been further gossiped.",
			nil, nil),
		silences: prometheus.NewDesc(
			"cortex_alertmanager_silences",
			"How many silences by state.",
			[]string{"user", "state"}, nil),
	}
}

// addUserRegistry stores the given tenant's registry under its user ID,
// replacing any registry previously registered for that tenant.
// Safe for concurrent use.
func (m *alertmanagerMetrics) addUserRegistry(user string, reg *prometheus.Registry) {
	m.regsMu.Lock()
	defer m.regsMu.Unlock()

	m.regs[user] = reg
}

// registries returns a point-in-time copy of the per-tenant registry map,
// so callers (Collect) can iterate over it without holding the lock.
func (m *alertmanagerMetrics) registries() map[string]*prometheus.Registry {
	m.regsMu.Lock()
	defer m.regsMu.Unlock()

	// Pre-size the copy to avoid map growth while copying.
	regs := make(map[string]*prometheus.Registry, len(m.regs))
	for uid, r := range m.regs {
		regs[uid] = r
	}
	return regs
}

// Describe implements prometheus.Collector, emitting every aggregated
// descriptor exactly once.
func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
	descs := []*prometheus.Desc{
		m.alertsReceived,
		m.alertsInvalid,
		m.numNotifications,
		m.numFailedNotifications,
		m.notificationLatencySeconds,
		m.nflogGCDuration,
		m.nflogSnapshotDuration,
		m.nflogSnapshotSize,
		m.nflogQueriesTotal,
		m.nflogQueryErrorsTotal,
		m.nflogQueryDuration,
		m.nflogPropagatedMessagesTotal,
		m.markerAlerts,
		m.silencesGCDuration,
		m.silencesSnapshotDuration,
		m.silencesSnapshotSize,
		m.silencesQueriesTotal,
		m.silencesQueryErrorsTotal,
		m.silencesQueryDuration,
		m.silences,
		m.silencesPropagatedMessagesTotal,
	}
	for _, d := range descs {
		out <- d
	}
}

// Collect implements prometheus.Collector. It gathers the raw
// alertmanager_* metric families from every tenant registry and re-emits
// them as cortex_alertmanager_* aggregates: most series are summed across
// tenants, while notifications, alerts and silences keep a per-user
// breakdown (and alerts/silences additionally keep the "state" label).
func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
	data := util.BuildMetricFamiliesPerUserFromUserRegistries(m.registries())

	// API-level alert counters, summed across all tenants.
	data.SendSumOfCounters(out, m.alertsReceived, "alertmanager_alerts_received_total")
	data.SendSumOfCounters(out, m.alertsInvalid, "alertmanager_alerts_invalid_total")

	// Notification pipeline metrics; totals keep the per-user breakdown.
	data.SendSumOfCountersPerUser(out, m.numNotifications, "alertmanager_notifications_total")
	data.SendSumOfCountersPerUser(out, m.numFailedNotifications, "alertmanager_notifications_failed_total")
	data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
	data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state")

	// Notification log (nflog) metrics, summed across all tenants.
	data.SendSumOfSummaries(out, m.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds")
	data.SendSumOfSummaries(out, m.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds")
	data.SendSumOfGauges(out, m.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes")
	data.SendSumOfCounters(out, m.nflogQueriesTotal, "alertmanager_nflog_queries_total")
	data.SendSumOfCounters(out, m.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total")
	data.SendSumOfHistograms(out, m.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
	data.SendSumOfCounters(out, m.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")

	// Silence metrics, summed across all tenants except the per-user,
	// per-state silence gauge.
	data.SendSumOfSummaries(out, m.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
	data.SendSumOfSummaries(out, m.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
	data.SendSumOfGauges(out, m.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
	data.SendSumOfCounters(out, m.silencesQueriesTotal, "alertmanager_silences_queries_total")
	data.SendSumOfCounters(out, m.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total")
	data.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
	data.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
	data.SendSumOfGaugesPerUserWithLabels(out, m.silences, "alertmanager_silences", "state")
}
Loading