Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
* [FEATURE] Add /config HTTP endpoint which exposes the current Cortex configuration as YAML. #2165
* [FEATURE] Allow Prometheus remote write directly to ingesters. #1491
* [FEATURE] Add flag `-experimental.tsdb.stripe-size` to expose TSDB stripe size option. #2185
* [ENHANCEMENT] Alertmanager: Expose per-tenant Alertmanager metrics. #2124
* [ENHANCEMENT] Add `status` label to `cortex_alertmanager_configs` metric to gauge the number of valid and invalid configs. #2125
* [ENHANCEMENT] Cassandra Authentication: added the `custom_authenticators` config option that allows users to authenticate with cassandra clusters using password authenticators that are not approved by default in [gocql](https://github.com/gocql/gocql/blob/81b8263d9fe526782a588ef94d3fa5c6148e5d67/conn.go#L27) #2093
* [ENHANCEMENT] Experimental TSDB: Export TSDB Syncer metrics from Compactor component, they are prefixed with `cortex_compactor_`. #2023
Expand Down
3 changes: 3 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMo
github.com/fsouza/fake-gcs-server v1.7.0 h1:Un0BXUXrRWYSmYyC1Rqm2e2WJfTPyDy/HGMz31emTi8=
github.com/fsouza/fake-gcs-server v1.7.0/go.mod h1:5XIRs4YvwNbNoz+1JF8j6KLAyDh7RHGAyAK3EP2EsNk=
github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/globalsign/mgo v0.0.0-20180905125535-1ca0a4f7cbcb/go.mod h1:xkRDCp4j0OGD1HRkm4kmhM+pmpv3AKq5SU7GMg4oO/Q=
github.com/globalsign/mgo v0.0.0-20181015135952-eeefdecb41b8/go.mod h1:xkRDCp4j0OGD1HRkm4kmhM+pmpv3AKq5SU7GMg4oO/Q=
Expand Down Expand Up @@ -361,6 +362,7 @@ github.com/google/martian v2.1.0+incompatible h1:/CP5g8u/VJHijgedC/Legn3BAbAaWPg
github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
github.com/google/pprof v0.0.0-20190723021845-34ac40c74b70 h1:XTnP8fJpa4Kvpw2qARB4KS9izqxPS0Sd92cDlY3uk+w=
github.com/google/pprof v0.0.0-20190723021845-34ac40c74b70/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
Expand Down Expand Up @@ -461,6 +463,7 @@ github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
github.com/influxdata/influxdb v1.7.7 h1:UvNzAPfBrKMENVbQ4mr4ccA9sW+W1Ihl0Yh1s0BiVAg=
github.com/influxdata/influxdb v1.7.7/go.mod h1:qZna6X/4elxqT3yI9iZYdZrWWdeFOOprn86kgg4+IzY=
github.com/jackc/fake v0.0.0-20150926172116-812a484cc733/go.mod h1:WrMFNQdiFJ80sQsxDoMokWK1W5TQtxBFNpzWTD84ibQ=
github.com/jackc/pgx v3.2.0+incompatible/go.mod h1:0ZGrqGqkRlliWnWB4zKnWtjbSWbGkVEFm4TeybAXq+I=
Expand Down
73 changes: 64 additions & 9 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"time"

"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/prometheus/alertmanager/api"
"github.com/prometheus/alertmanager/cluster"
"github.com/prometheus/alertmanager/config"
Expand Down Expand Up @@ -66,6 +67,9 @@ type Alertmanager struct {
wg sync.WaitGroup
mux *http.ServeMux
registry *prometheus.Registry

activeMtx sync.Mutex
active bool
}

var webReload = make(chan chan error)
Expand All @@ -81,17 +85,16 @@ func init() {
}

// New creates a new Alertmanager.
func New(cfg *Config) (*Alertmanager, error) {
func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
am := &Alertmanager{
cfg: cfg,
logger: log.With(cfg.Logger, "user", cfg.UserID),
stop: make(chan struct{}),
cfg: cfg,
logger: log.With(cfg.Logger, "user", cfg.UserID),
stop: make(chan struct{}),
active: false,
activeMtx: sync.Mutex{},
}

// TODO(cortex): Build a registry that can merge metrics from multiple users.
// For now, these metrics are ignored, as we can't register the same
// metric twice with a single registry.
am.registry = prometheus.NewRegistry()
am.registry = reg

am.wg.Add(1)
nflogID := fmt.Sprintf("nflog:%s", cfg.UserID)
Expand Down Expand Up @@ -233,12 +236,64 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config) error {
go am.dispatcher.Run()
go am.inhibitor.Run()

// Ensure the alertmanager is set to active
am.activeMtx.Lock()
am.active = true
am.activeMtx.Unlock()

return nil
}

// IsActive reports whether the alertmanager is currently active. The flag is
// switched on at the end of ApplyConfig and switched off again by Pause.
func (am *Alertmanager) IsActive() bool {
	// Read the flag under the mutex and release it before returning.
	am.activeMtx.Lock()
	running := am.active
	am.activeMtx.Unlock()

	return running
}

// Pause marks the alertmanager as inactive and stops the components that are
// started again by ApplyConfig, namely the inhibitor and the dispatcher. It
// then expires every silence returned by the silences store. Errors hit while
// querying or expiring silences are logged as warnings and do not abort the
// pause.
func (am *Alertmanager) Pause() {
	// Flip the flag first, under the mutex, so IsActive reports false.
	am.activeMtx.Lock()
	am.active = false
	am.activeMtx.Unlock()

	// The inhibitor and dispatcher are recreated when a new config is
	// applied, so the references are dropped once they have been stopped.
	if inh := am.inhibitor; inh != nil {
		inh.Stop()
		am.inhibitor = nil
	}
	if disp := am.dispatcher; disp != nil {
		disp.Stop()
		am.dispatcher = nil
	}

	// Expire whatever the silences store returns for an unfiltered query.
	// NOTE(review): an unfiltered query presumably also returns silences that
	// are already expired, for which Expire may fail and be logged below;
	// confirm whether the query should be restricted by state.
	all, _, err := am.silences.Query()
	if err != nil {
		level.Warn(am.logger).Log("msg", "unable to retrieve silences for removal", "err", err)
	}
	for _, s := range all {
		if expErr := am.silences.Expire(s.Id); expErr != nil {
			level.Warn(am.logger).Log("msg", "unable to remove silence", "err", expErr, "silence", s.Id)
		}
	}
}

// Stop stops the Alertmanager.
func (am *Alertmanager) Stop() {
am.dispatcher.Stop()
if am.inhibitor != nil {
am.inhibitor.Stop()
}

if am.dispatcher != nil {
am.dispatcher.Stop()
}

am.alerts.Close()
close(am.stop)
am.wg.Wait()
Expand Down
210 changes: 210 additions & 0 deletions pkg/alertmanager/alertmanager_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
package alertmanager

import (
"sync"

"github.com/prometheus/client_golang/prometheus"

"github.com/cortexproject/cortex/pkg/util"
)

// alertmanagerMetrics gathers the metrics exported by the per-user
// Alertmanager registries and re-exports them, aggregated, as Cortex metrics.
type alertmanagerMetrics struct {
	// regs maps a user ID to that user's registry; regsMu guards the map.
	regsMu sync.Mutex
	regs   map[string]*prometheus.Registry

	// Descriptors for metrics gathered from the Alertmanager API.
	alertsReceived, alertsInvalid *prometheus.Desc

	// Descriptors for metrics gathered from the Alertmanager PipelineBuilder.
	numNotifications, numFailedNotifications *prometheus.Desc
	notificationLatencySeconds               *prometheus.Desc

	// Descriptors for metrics gathered from the Alertmanager nflog.
	nflogGCDuration, nflogSnapshotDuration, nflogSnapshotSize *prometheus.Desc
	nflogQueriesTotal, nflogQueryErrorsTotal                  *prometheus.Desc
	nflogQueryDuration, nflogPropagatedMessagesTotal          *prometheus.Desc

	// Descriptor for the metric gathered from the Alertmanager Marker.
	markerAlerts *prometheus.Desc

	// Descriptors for metrics gathered from the Alertmanager Silences.
	silencesGCDuration, silencesSnapshotDuration, silencesSnapshotSize *prometheus.Desc
	silencesQueriesTotal, silencesQueryErrorsTotal                     *prometheus.Desc
	silencesQueryDuration, silences                                    *prometheus.Desc
	silencesPropagatedMessagesTotal                                    *prometheus.Desc
}

// newAlertmanagerMetrics returns an alertmanagerMetrics with an empty
// user -> registry map and one descriptor per exported cortex_alertmanager_*
// metric.
func newAlertmanagerMetrics() *alertmanagerMetrics {
	// desc builds a descriptor without constant labels. The variable label
	// names are optional; passing none yields a nil label slice.
	desc := func(name, help string, labels ...string) *prometheus.Desc {
		return prometheus.NewDesc(name, help, labels, nil)
	}

	// regsMu is left at its zero value, which is an unlocked mutex.
	return &alertmanagerMetrics{
		regs: map[string]*prometheus.Registry{},

		// Alertmanager API.
		alertsReceived: desc(
			"cortex_alertmanager_alerts_received_total",
			"The total number of received alerts.",
			"user"),
		alertsInvalid: desc(
			"cortex_alertmanager_alerts_invalid_total",
			"The total number of received alerts that were invalid.",
			"user"),

		// Notification pipeline.
		numNotifications: desc(
			"cortex_alertmanager_notifications_total",
			"The total number of attempted notifications.",
			"user"),
		numFailedNotifications: desc(
			"cortex_alertmanager_notifications_failed_total",
			"The total number of failed notifications.",
			"user"),
		notificationLatencySeconds: desc(
			"cortex_alertmanager_notification_latency_seconds",
			"The latency of notifications in seconds."),

		// Notification log.
		nflogGCDuration: desc(
			"cortex_alertmanager_nflog_gc_duration_seconds",
			"Duration of the last notification log garbage collection cycle."),
		nflogSnapshotDuration: desc(
			"cortex_alertmanager_nflog_snapshot_duration_seconds",
			"Duration of the last notification log snapshot."),
		nflogSnapshotSize: desc(
			"cortex_alertmanager_nflog_snapshot_size_bytes",
			"Size of the last notification log snapshot in bytes."),
		nflogQueriesTotal: desc(
			"cortex_alertmanager_nflog_queries_total",
			"Number of notification log queries were received."),
		nflogQueryErrorsTotal: desc(
			"cortex_alertmanager_nflog_query_errors_total",
			"Number notification log received queries that failed."),
		nflogQueryDuration: desc(
			"cortex_alertmanager_nflog_query_duration_seconds",
			"Duration of notification log query evaluation."),
		nflogPropagatedMessagesTotal: desc(
			"cortex_alertmanager_nflog_gossip_messages_propagated_total",
			"Number of received gossip messages that have been further gossiped."),

		// Marker.
		markerAlerts: desc(
			"cortex_alertmanager_alerts",
			"How many alerts by state.",
			"user", "state"),

		// Silences.
		silencesGCDuration: desc(
			"cortex_alertmanager_silences_gc_duration_seconds",
			"Duration of the last silence garbage collection cycle."),
		silencesSnapshotDuration: desc(
			"cortex_alertmanager_silences_snapshot_duration_seconds",
			"Duration of the last silence snapshot."),
		silencesSnapshotSize: desc(
			"cortex_alertmanager_silences_snapshot_size_bytes",
			"Size of the last silence snapshot in bytes."),
		silencesQueriesTotal: desc(
			"cortex_alertmanager_silences_queries_total",
			"How many silence queries were received."),
		silencesQueryErrorsTotal: desc(
			"cortex_alertmanager_silences_query_errors_total",
			"How many silence received queries did not succeed."),
		silencesQueryDuration: desc(
			"cortex_alertmanager_silences_query_duration_seconds",
			"Duration of silence query evaluation."),
		silencesPropagatedMessagesTotal: desc(
			"cortex_alertmanager_silences_gossip_messages_propagated_total",
			"Number of received gossip messages that have been further gossiped."),
		silences: desc(
			"cortex_alertmanager_silences",
			"How many silences by state.",
			"user", "state"),
	}
}

// addUserRegistry records reg as the registry for user, replacing any
// registry previously stored under the same user.
func (m *alertmanagerMetrics) addUserRegistry(user string, reg *prometheus.Registry) {
	m.regsMu.Lock()
	defer m.regsMu.Unlock()

	m.regs[user] = reg
}

// registries returns a copy of the user -> registry map, taken while holding
// regsMu, so the caller can iterate over the result without the lock.
func (m *alertmanagerMetrics) registries() map[string]*prometheus.Registry {
	m.regsMu.Lock()
	defer m.regsMu.Unlock()

	snapshot := make(map[string]*prometheus.Registry, len(m.regs))
	for user, reg := range m.regs {
		snapshot[user] = reg
	}
	return snapshot
}

// Describe sends the descriptor of every metric exported by this collector
// to out.
func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
	descs := []*prometheus.Desc{
		// Alertmanager API.
		m.alertsReceived,
		m.alertsInvalid,
		// Notification pipeline.
		m.numNotifications,
		m.numFailedNotifications,
		m.notificationLatencySeconds,
		// Notification log.
		m.nflogGCDuration,
		m.nflogSnapshotDuration,
		m.nflogSnapshotSize,
		m.nflogQueriesTotal,
		m.nflogQueryErrorsTotal,
		m.nflogQueryDuration,
		m.nflogPropagatedMessagesTotal,
		// Marker.
		m.markerAlerts,
		// Silences.
		m.silencesGCDuration,
		m.silencesSnapshotDuration,
		m.silencesSnapshotSize,
		m.silencesQueriesTotal,
		m.silencesQueryErrorsTotal,
		m.silencesQueryDuration,
		m.silences,
		m.silencesPropagatedMessagesTotal,
	}

	for _, d := range descs {
		out <- d
	}
}

// Collect builds the metric families from a snapshot of the user registries
// and sends the aggregated values to out under the cortex_alertmanager_*
// descriptors. Each call below names the source Alertmanager metric that is
// aggregated into the given descriptor.
func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
	userRegs := m.registries()
	fams := util.BuildMetricFamiliesPerUserFromUserRegistries(userRegs)

	// Alertmanager API.
	fams.SendSumOfCountersPerUser(out, m.alertsReceived, "alertmanager_alerts_received_total")
	fams.SendSumOfCountersPerUser(out, m.alertsInvalid, "alertmanager_alerts_invalid_total")

	// Notification pipeline and marker.
	fams.SendSumOfCountersPerUser(out, m.numNotifications, "alertmanager_notifications_total")
	fams.SendSumOfCountersPerUser(out, m.numFailedNotifications, "alertmanager_notifications_failed_total")
	fams.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
	fams.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state")

	// Notification log.
	fams.SendSumOfSummaries(out, m.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds")
	fams.SendSumOfSummaries(out, m.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds")
	fams.SendSumOfGauges(out, m.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes")
	fams.SendSumOfCounters(out, m.nflogQueriesTotal, "alertmanager_nflog_queries_total")
	fams.SendSumOfCounters(out, m.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total")
	fams.SendSumOfHistograms(out, m.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
	fams.SendSumOfCounters(out, m.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")

	// Silences.
	fams.SendSumOfSummaries(out, m.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
	fams.SendSumOfSummaries(out, m.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
	fams.SendSumOfGauges(out, m.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
	fams.SendSumOfCounters(out, m.silencesQueriesTotal, "alertmanager_silences_queries_total")
	fams.SendSumOfCounters(out, m.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total")
	fams.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
	fams.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
	fams.SendSumOfGaugesPerUserWithLabels(out, m.silences, "alertmanager_silences", "state")
}
Loading